📄 page.cpp
字号:
} s = s.substr(idx); idxHref=0; idxArea=0; }while(1); s = dest; if( s.length() < 20 ) return false; // erase all '"' , '\'', " ". CStrFun::EraseStr(s, "\""); CStrFun::EraseStr(s, "'"); CStrFun::EraseStr(s, " "); // Keep URLs and anchor text. idxHref=0; const string delims( " #>"); dest.clear(); do{ if( s.empty() ) break; idxHref = CStrFun::FindCase(s, "href"); if( idxHref == string::npos) break; pre_idx = idxHref; //#### idx = s.find('=', idxHref); if( idx == string::npos ) break; s = s.substr(idx+1); while( s.length() > 0 && s[0] == ' ' ){ s.erase(0,1); } if( s.length() == 0 ) break; idx = s.find_first_of(delims,1); //cout << endl << s.substr(0, idx) << endl; if( idx == string::npos ) break; dest += '"' + s.substr(0, idx); //cout << endl << dest << endl; idx = s.find('>'); if( idx == string::npos ) break; dest += '>'; s = s.substr(idx +1); idx = s.find('<'); if( !s.empty() ){ idxHref = CStrFun::FindCase(s, "href"); idx = idx > idxHref ? idxHref: idx; } if( idx == string::npos ){ dest += s; break; }/* if( idx == idxHref ){ dest += '"' + s.substr(0,idx); }else{*/ dest += s.substr(0,idx); //} //#### idxHref=0; }while(1); // look for empty filenames. idx = 0; while( (idx = dest.find("\"\"",idx)) != string::npos ){ dest.erase(idx, 1); } s = dest; return( s.length() < 20 ? false: true );} /******************************************************************* Function name: GetLinkInfo4History()** Input argv:** -- ** Output argv:** --** Return: true: success false: fail** Function Description: Get links for history archiving** Version: 1.0** Be careful:*****************************************************************/bool CPage::GetLinkInfo4History(){ if( m_sContentLinkInfo.empty() ) return false; m_sLinkInfo4History = m_sContentLinkInfo; string& s = this->m_sLinkInfo4History; // Keep only <img ...> tags. string::size_type idxImg=0, idx,pre_idx; string dest; do{ if( s.empty() ) break; idxImg = CStrFun::FindCase(s, "<img"); pre_idx = idxImg; if( pre_idx == string::npos) break; s = s.substr(pre_idx); idx = s.find_first_of('<',1); if( idx != string::npos ){ dest = dest + s.substr(0,idx); }else if (idx == string::npos && pre_idx != string::npos){ dest = dest + s; break; }else{ break; } s = s.substr(idx); idxImg=0; }while(1); s = dest; if( s.length() < 20 ) return false; // erase all '"'. '\''," ". CStrFun::EraseStr(s , "\""); CStrFun::EraseStr(s , "'"); CStrFun::EraseStr(s , " "); // Keep URLs and anchor text. idxImg=0; string::size_type idxSrc = 0; const string delims( " #>"); dest.clear(); do{ if( s.empty() ) break; idxImg = CStrFun::FindCase(s, "img"); if( idxImg == string::npos) break; pre_idx = idxImg; s = s.substr(idxImg+3); // skip "img" //#### idx = s.find('>', idxImg); if( idxImg == string::npos) break; if( s.empty() ) break; idxSrc = CStrFun::FindCase(s, "src"); if( idxSrc > idxImg ) continue; s = s.substr(idxSrc); idx = s.find('=', idxImg); if( idx == string::npos ) break; s = s.substr(idx+1); while( s.length() > 0 && s[0] == ' ' ){ s.erase(0,1); } if( s.length() == 0 ) break; idx = s.find_first_of(delims,1); if( idx == string::npos ) break; if( s.at(0) == '"'){ dest += s.substr(0, idx); }else{ dest += '"' + s.substr(0, idx); } idx = s.find('>'); if( idx == string::npos ) break; dest += '>'; s = s.substr(idx +1); idx = s.find('<'); if( idx == string::npos ){ dest += s; break; } dest += s.substr(0,idx); //#### idxImg=0; }while(1); // look for empty filenames. idx = 0; while( (idx = dest.find("\"\"",idx)) != string::npos ){ dest.erase(idx, 1); } s = dest; return( s.length() < 20 ? false: true );}bool CPage::NormalizeUrl(string& strUrl){ string::size_type idx; if( CStrFun::FindCase(strUrl, "http://") == string::npos ) return false; // convert "http://e.pku.cn" to "http://e.pku.cn/" idx = strUrl.rfind('/'); if( idx < 8 ) { strUrl = strUrl + "/"; return true; } while( (idx=strUrl.find("/./")) != string::npos ){ if( idx != string::npos ) strUrl.erase(idx,2); } while( (idx = strUrl.find("/../")) != string::npos ){ string strPre,strSuf; strPre = strUrl.substr(0, idx); if( strUrl.length() > idx+4 ) strSuf = strUrl.substr(idx+4); idx = strPre.rfind("/"); if( idx != string::npos) strPre = strPre.substr(0,idx+1); if( strPre.length() < 10 ) return false; strUrl = strPre + strSuf; } if( CStrFun::FindCase(strUrl, "http://") != 0 ) return false; return true;}bool CPage::FindRefLink4SE(){ if( m_sLinkInfo4SE.empty() ) return false; char *buffer = (char*)m_sLinkInfo4SE.c_str(); int urlnum=0,len; char *ptr ; static char buf[URL_REFERENCE_LEN]; memset(buf, 0, URL_REFERENCE_LEN); len = strlen(buffer); if( len < 8 ) return false; len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1; strncpy( buf, buffer, len);/*first *------> */ ptr = buf; while( ptr - buf < len && *ptr ){ while( *ptr == '"' && *ptr) ptr++; if ( !*ptr ) break; this->m_RefLink4SE[ urlnum].link = ptr; while( *ptr && *ptr != '>'){ if(*ptr == ' ') *ptr = '\0'; ptr++; } if ( !*ptr ){ urlnum++; break; } if ( *ptr == '>' ){ *ptr++='\0'; if( !*ptr ){ urlnum++; break; } if( *ptr == '"' ){ this->m_RefLink4SE[urlnum].anchor_text = NULL; }else{ this->m_RefLink4SE[urlnum].anchor_text = ptr; while( *ptr && *ptr != '"') ptr++; if (!*ptr){ urlnum++; break; } if ( *ptr == '"') *ptr='\0'; } } //cout << endl << this->m_RefLink4SE[ urlnum].link << '\t'; //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl; ptr++; urlnum++; if ( urlnum == MAX_URL_REFERENCES) break; } //cout << endl << this->m_RefLink4SE[ urlnum].link << endl; //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl; this->m_nRefLink4SENum = urlnum;/*second *------> */ //typedef map<string,string,less<string> >::value_type valType; typedef map<string,string>::value_type valType; m_mapLink4SE.clear(); //string strRootUrl= m_sUrl; CUrl iUrl; if( iUrl.ParseUrlEx(m_sUrl) == false ){ cout << "ParseUrlEx error in FindRefLink4SE(): " << m_sUrl << endl; return false; } for(int i=0; i<m_nRefLink4SENum; i++){ string str; string::size_type idx; const string delims(" #"); str = m_RefLink4SE[i].link; idx = str.find_first_of(delims, 0 ); if( idx != string::npos ){ str = str.substr(0, idx); } if( str.size() == 0 || str.size() > URL_LEN - 1 || str.size() < 4 ) continue; string::size_type idx1; idx1 = CStrFun::FindCase(str, "http"); if( idx1 != 0 ){ char c1 = m_sUrl.at(m_sUrl.length()-1); char c2 = str.at(0); if( c2=='/' ){ if( iUrl.m_nPort != 80 ){ cout << iUrl.m_sHost << endl; cout << str << endl; //str = "http://" + iUrl.m_sHost + ":" + (const char*)(iUrl.m_nPort) + str; str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str; } else { str = "http://" + iUrl.m_sHost + str; } } else if( c1!='/' && c2!='/'){ string::size_type idx; idx = m_sUrl.rfind('/'); if( idx != string::npos ){ if( idx > 6 ){ // > strlen("http://..") str = m_sUrl.substr(0, idx+1) + str; } else { str = m_sUrl + "/" + str; } } else { continue; } } else { if( c1=='/' ){ str = m_sUrl + str; } else { str = m_sUrl + "/" + str; } } } if( NormalizeUrl(str) == false ) continue; if( IsFilterLink(str) ) continue; //debug //cout << "reflink: " << reflink << endl; if( str == m_sUrl ){ continue; }else{ if( m_RefLink4SE[i].anchor_text ){ if( m_mapLink4SE.find(str) == m_mapLink4SE.end() ){ m_mapLink4SE.insert( valType( str, m_RefLink4SE[i].anchor_text)); } }else{ if( m_mapLink4SE.find(str) == m_mapLink4SE.end() ){ m_mapLink4SE.insert( valType( str, "\0") ); cout << "."; } } } } m_nRefLink4SENum = m_mapLink4SE.size(); //cout << endl; return true;}bool CPage::FindRefLink4History(){ if( m_sLinkInfo4History.empty() ) return false; char *buffer = (char*)m_sLinkInfo4History.c_str(); int urlnum=0,len; char *ptr ; static char buf[URL_REFERENCE_LEN/2]; memset(buf, 0, URL_REFERENCE_LEN/2); len = strlen(buffer); if( len < 8 ) return false; len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1; strncpy( buf, buffer, len);/*first *------> */ ptr = buf; while( ptr - buf < len && *ptr ){ while( *ptr == '"' && *ptr) ptr++; if ( !*ptr ) break; this->m_RefLink4History[ urlnum].link = ptr; while( *ptr && *ptr != '>'){ if( *ptr == ' ') *ptr='\0'; ptr++; } if( !*ptr){ urlnum++; break; } if( *ptr == '>' ){ *ptr++ = 0; if( !*ptr ){ urlnum++; break; } if( *ptr == '"' ){ }else{ while( *ptr && *ptr != '"') ptr++; if( !*ptr ){ urlnum++; break; } if ( *ptr == '"' ) *ptr++='\0'; } } ptr++; urlnum++; if ( urlnum == MAX_URL_REFERENCES/2) break; } this->m_nRefLink4HistoryNum = urlnum;/*second *------> */ m_vecLink4History.clear(); //string strRootUrl= m_sUrl; CUrl iUrl; if( iUrl.ParseUrlEx(m_sUrl) == false ){ cout << "ParseUrlEx error in FindRefLink4History(): " << m_sUrl << endl; return false; } for(int i=0; i<m_nRefLink4HistoryNum; i++){ string str; //string::size_type idx; str = m_RefLink4History[i].link; if( str.size()==0 || str.size() > URL_LEN - 1 || str.size() < 4 ) continue;/* char *pdest1, *pdest2; pdest1 = strstr( str.c_str(), "http" ); pdest2 = strstr( str.c_str(), "HTTP" ); if( pdest1==NULL && pdest2==NULL ){*/ string::size_type idx1; idx1 = CStrFun::FindCase(str, "http"); if( idx1 != 0 ){ char c1 = m_sUrl.at(m_sUrl.length()-1); char c2 = str.at(0); if( c2=='/' ){ if( iUrl.m_nPort != 80 ){ str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str; } else { str = "http://" + iUrl.m_sHost + str; } } else if( c1!='/' && c2!='/'){ string::size_type idx; idx = m_sUrl.rfind('/'); if( idx != string::npos ){ if( idx > 6 ){ // > strlen("http://..") str = m_sUrl.substr(0, idx+1) + str; } else { str = m_sUrl + "/" + str; } } else { continue; } } else { if( c1=='/' ){ str = m_sUrl + str; } else { str = m_sUrl + "/" + str; } } } // due to bad link parser/* idx = reflink.find(' '); if(idx != string::npos){ reflink = reflink.substr(0,idx); } idx = reflink.find('"'); if(idx != string::npos){ reflink = reflink.substr(0,idx); }*/ //############# if( NormalizeUrl(str) == false ) continue; if( IsFilterLink(str) ) continue; if( str == m_sUrl ){ continue; }else{ vector<string>::iterator it; it = find(m_vecLink4History.begin(), m_vecLink4History.end(),str); if( it == m_vecLink4History.end() ){ m_vecLink4History.push_back( str); cout << "."; } } } m_nRefLink4HistoryNum = m_vecLink4History.size(); //cout << endl; return true;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -