📄 page.cpp
字号:
if( idx != string::npos) { pre_idx = idx + sizeof("charset=") -1; idx = headerBuf.find_first_of(delims, pre_idx ); if(idx != string::npos){ this->m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx); } } return 0;}int CPage::GetContentEncoding(){ string headerBuf=this->m_sHeader, strConEncoding; string::size_type pre_idx,idx; const string delims("\r\n"); CStrFun::Str2Lower( headerBuf, headerBuf.length() ); idx = headerBuf.find("content-encoding:"); if( idx != string::npos) { pre_idx = idx + sizeof("content-encoding: ") -1; idx = headerBuf.find_first_of(delims, pre_idx ); if(idx != string::npos){ this->m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx); } } return 0;}int CPage::GetContentType(){ string headerBuf=this->m_sHeader, strContentType; string::size_type pre_idx,idx; const string delims(";\r\n"); CStrFun::Str2Lower( headerBuf, headerBuf.length() ); idx = headerBuf.find("content-type:"); if( idx != string::npos) { pre_idx = idx + sizeof("content-type: ") -1; idx = headerBuf.find_first_of(delims, pre_idx ); if(idx != string::npos){ this->m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx); } } return 0;}void CPage::FindRefLink4SE(){ assert( this->m_sLinkInfo4SE.empty() == false ); char *buffer = (char*)this->m_sLinkInfo4SE.c_str(); int urlnum=0,len; char *ptr ; static char buf[URL_REFERENCE_LEN]; memset(buf, 0, URL_REFERENCE_LEN); len = strlen(buffer); if( len < 8 ) return; len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1; strncpy( buf, buffer, len);/*first *------> */ ptr = buf; while( ptr - buf < len && *ptr ){ while( *ptr == '"' && *ptr) ptr++; if ( !*ptr ) break; this->m_RefLink4SE[ urlnum].link = ptr; while( *ptr && *ptr != '>') ptr++; if ( !*ptr ){ urlnum++; break; } if ( *ptr == '>' ){ *ptr++='\0'; if( !*ptr ){ urlnum++; break; } if( *ptr == '"' ){ this->m_RefLink4SE[urlnum].anchor_text = NULL; }else{ this->m_RefLink4SE[urlnum].anchor_text = ptr; while( *ptr && *ptr != '"') ptr++; if (!*ptr){ urlnum++; break; } if ( *ptr == '"') *ptr='\0'; } } //cout << endl << this->m_RefLink4SE[ urlnum].link << '\t'; //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl; ptr++; urlnum++; if ( urlnum == MAX_URL_REFERENCES) break; } //cout << endl << this->m_RefLink4SE[ urlnum].link << endl; //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl; this->m_nRefLink4SENum = urlnum;/*second *------> */ typedef map<string,string,less<string> >::value_type valType; m_mapLink4SE.clear(); string strRootUrl= this->m_sUrl; for(int i=0; i<this->m_nRefLink4SENum; i++){ string reflink = this->m_RefLink4SE[i].link; string::size_type idx; const string delims(" #"); idx = reflink.find_first_of(delims, 0 ); if( idx != string::npos ){ reflink = reflink.substr(0, idx); } if( reflink.length() == 0 ) continue; if( strlen(this->m_RefLink4SE[i].link) > URL_LEN - 1 || strlen(this->m_RefLink4SE[i].link) < 4 ) continue; if ( memcmp("http",this->m_RefLink4SE[i].link,4) ){ if( strRootUrl[strRootUrl.length()-1] != '/' && *(this->m_RefLink4SE[i].link) != '/'){ idx = strRootUrl.rfind('/'); if(idx != string::npos && idx > 6){ // > strlen("http://) reflink = strRootUrl.substr(0, idx+1) + this->m_RefLink4SE[i].link; } else{ reflink = strRootUrl + "/" + this->m_RefLink4SE[i].link; } }else if( *(this->m_RefLink4SE[i].link) == '/' ){ CUrl iUrl; iUrl.ParseUrl(strRootUrl); reflink = "http://" + iUrl.m_sHost + this->m_RefLink4SE[i].link; }else{ reflink = strRootUrl + this->m_RefLink4SE[i].link; } }else{ reflink = this->m_RefLink4SE[i].link; }/* // due to bad link parser idx = reflink.find(' '); if(idx != string::npos){ reflink = reflink.substr(0,idx); } idx = reflink.find('"'); if(idx != string::npos){ reflink = reflink.substr(0,idx); } //#############*/ if( NormallizeUrl(reflink) == -1 ) continue; if( IsFilterLink(reflink) ) continue; //debug //cout << "reflink: " << reflink << endl; if( reflink == strRootUrl ){ continue; }else{ if( this->m_RefLink4SE[i].anchor_text ){ if( m_mapLink4SE.count(reflink) == 0 ){ m_mapLink4SE.insert( valType( reflink, this->m_RefLink4SE[i].anchor_text)); } }else{ if( m_mapLink4SE.count(reflink) == 0 ){ m_mapLink4SE.insert( valType( reflink, "\0")); cout << "."; } } } } cout << endl;}void CPage::FindRefLink4History(){ char *buffer = (char*)this->m_sLinkInfo4History.c_str(); int urlnum=0,len; char *ptr ; static char buf[URL_REFERENCE_LEN/2]; memset(buf, 0, URL_REFERENCE_LEN/2); len = strlen(buffer); if( len < 8 ) return; len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1; strncpy( buf, buffer, len);/*first *------> */ ptr = buf; while( ptr - buf < len && *ptr ){ while( *ptr == '"' && *ptr) ptr++; if ( !*ptr ) break; this->m_RefLink4History[ urlnum].link = ptr; while( *ptr && *ptr != '>') ptr++; if( !*ptr){ urlnum++; break; } if( *ptr == '>' ){ *ptr++ = 0; if( !*ptr ){ urlnum++; break; } if( *ptr == '"' ){ }else{ while( *ptr && *ptr != '"') ptr++; if( !*ptr ){ urlnum++; break; } if ( *ptr == '"' ) *ptr++='\0'; } } ptr++; urlnum++; if ( urlnum == MAX_URL_REFERENCES/2) break; } this->m_nRefLink4HistoryNum = urlnum;/*second *------> */ string strRootUrl= this->m_sUrl; m_vecLink4History.clear(); for(int i=0; i<this->m_nRefLink4HistoryNum; i++){ string reflink; string::size_type idx; if( strlen(this->m_RefLink4History[i].link) > URL_LEN - 1 || strlen(this->m_RefLink4History[i].link) < 4 ) continue; if ( memcmp("http",this->m_RefLink4History[i].link,4) ){ if( strRootUrl[strRootUrl.length()-1] != '/' && *(this->m_RefLink4History[i].link) != '/'){ idx = strRootUrl.rfind('/'); if(idx != string::npos && idx > 6){ // > strlen("http://) reflink = strRootUrl.substr(0, idx+1) + this->m_RefLink4History[i].link; } else{ reflink = strRootUrl + "/" + this->m_RefLink4History[i].link; } }else{ reflink = strRootUrl + this->m_RefLink4History[i].link; } }else if( *(this->m_RefLink4History[i].link) == '/' ){ CUrl iUrl; iUrl.ParseUrl(strRootUrl); reflink = "http://" + iUrl.m_sHost + this->m_RefLink4History[i].link; }else{ reflink = this->m_RefLink4History[i].link; } // due to bad link parser idx = reflink.find(' '); if(idx != string::npos){ reflink = reflink.substr(0,idx); } idx = reflink.find('"'); if(idx != string::npos){ reflink = reflink.substr(0,idx); } //############# if( NormallizeUrl(reflink) == -1 ) continue; if( IsFilterLink(reflink) ) continue; if( reflink == strRootUrl ){ continue; }else{ vector<string>::iterator it; it = find(m_vecLink4History.begin(), m_vecLink4History.end(),reflink); if( it == m_vecLink4History.end() ){ m_vecLink4History.push_back( reflink); cout << "."; } } } cout << endl;}/* * Filter spam links * If it is, return ture; otherwise false */bool CPage::IsFilterLink(string plink){ assert(plink.c_str() != NULL); const char *filter_str[]={ "gate","search","library","data/scop","uhtbin","staff/staff", "enter","userid","pstmail?","pst?","find?","ccc?", "fwd?","tcon?","&","Counter?","forum","cgisirsi", "+","{","}","proxy","login","mailto:", "javascript:" }; int filter_str_num = 25; string link = plink; CStrFun::Str2Lower( link, link.length() ); for(int i=0; i<filter_str_num; i++){ if( link.find(filter_str[i]) != string::npos) return true; } return false;}int CPage::NormallizeUrl(string& strUrl){ string::size_type idx;/* First * ---------> find real url */ if( strUrl.find("http://") ==string::npos ) return -1; while( (idx=strUrl.find("/./")) != string::npos ){ if( idx != string::npos ) strUrl.erase(idx,2); } while( (idx = strUrl.find("/../")) != string::npos ){ string strPre,strSuf; strPre = strUrl.substr(0, idx); if( strUrl.length() > idx+4 ) strSuf = strUrl.substr(idx+4); idx = strPre.rfind("/"); if( idx != string::npos) strPre = strPre.substr(0,idx+1); if( strPre.length() < 10 ) return - 1; strUrl = strPre + strSuf; } if( strUrl.find("http://") ==string::npos ) return -1;/* Second * ---------> handle error url * such as: * http://www.itpub.net/showthread.php?s=&threadid=40236/usercp.php? * s=e62967982835ef10708a815274e56816/*/ idx = strUrl.find('?'); if(idx != string::npos){ string::size_type idx2; idx2 = strUrl.find('?', idx+1); if(idx2 != string::npos){ strUrl = strUrl.substr(0, idx2); idx = strUrl.rfind('/'); if(idx2 != string::npos){ strUrl = strUrl.substr(0, idx2); }else{ return -1; } } } return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -