📄 page.cpp

📁 小型搜索引擎,用C/C++编写,属于全文搜索引擎
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
	if( idx != string::npos) {		pre_idx = idx + sizeof("charset=") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos){			this->m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx);		}	}	return 0;}int CPage::GetContentEncoding(){	string	headerBuf=this->m_sHeader, 		strConEncoding;	string::size_type pre_idx,idx;	const string delims("\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("content-encoding:");	if( idx != string::npos) {		pre_idx = idx + sizeof("content-encoding: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos){			this->m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx);		}	}	return 0;}int CPage::GetContentType(){	string	headerBuf=this->m_sHeader, 		strContentType;	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("content-type:");	if( idx != string::npos) {		pre_idx = idx + sizeof("content-type: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos){			this->m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx);		}	}	return 0;}void CPage::FindRefLink4SE(){	assert( this->m_sLinkInfo4SE.empty() == false );	char *buffer = (char*)this->m_sLinkInfo4SE.c_str();	int urlnum=0,len;	char *ptr ;	static char buf[URL_REFERENCE_LEN];	memset(buf, 0, URL_REFERENCE_LEN);	len = strlen(buffer);	if( len < 8 ) return;	len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1;	strncpy( buf, buffer, len);/*first *------> */	ptr = buf;	while( ptr - buf < len  && *ptr ){		while( *ptr == '"' && *ptr) ptr++;		if ( !*ptr ) break;		this->m_RefLink4SE[ urlnum].link = ptr;		while( *ptr && *ptr != '>') ptr++;		if ( !*ptr ){			urlnum++;			break;		}		if ( *ptr == '>' ){			*ptr++='\0';			if( !*ptr ){				urlnum++;				break;			}			if( *ptr == '"' ){				this->m_RefLink4SE[urlnum].anchor_text = NULL;			}else{				this->m_RefLink4SE[urlnum].anchor_text = ptr;				while( *ptr && *ptr != '"') ptr++;				if (!*ptr){					urlnum++;					break;				}				if ( *ptr == '"') *ptr='\0';			}		}				//cout << endl << this->m_RefLink4SE[ urlnum].link << '\t';		//cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;		ptr++;		urlnum++;		if ( urlnum == MAX_URL_REFERENCES) break;	}	//cout << endl << this->m_RefLink4SE[ urlnum].link << endl;	//cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;	this->m_nRefLink4SENum = urlnum;/*second *------> */	typedef map<string,string,less<string> >::value_type valType;	m_mapLink4SE.clear();	string strRootUrl= this->m_sUrl;		for(int i=0; i<this->m_nRefLink4SENum; i++){		string reflink = this->m_RefLink4SE[i].link;		string::size_type idx;		const string delims(" #");		idx = reflink.find_first_of(delims, 0 );		if( idx != string::npos ){			reflink = reflink.substr(0, idx);		}		if( reflink.length() == 0 ) continue;		if( strlen(this->m_RefLink4SE[i].link) > URL_LEN - 1 			|| strlen(this->m_RefLink4SE[i].link) < 4 ) continue;		if ( memcmp("http",this->m_RefLink4SE[i].link,4) ){			if( strRootUrl[strRootUrl.length()-1] != '/' 				&& *(this->m_RefLink4SE[i].link) != '/'){				idx = strRootUrl.rfind('/');				if(idx != string::npos && idx > 6){ // > strlen("http://)					reflink = strRootUrl.substr(0, idx+1) + this->m_RefLink4SE[i].link;				} else{					reflink = strRootUrl + "/" + this->m_RefLink4SE[i].link;				}			}else if( *(this->m_RefLink4SE[i].link) == '/' ){				CUrl iUrl;				iUrl.ParseUrl(strRootUrl);				reflink = "http://" + iUrl.m_sHost + this->m_RefLink4SE[i].link;			}else{				reflink = strRootUrl + this->m_RefLink4SE[i].link;			}		}else{			reflink = this->m_RefLink4SE[i].link;		}/*		// due to bad link parser		idx = reflink.find(' ');		if(idx != string::npos){			reflink = reflink.substr(0,idx);		}		idx = reflink.find('"');		if(idx != string::npos){			reflink = reflink.substr(0,idx);		}		//#############*/		if( NormallizeUrl(reflink) == -1 ) continue;				if( IsFilterLink(reflink) ) continue;		//debug		//cout << "reflink: " << reflink << endl;		if( reflink == strRootUrl ){			continue;		}else{			if( this->m_RefLink4SE[i].anchor_text ){				if( m_mapLink4SE.count(reflink) == 0 ){					m_mapLink4SE.insert( valType( reflink, this->m_RefLink4SE[i].anchor_text));				}			}else{				if( m_mapLink4SE.count(reflink) == 0 ){					m_mapLink4SE.insert( valType( reflink, "\0"));					cout << ".";				}			}		}				}	cout << endl;}void CPage::FindRefLink4History(){	char *buffer = (char*)this->m_sLinkInfo4History.c_str();	int urlnum=0,len;	char *ptr ;	static char buf[URL_REFERENCE_LEN/2];	memset(buf, 0, URL_REFERENCE_LEN/2);	len = strlen(buffer);	if( len < 8 ) return;	len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1;	strncpy( buf, buffer, len);/*first *------> */	ptr = buf;	while( ptr - buf < len  && *ptr ){		while( *ptr == '"' && *ptr) ptr++;		if ( !*ptr ) break;		this->m_RefLink4History[ urlnum].link = ptr;		while( *ptr && *ptr != '>') ptr++;		if( !*ptr){			urlnum++;			break;		}		if( *ptr == '>' ){			*ptr++ = 0;			if( !*ptr ){				urlnum++;				break;			}			if( *ptr == '"' ){						}else{				while( *ptr && *ptr != '"') ptr++;				if( !*ptr ){					urlnum++;					break;				}				if ( *ptr == '"' ) *ptr++='\0';			}		}				ptr++;		urlnum++;		if ( urlnum == MAX_URL_REFERENCES/2) break;	}	this->m_nRefLink4HistoryNum = urlnum;/*second *------> */	string strRootUrl= this->m_sUrl;	m_vecLink4History.clear();	for(int i=0; i<this->m_nRefLink4HistoryNum; i++){		string reflink;		string::size_type idx;		if( strlen(this->m_RefLink4History[i].link) > URL_LEN - 1 			|| strlen(this->m_RefLink4History[i].link) < 4 ) continue;		if ( memcmp("http",this->m_RefLink4History[i].link,4) ){			if( strRootUrl[strRootUrl.length()-1] != '/' 				&& *(this->m_RefLink4History[i].link) != '/'){				idx = strRootUrl.rfind('/');				if(idx != string::npos && idx > 6){ // > strlen("http://)					reflink = strRootUrl.substr(0, idx+1) + this->m_RefLink4History[i].link;				} else{					reflink = strRootUrl + "/" + this->m_RefLink4History[i].link;				}			}else{				reflink = strRootUrl + this->m_RefLink4History[i].link;			}		}else if( *(this->m_RefLink4History[i].link) == '/' ){			CUrl iUrl;			iUrl.ParseUrl(strRootUrl);			reflink = "http://" + iUrl.m_sHost + this->m_RefLink4History[i].link;		}else{			reflink = this->m_RefLink4History[i].link;		}		// due to bad link parser		idx = reflink.find(' ');		if(idx != string::npos){			reflink = reflink.substr(0,idx);		}		idx = reflink.find('"');		if(idx != string::npos){			reflink = reflink.substr(0,idx);		}		//#############		if( NormallizeUrl(reflink) == -1 ) continue;		if( IsFilterLink(reflink) ) continue;		if( reflink == strRootUrl ){			continue;		}else{			vector<string>::iterator it;			it = find(m_vecLink4History.begin(), m_vecLink4History.end(),reflink);			if( it == m_vecLink4History.end() ){				m_vecLink4History.push_back( reflink);				cout << ".";			}		}				}	cout << endl;}/* * Filter spam links * If it is, return ture; otherwise false */bool CPage::IsFilterLink(string plink){	assert(plink.c_str() != NULL);	const char *filter_str[]={		"gate","search","library","data/scop","uhtbin","staff/staff",		"enter","userid","pstmail?","pst?","find?","ccc?",		"fwd?","tcon?","&amp","Counter?","forum","cgisirsi",		"+","{","}","proxy","login","mailto:",		"javascript:"		};	int filter_str_num = 25;	string link = plink;		CStrFun::Str2Lower( link, link.length() );	for(int i=0; i<filter_str_num; i++){		if( link.find(filter_str[i]) != string::npos)		return true;	}		return false;}int CPage::NormallizeUrl(string& strUrl){	string::size_type idx;/* First * ---------> find real url */	if( strUrl.find("http://") ==string::npos ) return -1;	while( (idx=strUrl.find("/./")) != string::npos ){		if( idx != string::npos ) strUrl.erase(idx,2);	}	while( (idx = strUrl.find("/../")) != string::npos ){		string strPre,strSuf;		strPre = strUrl.substr(0, idx);		if( strUrl.length() > idx+4 )			strSuf = strUrl.substr(idx+4);		idx = strPre.rfind("/");		if( idx != string::npos)			strPre = strPre.substr(0,idx+1);		if( strPre.length() < 10 ) return - 1;		strUrl = strPre + strSuf;	}	if( strUrl.find("http://") ==string::npos ) return -1;/* Second * ---------> handle error url * such as: * http://www.itpub.net/showthread.php?s=&threadid=40236/usercp.php? * 	s=e62967982835ef10708a815274e56816/*/	idx = strUrl.find('?');	if(idx != string::npos){		string::size_type idx2;		idx2 = strUrl.find('?', idx+1);		if(idx2 != string::npos){			strUrl = strUrl.substr(0, idx2);			idx = strUrl.rfind('/');			if(idx2 != string::npos){				strUrl = strUrl.substr(0, idx2);			}else{				return -1;			}		}	}	return 0;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -