📄 page.cpp

📁 小型搜索引擎,用C/C++编写,属于全文搜索引擎
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
 /*Page handling */#include <iostream>#include <string>#include <map>#include <vector>#include <iterator>#include "Url.h"#include "Page.h"#include "StrFun.h"CPage::CPage(){	this->m_sLocation = "";	this->m_sCharset = "";	this->m_sContentEncoding = "";	this->m_sContentType = "";	this->m_sContentLinkInfo = "";        this->m_sLinkInfo4SE = "";        this->m_sLinkInfo4History = "";        this->m_sContentNoTags = "";	this->m_nRefLink4SENum = 0;	this->m_nRefLink4HistoryNum = 0;        this->m_eType = PLAIN_TEXT;	for(int i=0; i< MAX_URL_REFERENCES; i++ ){		this->m_RefLink4SE[i].link = NULL;		this->m_RefLink4SE[i].anchor_text = NULL;		this->m_RefLink4SE[i].strCharset = "";		if(i < MAX_URL_REFERENCES/2){			this->m_RefLink4History[i].link = NULL;		}	}}CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody){	assert( header != NULL );	assert( body != NULL );	assert( nLenBody > 0 );	//CPage();	this->m_sLocation = "";	this->m_sCharset = "";	this->m_sContentEncoding = "";	this->m_sContentType = "";	this->m_sContentLinkInfo = "";        this->m_sLinkInfo4SE = "";        this->m_sLinkInfo4History = "";        this->m_sContentNoTags = "";	this->m_nRefLink4SENum = 0;	this->m_nRefLink4HistoryNum = 0;        this->m_eType = PLAIN_TEXT;	for(int i=0; i< MAX_URL_REFERENCES; i++ ){		this->m_RefLink4SE[i].link = NULL;		this->m_RefLink4SE[i].anchor_text = NULL;		this->m_RefLink4SE[i].strCharset = "";		if(i < MAX_URL_REFERENCES/2){			this->m_RefLink4History[i].link = NULL;		}	}	this->m_sUrl = strUrl;	this->m_sLocation = strLocation;	//this->m_sHeader.assign(header,strlen(header));	this->m_sHeader = header;	this->m_nLenHeader = strlen(header);	this->m_sContent.assign(body, nLenBody);	this->m_nLenContent = nLenBody;}CPage::~CPage(){}int CPage::GetContentLinkInfo(){	assert( this->m_sContent.empty() == false );	this->m_sContentLinkInfo = this->m_sContent;	string& s = this->m_sContentLinkInfo;	// transform all separators into one space character	const string delims(" \t\r\n");	string::size_type idx=0,pre_idx;	while( (idx = s.find_first_of(delims, idx )) != string::npos ){		pre_idx = idx;		s.replace(idx,1,1,' ');			idx++;		while( (idx = s.find_first_of(delims, idx)) != string::npos ){			if( idx-pre_idx == 1){				s.erase(idx, 1);			}else{				break;			}		}		idx--;	}	// transform all "<br>" into one space character	//replace(s.begin(), s.end(), "<br>", " ");	while( (idx = s.find("<br>")) != string::npos ){		s.replace(idx,4,1,' ');	}	if( s.length() < 20 ) return -1; 	// Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags.	string::size_type idxHref=0,idxArea=0,idxImg=0;	string dest;	do{		if( s.empty() ) break;		//idxHref = CStrFun::FindCase(s, "<a href");		idxHref = CStrFun::FindCase(s, "href");		idxArea = CStrFun::FindCase(s, "<area");		idxImg = CStrFun::FindCase(s, "<img");		pre_idx = idxHref > idxArea? idxArea: idxHref;		pre_idx = idxImg > pre_idx? pre_idx: idxImg;		if( pre_idx == string::npos) break;		s = s.substr(pre_idx);		idx = s.find_first_of('<',1);		if( idx != string::npos ){			dest = dest + s.substr(0,idx);		}else{			break;		}		s = s.substr(idx);		idxHref=0; idxArea=0; idxImg=0;	}while(1);			s = dest;	/* erase all '\' character	 * too avoid the following situations:	 * 	document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>");	 */	while( (idx = s.find_first_of('\\', idx )) != string::npos ){		s.erase(idx, 1);	}	return( s.length() < 20 ? -1: 0 );	return 0;}int CPage::GetLinkInfo4SE(){	assert( this->m_sContentLinkInfo.empty() == false );	this->m_sLinkInfo4SE = this->m_sContentLinkInfo;	string& s = this->m_sLinkInfo4SE; 	// Keep only <area ...>,and <a href ...> tags.	string::size_type idxHref=0,idxArea=0,		idx,pre_idx;	string dest;	do{		if( s.empty() ) break;		//idxHref = CStrFun::FindCase(s, "<a href");		idxHref = CStrFun::FindCase(s, "href");		idxArea = CStrFun::FindCase(s, "<area ");		pre_idx = idxHref > idxArea? idxArea: idxHref;		//pre_idx = idxHref;		if( pre_idx == string::npos) break;		s = s.substr(pre_idx);		idx = s.find_first_of('<',1);		if( !(s.length() < 4) ){			idxHref = CStrFun::FindCaseFrom(s, "href", 4);			idx = idx > idxHref ? idxHref: idx;		}		if( idx != string::npos ){			dest = dest + s.substr(0,idx);		}else if (idx == string::npos && pre_idx != string::npos){			dest = dest + s;			break;		}else{			break;		}		s = s.substr(idx);		idxHref=0; idxArea=0;	}while(1);			s = dest;	if( s.length() < 20 ) return -1;	//cout << endl << "in it: " << s << endl;	// erase all '"' , '\'', "&nbsp;".	while( (idx = s.find('"')) != string::npos ){		s.erase(idx,1);	}	while( (idx = s.find('\'')) != string::npos ){		s.erase(idx,1);	}	while( (idx = s.find("&nbsp;")) != string::npos ){		s.erase(idx,6);	}	//cout << endl << "in it: " << s << endl; 	// Keep URLs and anchor text.	idxHref=0;	const string delims( " #>");	dest.clear();	do{		if( s.empty() ) break;		idxHref = CStrFun::FindCase(s, "href");		if( idxHref == string::npos) break;		pre_idx = idxHref;		//####		idx = s.find('=', idxHref);		if( idx == string::npos ) break;		s = s.substr(idx+1);		while( s.length() > 0 && s[0] == ' ' ){			s.erase(0,1);		}		if( s.length() == 0 ) break;		idx = s.find_first_of(delims,1);		//cout << endl << s.substr(0, idx) << endl;		if( idx == string::npos ) break;		dest += '"' + s.substr(0, idx);		//cout << endl << dest << endl;					idx = s.find('>');		if( idx == string::npos ) break;		dest += '>';		s = s.substr(idx +1);					idx = s.find('<');		if( !s.empty() ){			idxHref = CStrFun::FindCase(s, "href");			idx = idx > idxHref ? idxHref: idx;		}			if( idx == string::npos ){			dest += s;			break;		}/*		if( idx == idxHref ){			dest += '"' + s.substr(0,idx);		}else{*/			dest += s.substr(0,idx);		//}		//####		idxHref=0;	}while(1);			// look for empty filenames.	idx = 0;	while( (idx = dest.find("\"\"",idx)) != string::npos ){		dest.erase(idx, 1);	}	s = dest;	return( s.length() < 20 ? -1: 0 );}int CPage::GetLinkInfo4History(){	assert( this->m_sContentLinkInfo.empty() == false );	this->m_sLinkInfo4History = this->m_sContentLinkInfo;	string& s = this->m_sLinkInfo4History; 	// Keep only <img ...> tags.	string::size_type idxImg=0,		idx,pre_idx;	string dest;	do{		if( s.empty() ) break;		idxImg = CStrFun::FindCase(s, "<img");		if( idxImg == string::npos) break;		pre_idx = idxImg;		s = s.substr(pre_idx);		idx = s.find_first_of('<',1);		if( idx != string::npos ){			dest = dest + s.substr(0,idx);		}else if (idx == string::npos && pre_idx != string::npos){			dest = dest + s;			break;		}else{			break;		}		s = s.substr(idx);		idxImg=0;	}while(1);			s = dest;	if( s.length() < 20 ) return -1;	// erase all '"'.	while( (idx = s.find('"')) != string::npos ){		s.erase(idx,1);	} 	// Keep URLs and anchor text.	idxImg=0;	string::size_type idxSrc = 0;	const string delims( " \"#>");	dest.clear();	do{		if( s.empty() ) break;		idxImg = CStrFun::FindCase(s, "img");		if( idxImg == string::npos) break;		pre_idx = idxImg;		s = s.substr(idxImg+3);		// skip "img"		//####		idx = s.find('>', idxImg);		if( idxImg == string::npos) break;		if( s.empty() ) break;		idxSrc = CStrFun::FindCase(s, "src");		if( idxSrc > idxImg ) continue;		s = s.substr(idxSrc);		idx = s.find('=', idxImg);		if( idx == string::npos ) break;		s = s.substr(idx+1);		while( s.length() > 0 && s[0] == ' ' ){			s.erase(0,1);		}		if( s.length() == 0 ) break;		idx = s.find_first_of(delims,1);		if( idx == string::npos ) break;		if( s.at(0) == '"'){			dest += s.substr(0, idx);		}else{			dest += '"' + s.substr(0, idx);		}					idx = s.find('>');		if( idx == string::npos ) break;		dest += '>';		s = s.substr(idx +1);					idx = s.find('<');		if( idx == string::npos ){			dest += s;			break;		}		dest += s.substr(0,idx);		//####		idxImg=0;	}while(1);			// look for empty filenames.	idx = 0;	while( (idx = dest.find("\"\"",idx)) != string::npos ){		dest.erase(idx, 1);	}	s = dest;	return( s.length() < 20 ? -1: 0 );}int CPage::GetCharset(){	string	headerBuf=this->m_sHeader,		strCharset;	string::size_type pre_idx,idx;	const string delims(" \",;>");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("charset=");	if( idx != string::npos) {		strCharset = headerBuf.substr(idx + sizeof("charset=") -1);	}	headerBuf = this->m_sContent;	headerBuf = headerBuf.substr(0,2024) ;	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("charset=");
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -