📄 page.cpp

📁 在linux下的crawler程序,来自北大天网tiny search engine spider
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*Page handling */#include <iostream>#include <string>#include <map>#include <vector>#include <iterator>#include "Url.h"#include "Page.h"#include "StrFun.h"CPage::CPage(){	m_nStatusCode = 0;	m_nContentLength = 0;	m_sLocation = "";	m_bConnectionState = false;	m_sContentEncoding = "";	m_sContentType = "";	m_sCharset = "";	m_sTransferEncoding = "";	m_sContentLinkInfo = "";        m_sLinkInfo4SE = "";        m_sLinkInfo4History = "";        m_sContentNoTags = "";	m_nRefLink4SENum = 0;	m_nRefLink4HistoryNum = 0;        m_eType = PLAIN_TEXT;	for(int i=0; i< MAX_URL_REFERENCES; i++ ){		m_RefLink4SE[i].link = NULL;		m_RefLink4SE[i].anchor_text = NULL;		m_RefLink4SE[i].strCharset = "";		if(i < MAX_URL_REFERENCES/2){			m_RefLink4History[i].link = NULL;		}	}}CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody){	//assert( header != NULL );	//assert( body != NULL );	//assert( nLenBody > 0 );	// CPage();	m_nStatusCode = 0;	m_nContentLength = 0;	m_sLocation = "";	m_bConnectionState = false;	m_sContentEncoding = "";	m_sContentType = "";	m_sCharset = "";	m_sTransferEncoding = "";	m_sContentLinkInfo = "";        m_sLinkInfo4SE = "";        m_sLinkInfo4History = "";        m_sContentNoTags = "";	m_nRefLink4SENum = 0;	m_nRefLink4HistoryNum = 0;        m_eType = PLAIN_TEXT;	for(int i=0; i< MAX_URL_REFERENCES; i++ ){		m_RefLink4SE[i].link = NULL;		m_RefLink4SE[i].anchor_text = NULL;		m_RefLink4SE[i].strCharset = "";		if(i < MAX_URL_REFERENCES/2){			m_RefLink4History[i].link = NULL;		}	}	m_sUrl = strUrl;	m_sLocation = strLocation;	m_sHeader = header;	m_nLenHeader = strlen(header);	m_sContent.assign(body, nLenBody);	m_nLenContent = nLenBody;}CPage::~CPage(){}void CPage::ParseHeaderInfo(string strHeader){	GetStatusCode(strHeader);	GetContentLength(strHeader);	GetLocation(strHeader);	GetConnectionState(strHeader);        GetCharset(strHeader);        GetContentEncoding(strHeader);        GetContentType(strHeader);	GetTransferEncoding(strHeader);}void CPage::GetStatusCode(string headerBuf){	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	char *charIndex = strstr(headerBuf.c_str(), "http/");	if (charIndex == NULL)	{		m_nStatusCode = -1;		return;	}	while(*charIndex != ' '){		charIndex++;	}	charIndex++;		int ret = sscanf(charIndex, "%i", &m_nStatusCode);	if (ret != 1)  m_nStatusCode = -1;}void CPage::GetContentLength(string headerBuf){	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	char *charIndex = strstr(headerBuf.c_str(), "content-length");	if (charIndex == NULL) return;	while(*charIndex != ' '){		charIndex++;	}	charIndex++;		int ret = sscanf(charIndex, "%i", &m_nContentLength);	if (ret != 1)  m_nContentLength = -1;}void CPage::GetLocation(string headerBuf){	string::size_type pre_idx,idx;	const string delims("\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("location:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("location: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			m_sLocation = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetCharset(string headerBuf){	string::size_type pre_idx,idx;	const string delims(" \",;>");	CStrFun::Str2Lower(headerBuf, headerBuf.size());	idx = headerBuf.find("charset=");	if( idx != string::npos) {		m_sCharset = headerBuf.substr(idx + sizeof("charset=") -1);	}	headerBuf = m_sContent;	headerBuf = headerBuf.substr(0,2024) ;	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("charset=");	if (idx != string::npos)	{		pre_idx = idx + sizeof("charset=") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos){			m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetContentEncoding(string headerBuf){	string::size_type pre_idx,idx;	const string delims("\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("content-encoding:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("content-encoding: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetConnectionState(string headerBuf){	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("connection:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("connection: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			string str = headerBuf.substr(pre_idx, idx - pre_idx);			//cout << "Connection state: " << str << endl;			//if (str == "close") m_bConnectionState = false;			if (str == "keep-alive") m_bConnectionState = true;		}	}}void CPage::GetContentType(string headerBuf){	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.size() );	idx = headerBuf.find("content-type:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("content-type: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetTransferEncoding(string headerBuf){	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.size() );	idx = headerBuf.find("transfer-encoding:");	if ( idx != string::npos)	{		pre_idx = idx + sizeof("transfer-encoding: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos)		{			m_sTransferEncoding = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}/* * Filter spam links * If it is, return ture; otherwise false */bool CPage::IsFilterLink(string plink){	if( plink.empty() ) return true;	if( plink.size() > URL_LEN ) return true;	string link = plink, tmp;	string::size_type idx = 0;		CStrFun::Str2Lower( link, link.length() );	// find two times following symbols, return false	tmp = link;	idx = tmp.find("?");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("?");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("-");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("+");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("&");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("&");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("//");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("//");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("http");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("http");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("misc");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("misc");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("ipb");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("ipb");		if( idx != string::npos ) return true;	}	const char *filter_str[]={	"cgi-bin",	"htbin",	"linder",	"srs5",		"uin-cgi",  // robots.txt of http://www.expasy.org/	"uhtbin",	"snapshot",	"=+",		"=-",		"script",	"gate",		"search",	"clickfile",	"data/scop",	"names",	"staff/",	"enter",	"user",		"mail",	"pst?",	"find?",	"ccc?",		"fwd?",		"tcon?",	"&amp",	"counter?",	"forum",	"cgisirsi",	"{",		"}",	"proxy",	"login",	"00.pl?",	"sciserv.pl",	"sign.asp",	"<",		">",		"review.asp?",	"result.asp?",	"keyword",	"\"",		"'",		"php?s=",	"error",	"showdate",	"niceprot.pl?",	"volue.asp?id",	".css",		".asp?month",	"prot.pl?",	"msg.asp",	"register.asp", "database",	"reg.asp",	"qry?u",	"p?msg",	"tj_all.asp?page", ".plot.",	"comment.php",	"nicezyme.pl?",	"entr",		"compute-map?", "view-pdb?",	"list.cgi?",	"lists.cgi?",	"details.pl?",	"aligner?",	"raw.pl?",	"interface.pl?","memcp.php?",	"member.php?",	"post.php?",	"thread.php",	"bbs/",		"/bbs"	};	int filter_str_num = 75;		for(int i=0; i<filter_str_num; i++){		if( link.find(filter_str[i]) != string::npos)		return true;	}		return false;}/////////////////////////////// just for ImgSE// e.g: http://www.people.com.cn/GB/tupian/index.html// 	http://news.xinhuanet.com/photo/// 	http://photo.tom.com//////////////////////////////// comment previous one and open this one/*bool CPage::IsFilterLink(string plink){	if( plink.empty() ) return true;	if( plink.size() > URL_LEN ) return true;	return false;	string link = plink, tmp;	string::size_type idx = 0;		CStrFun::Str2Lower( link, link.length() );	const char *filter_str[]={		"tupian", "photo", "ttjstk"		};	int filter_str_num = 3;	CStrFun::Str2Lower( link, link.length() );	for(int i=0; i<filter_str_num; i++){		if( link.find(filter_str[i]) != string::npos)		return false;	}		return true;}*//******************************************************************* Function name: ParseHyperLinks** Input argv:**      --** Output argv:**      --** Return:        true: success        false: fail** Function Description:  Parse hyperlinks from the web page** Version: 1.0** Be careful:*****************************************************************/bool CPage::ParseHyperLinks(){	if( GetContentLinkInfo() == false ) return false;	if( m_sContentLinkInfo.empty() ) return false;	bool bFind4SE = false;	bool bFind4History = false;	if( GetLinkInfo4SE() ){		if( FindRefLink4SE() ) bFind4SE = true;	} 	if( GetLinkInfo4History() ){		if( FindRefLink4History() ) bFind4History = true;	}	if( !bFind4SE && !bFind4History ){		 return false;	}	//return   GetHref(m_sContentLinkInfo.c_str(), "href", m_listLink4SE);	return true;}/******************************************************************* Function name: GetContentLinkInfo** Input argv:**      --** Output argv:**      --** Return:        true: success        false: fail** Function Description:  Parse hyperlinks from the web page** Version: 1.0** Be careful:*****************************************************************/bool CPage::GetContentLinkInfo(){	if( m_sContent.empty() ) return false;		m_sContentLinkInfo = m_sContent;	string& s = m_sContentLinkInfo;	// transform all separation into one space character	//CStrFun::ReplaceStr(s, "\t", " ");	//CStrFun::ReplaceStr(s, "\r", " ");	//CStrFun::ReplaceStr(s, "\n", " ");	const string delims(" \t\r\n");	string::size_type idx=0, pre_idx;	while( (idx = s.find_first_of(delims, idx)) != string::npos ){		pre_idx = idx;		s.replace(idx,1,1,' ');		idx++;		while( (idx = s.find_first_of(delims, idx)) != string::npos ){			if( idx-pre_idx == 1 ){				s.erase(idx, 1);			} else {				break;			}		}		idx--;	}	// transform all "<br>" into one space character	CStrFun::ReplaceStr(s, "<br>", " ");	if( s.size() < 20 ) return false;	// Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags.	string::size_type idxHref=0,idxArea=0,idxImg=0;	string dest;	do{		if( s.empty() ) break;		idxHref = CStrFun::FindCase(s, "href");		idxArea = CStrFun::FindCase(s, "<area");		idxImg = CStrFun::FindCase(s, "<img");		pre_idx = idxHref > idxArea? idxArea: idxHref;		pre_idx = idxImg > pre_idx? pre_idx: idxImg;		if( pre_idx == string::npos) break;		s = s.substr(pre_idx);		idx = s.find_first_of('<',1);		if( idx != string::npos ){			dest = dest + s.substr(0,idx);		}else{			break;		}		s = s.substr(idx);		idxHref=0; idxArea=0; idxImg=0;	}while(1);	s = dest;		/* erase all '\' character	 * too avoid the following situations:	 *      document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>");	*/	CStrFun::EraseStr(s, "\\");	if( s.size() < 20 ) return false;	return true;}/******************************************************************* Function name: GetLinkInfo4SE()** Input argv:**      --  ** Output argv:**      --** Return:       true: success       false: fail** Function Description:  Get links for SE** Version: 1.0** Be careful:*****************************************************************/bool CPage::GetLinkInfo4SE(){	if( m_sContentLinkInfo.empty() ) return false;	m_sLinkInfo4SE = m_sContentLinkInfo;	string& s = m_sLinkInfo4SE; 	// Keep only <area ...>,and <a href ...> tags.	string::size_type idxHref=0,idxArea=0,		idx,pre_idx;	string dest;	do{		if( s.empty() ) break;		//idxHref = CStrFun::FindCase(s, "<a href");		idxHref = CStrFun::FindCase(s, "href");		idxArea = CStrFun::FindCase(s, "<area ");		pre_idx = idxHref > idxArea? idxArea: idxHref;		//pre_idx = idxHref;		if( pre_idx == string::npos) break;		s = s.substr(pre_idx);		idx = s.find_first_of('<',1);		if( !(s.length() < 4) ){			idxHref = CStrFun::FindCaseFrom(s, "href", 4);			idx = idx > idxHref ? idxHref: idx;		}		if( idx != string::npos ){			dest = dest + s.substr(0,idx);		}else if (idx == string::npos && pre_idx != string::npos){			dest = dest + s;			break;		}else{			break;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -