📄 page.cpp

📁 此源码功能是捕获网页中的链接并进行分析
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*Page handling */#include <iostream>#include <string>#include <map>#include <vector>#include <iterator>#include "Url.h"#include "Page.h"#include "StrFun.h"#include "h/code.h"CPage::CPage(){	m_nStatusCode = 0;	m_nContentLength = 0;	m_sLocation = "";	m_bConnectionState = false;	m_sContentEncoding = "";	m_sContentType = "";	m_sCharset = "";	m_sTransferEncoding = "";	m_sContentLinkInfo = "";        m_sLinkInfo4SE = "";        m_sLinkInfo4History = "";        m_sContentNoTags = "";	m_nRefLink4SENum = 0;	m_nRefLink4HistoryNum = 0;        m_eType = PLAIN_TEXT;	for(int i=0; i< MAX_URL_REFERENCES; i++ ){		m_RefLink4SE[i].link = NULL;		m_RefLink4SE[i].anchor_text = NULL;		m_RefLink4SE[i].strCharset = "";		if(i < MAX_URL_REFERENCES/2){			m_RefLink4History[i].link = NULL;		}	}}CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody){	//assert( header != NULL );	//assert( body != NULL );	//assert( nLenBody > 0 );	// CPage();	m_nStatusCode = 0;	m_nContentLength = 0;	m_sLocation = "";	m_bConnectionState = false;	m_sContentEncoding = "";	m_sContentType = "";	m_sCharset = "";	m_sTransferEncoding = "";	m_sContentLinkInfo = "";        m_sLinkInfo4SE = "";        m_sLinkInfo4History = "";        m_sContentNoTags = "";	m_nRefLink4SENum = 0;	m_nRefLink4HistoryNum = 0;        m_eType = PLAIN_TEXT;	for(int i=0; i< MAX_URL_REFERENCES; i++ ){		m_RefLink4SE[i].link = NULL;		m_RefLink4SE[i].anchor_text = NULL;		m_RefLink4SE[i].strCharset = "";		if(i < MAX_URL_REFERENCES/2){			m_RefLink4History[i].link = NULL;		}	}	m_sUrl = strUrl;	m_sLocation = strLocation;	m_sHeader = header;	m_nLenHeader = strlen(header);	m_sContent.assign(body, nLenBody);	m_nLenContent = nLenBody;}CPage::~CPage(){}void CPage::ParseHeaderInfo(string strHeader){	GetStatusCode(strHeader);	GetContentLength(strHeader);	GetLocation(strHeader);	GetConnectionState(strHeader);        GetCharset(strHeader);        GetContentEncoding(strHeader);        GetContentType(strHeader);	GetTransferEncoding(strHeader);}void CPage::GetStatusCode(string headerBuf){	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	char *charIndex = strstr(headerBuf.c_str(), "http/");	if (charIndex == NULL)	{		m_nStatusCode = -1;		return;	}	while(*charIndex != ' '){		charIndex++;	}	charIndex++;		int ret = sscanf(charIndex, "%i", &m_nStatusCode);	if (ret != 1)  m_nStatusCode = -1;}void CPage::GetContentLength(string headerBuf){	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	char *charIndex = strstr(headerBuf.c_str(), "content-length");	if (charIndex == NULL) return;	while(*charIndex != ' '){		charIndex++;	}	charIndex++;		int ret = sscanf(charIndex, "%i", &m_nContentLength);	if (ret != 1)  m_nContentLength = -1;}void CPage::GetLocation(string headerBuf){	string::size_type pre_idx,idx;	const string delims("\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("location:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("location: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			m_sLocation = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetCharset(string headerBuf){	string::size_type pre_idx,idx;	const string delims(" \",;>");	CStrFun::Str2Lower(headerBuf, headerBuf.size());	idx = headerBuf.find("charset=");	if( idx != string::npos) {		m_sCharset = headerBuf.substr(idx + sizeof("charset=") -1);		idx = m_sCharset.find("\r\n");		if (idx!=string::npos) m_sCharset = m_sCharset.substr(0,idx);		idx = m_sCharset.find("\n");		if (idx!=string::npos) m_sCharset = m_sCharset.substr(0,idx);		return;	}	headerBuf = m_sContent;	headerBuf = headerBuf.substr(0,2024) ;	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("charset=");	if (idx != string::npos)	{		pre_idx = idx + sizeof("charset=") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos){			m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx);		}		return;	}	headerBuf = m_sContent;	char *s = (char*)headerBuf.c_str();	RemoveTags(s);	headerBuf = s;	long iCntGBK= 0, iCntBig5 = 0, iCntOther =0 , i=0;	//cout << "Content: " << headerBuf << endl;	//unsigned char *ch = &(unsigned char)headerBuf[i];	unsigned char *ch = (unsigned char*) &headerBuf[i];    while (i< headerBuf.size()) {		if(is_InASCII(ch)){			i++;			iCntGBK++;		}else if(IS_GBK(ch)){			i=i+2;			iCntGBK++;		}else if (IS_BIG5(ch)){			i = i+2;			iCntBig5++;		}else{			i ++;			iCntOther ++;		}			//ch= &(unsigned char)headerBuf[i];		ch= (unsigned char*) &headerBuf[i];	}		//if (Max(iCntGBK,iCntBig5,iCntOther) == iCntGBK)  m_sCharset = "GBK";	if (iCntGBK >= iCntBig5 && iCntGBK >= iCntOther )		m_sCharset = "gbk";	if (iCntBig5 >= iCntGBK && iCntBig5 >= iCntOther )		m_sCharset = "big5";	}//remove tag and content of scripts, css, java, embeddedobjects, comments, etcvoid CPage::RemoveTags(char *s){	int intag;	char *p, *q;	if (!s || !*s) return;	for (p=q=s, intag=0; *q; q++) {		switch (*q){		case '<':			intag = 1;			*p++ = ' ';			break;		case '>':			intag = 0;			break;		default:			if (!intag) {				*p++ = *q;			}			break;		}	}	*p = '\0';}void CPage::GetContentEncoding(string headerBuf){	string::size_type pre_idx,idx;	const string delims("\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("content-encoding:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("content-encoding: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetConnectionState(string headerBuf){	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.length() );	idx = headerBuf.find("connection:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("connection: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			string str = headerBuf.substr(pre_idx, idx - pre_idx);			//cout << "Connection state: " << str << endl;			//if (str == "close") m_bConnectionState = false;			if (str == "keep-alive") m_bConnectionState = true;		}	}}void CPage::GetContentType(string headerBuf){	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.size() );	idx = headerBuf.find("content-type:");	if (idx != string::npos)	{		pre_idx = idx + sizeof("content-type: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if (idx != string::npos)		{			m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}void CPage::GetTransferEncoding(string headerBuf){	string::size_type pre_idx,idx;	const string delims(";\r\n");	CStrFun::Str2Lower( headerBuf, headerBuf.size() );	idx = headerBuf.find("transfer-encoding:");	if ( idx != string::npos)	{		pre_idx = idx + sizeof("transfer-encoding: ") -1;		idx = headerBuf.find_first_of(delims, pre_idx );		if(idx != string::npos)		{			m_sTransferEncoding = headerBuf.substr(pre_idx, idx - pre_idx);		}	}}/* * Filter spam links * If it is, return ture; otherwise false */bool CPage::IsFilterLink(string plink){	if( plink.empty() ) return true;	if( plink.size() > URL_LEN ) return true;	string link = plink, tmp;	string::size_type idx = 0;		CStrFun::Str2Lower( link, link.length() );	// find two times following symbols, return false	tmp = link;	idx = tmp.find("?");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("?");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("-");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("+");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("&");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("&");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("//");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("//");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("http");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("http");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("misc");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("misc");		if( idx != string::npos ) return true;	}	tmp = link;	idx = tmp.find("ipb");	if( idx != string::npos ){		tmp = tmp.substr(idx+1);		idx = tmp.find("ipb");		if( idx != string::npos ) return true;	}	const char *filter_str[]={	"cgi-bin",	"htbin",	"linder",	"srs5",		"uin-cgi",  // robots.txt of http://www.expasy.org/	"uhtbin",	"snapshot",	"=+",		"=-",		"script",	"gate",		"search",	"clickfile",	"data/scop",	"names",	"staff/",	"enter",	"user",		"mail",	"pst?",	"find?",	"ccc?",		"fwd?",		"tcon?",	"&amp",	"counter?",	"forum",	"cgisirsi",	"{",		"}",	"proxy",	"login",	"00.pl?",	"sciserv.pl",	"sign.asp",	"<",		">",		"review.asp?",	"result.asp?",	"keyword",	"\"",		"'",		"php?s=",	"error",	"showdate",	"niceprot.pl?",	"volue.asp?id",	".css",		".asp?month",	"prot.pl?",	"msg.asp",	"register.asp", "database",	"reg.asp",	"qry?u",	"p?msg",	"tj_all.asp?page", ".plot.",	"comment.php",	"nicezyme.pl?",	"entr",		"compute-map?", "view-pdb?",	"list.cgi?",	"lists.cgi?",	"details.pl?",	"aligner?",	"raw.pl?",	"interface.pl?","memcp.php?",	"member.php?",	"post.php?",	"thread.php",	"bbs/",		"/bbs"	};	int filter_str_num = 75;		for(int i=0; i<filter_str_num; i++){		if( link.find(filter_str[i]) != string::npos)		return true;	}		return false;}/////////////////////////////// just for ImgSE// e.g: http://www.people.com.cn/GB/tupian/index.html// 	http://news.xinhuanet.com/photo/// 	http://photo.tom.com//////////////////////////////// comment previous one and open this one/*bool CPage::IsFilterLink(string plink){	if( plink.empty() ) return true;	if( plink.size() > URL_LEN ) return true;	return false;	string link = plink, tmp;	string::size_type idx = 0;		CStrFun::Str2Lower( link, link.length() );	const char *filter_str[]={		"tupian", "photo", "ttjstk"		};	int filter_str_num = 3;	CStrFun::Str2Lower( link, link.length() );	for(int i=0; i<filter_str_num; i++){		if( link.find(filter_str[i]) != string::npos)		return false;	}		return true;}*//******************************************************************* Function name: ParseHyperLinks** Input argv:**      --** Output argv:**      --** Return:        true: success        false: fail** Function Description:  Parse hyperlinks from the web page** Version: 1.0** Be careful:*****************************************************************/bool CPage::ParseHyperLinks(){	if( GetContentLinkInfo() == false ) return false;	if( m_sContentLinkInfo.empty() ) return false;	bool bFind4SE = false;	//bool bFind4History = false;	if( GetLinkInfo4SE() ){		if( FindRefLink4SE() ) bFind4SE = true;	} 	//if( GetLinkInfo4History() ){		//if( FindRefLink4History() ) bFind4History = true;	//}	//if( !bFind4SE && !bFind4History ){	if( !bFind4SE ){		 return false;	}	//return   GetHref(m_sContentLinkInfo.c_str(), "href", m_listLink4SE);	return true;}/******************************************************************* Function name: GetContentLinkInfo** Input argv:**      --** Output argv:**      --** Return:        true: success        false: fail** Function Description:  Parse hyperlinks from the web page** Version: 1.0** Be careful:*****************************************************************/bool CPage::GetContentLinkInfo(){	if( m_sContent.empty() ) return false;		m_sContentLinkInfo = m_sContent;	string& s = m_sContentLinkInfo;	// transform all separation into one space character	//CStrFun::ReplaceStr(s, "\t", " ");	//CStrFun::ReplaceStr(s, "\r", " ");	//CStrFun::ReplaceStr(s, "\n", " ");	const string delims(" \t\r\n");	string::size_type idx=0, pre_idx;	while( (idx = s.find_first_of(delims, idx)) != string::npos ){		pre_idx = idx;		s.replace(idx,1,1,' ');		idx++;		while( (idx = s.find_first_of(delims, idx)) != string::npos ){			if( idx-pre_idx == 1 ){				s.erase(idx, 1);			} else {				break;			}		}		idx--;	}	// transform all "<br>" into one space character	CStrFun::ReplaceStr(s, "<br>", " ");	if( s.size() < 20 ) return false;	// Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags.	string::size_type idxHref=0,idxArea=0,idxImg=0;	string dest;	do{		if( s.empty() ) break;		idxHref = CStrFun::FindCase(s, "href");		idxArea = CStrFun::FindCase(s, "<area");		idxImg = CStrFun::FindCase(s, "<img");		pre_idx = idxHref > idxArea? idxArea: idxHref;		pre_idx = idxImg > pre_idx? pre_idx: idxImg;		if( pre_idx == string::npos) break;		s = s.substr(pre_idx);		idx = s.find_first_of('<',1);		if( idx != string::npos ){			dest = dest + s.substr(0,idx);		}else{			break;		}		s = s.substr(idx);		idxHref=0; idxArea=0; idxImg=0;	}while(1);	s = dest;		/* erase all '\' character	 * too avoid the following situations:	 *      document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>");	*/	CStrFun::EraseStr(s, "\\");	if( s.size() < 20 ) return false;	return true;}/******************************************************************* Function name: GetLinkInfo4SE()** Input argv:**      --  ** Output argv:**      --** Return:       true: success       false: fail** Function Description:  Get links for SE** Version: 1.0** Be careful:*****************************************************************/bool CPage::GetLinkInfo4SE(){
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -