📄 crawl.cpp

📁 搜索引擎部分代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
		cout << "-unwant type  host: " << iUrl.m_sHost << endl;		return;	}	// paragraph end	#ifdef DEBUG    // content encoding		cout <<"######Content encoding: ######" << endl 			<< iPage.m_sContentEncoding << endl;	#endif	char sUnzipContent[1024000];	int  nUnzipLength = 0;	if( iPage.m_sContentEncoding == "gzip" 		&& iPage.m_sContentType == "text/html" ){		gzFile zip;  		string ofsGzipName;		ofsGzipName = CStrFun::itos(pthread_self()) + ".gz";		ofstream ofsDownloadFile(ofsGzipName.c_str(),ios::trunc | ios::binary);		cout << "file_length: " << file_length << endl;               	ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent);		ofsDownloadFile.close();		zip = gzopen(ofsGzipName.c_str(),"rb");  		if( zip == NULL ){			cout << "Open zip file " << ofsGzipName.c_str() << " error." << endl;			exit(-1);		}		nUnzipLength = gzread(zip, sUnzipContent, 1024000);		if( nUnzipLength == -1 ){			cout << "Read zip file " << ofsGzipName.c_str() << " error." << endl;			exit(-1);		}   		sUnzipContent[nUnzipLength]=0;		gzclose(zip); 	}	CMD5 iMD5;	string strDigest;	/////////////////////////////	// because we can make sure the url in the setVisitedUrlMd5	// is not same(we have check it before insert it to the collection),	// we intert it directly.  however...	//iMD5.GenerateMD5( (unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length() );	iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.length() );	strDigest = iMD5.ToString();	pthread_mutex_lock(&mutexVisitedUrlMD5);	if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {		cout << "!vurl: ";    //1.crawled already		pthread_mutex_unlock(&mutexVisitedUrlMD5);		return;	}			setVisitedUrlMD5.insert(strDigest);	SaveVisitedUrlMD5(strDigest);	pthread_mutex_unlock(&mutexVisitedUrlMD5);	/////////////////////////////	// whether it is a visited page	// for ImgSE, should comment this paragraph	// for SE, should uncomment this paragraph	// begin	iMD5.GenerateMD5( (unsigned char*)iPage.m_sContent.c_str(), iPage.m_sContent.length() );	strDigest = iMD5.ToString();	pthread_mutex_lock(&mutexVisitedPageMD5);	replicas.insert(pair<string, string>(strDigest, iPage.m_sUrl));	if( setVisitedPageMD5.find(strDigest) != setVisitedPageMD5.end() ) {		cout << "!vpage: ";		// crawled already		pthread_mutex_unlock(&mutexVisitedPageMD5);		return;	}	setVisitedPageMD5.insert(strDigest);	SaveVisitedPageMD5(strDigest);	pthread_mutex_unlock(&mutexVisitedPageMD5);			// end	///////////////////////	// save as ISAM file	//SaveIsamRawData(&iUrl, &iPage);		cout << "+";	////////////////////	// save as Tianwang format	SaveTianwangRawData(pTianwangFile, &iUrl, &iPage);	////////////////////	// save visited Urls	if( iPage.m_sLocation.length() < 1 ){		SaveVisitedUrl(iUrl.m_sUrl);	} else {		SaveVisitedUrl(iPage.m_sLocation);	}	//return;	// just crawl seeds	/////////////////////////////////////	// Parse hyperlinks	if (iPage.m_sContentType != "text/html") { // we can only find links in tex/html		return;	}/*	if (iPage.ParseHyperLinks() == false){		return;	}	SaveLink4SE( &iPage);	SaveLink4History( &iPage);	map<string,string>::iterator it4SE = iPage.m_mapLink4SE.begin();	string str;	for( ; it4SE!= iPage.m_mapLink4SE.end(); ++it4SE ){		str = (*it4SE).first;		AddUrl( str.c_str() );	}*/	// using XIE Han's link parser        struct uri page_uri;	//FILE *tmp;	//tmp = tmpfile();		//fwrite(iPage.m_sContent.c_str(), iPage.m_nLenContent, 1, tmp);	//fseek(tmp, 0, SEEK_SET);	//fclose(tmp);	pthread_mutex_lock(&mutexDetect);	if (iPage.m_sLocation.empty())	{		uri_parse_string(iPage.m_sUrl.c_str(), &page_uri);	}	else	{		uri_parse_string(iPage.m_sLocation.c_str(), &page_uri);	}		struct package p={this,&iPage};	//hlink_detect(tmp, &page_uri, onfind, &p);	hlink_detect_string(iPage.m_sContent.c_str(), &page_uri, onfind, &p);	struct file_arg pLinks = {&iUrl, &iPage};	SaveLink4SE031121( &pLinks );	// save as Link4SE format	//SaveLink4SERawData(pLink4SEFile, &iUrl, &iPage);	pthread_mutex_unlock(&mutexDetect);	uri_destroy(&page_uri);	cout << "Parse End......" << endl;	return;}voidSaveReplicas(const char* filename){	//ofstream ofs(filename, ios::out|ios::app);	ofstream ofs(filename, ios::out|ios::binary|ios::app);	if( !ofs ){		cout << "error open file " << endl;	}	string md5;	pthread_mutex_lock(&mutexReplicas);	multimap<string, string, less<string> >::const_iterator it;	ostringstream *oss = 0;	int i = 0;	for ( it=replicas.begin(); it != replicas.end(); it ++)	{		if (!md5.empty() && md5 != it->first)		{			if (i>=2)				ofs<<(*oss).str()<<endl;			//pthread_mutex_lock(&mutexMemory);			delete(oss);			oss = new ostringstream;			//pthread_mutex_unlock(&mutexMemory);			(*oss)<<it->first<<endl;			i = 0;			md5 = it->first;		}		else if (md5.empty())		{				md5 = it->first;			//pthread_mutex_lock(&mutexMemory);			oss = new ostringstream;			//pthread_mutex_unlock(&mutexMemory);			(*oss)<<it->first<<endl;			i = 0;		}		if (oss != 0)			(*oss)<<it->second<<endl;		i++;	}	pthread_mutex_unlock(&mutexReplicas);}////////////////////////////////////////////////////////////////////////////// Construction/Destruction////////////////////////////////////////////////////////////////////////////CCrawl::CCrawl(){}CCrawl::CCrawl(string inputFileName, string outputFileName){	m_sInputFileName = inputFileName;	m_sOutputFileName = outputFileName; // + ".txt"}CCrawl::~CCrawl(){	m_ofsVisitedUrlFile.close();	m_ofsLink4SEFile.close();	m_ofsLink4HistoryFile.close();	m_isamFile.Close();	m_ofsVisitedUrlMD5File.close();	m_ofsVisitedPageMD5File.close();}/***************************************************************** ** Function name: SigTerm ** Input argv: **      -- ** Output argv: **      -- ** Return: ** Function Description: signal function ** Version: 1.0 ** Be careful: *****************************************************************/static void SigTerm(int x){	SaveUnvisitedUrl();	SaveReplicas("repli");	cout << "Terminated!" << endl;	exit(0);}void CCrawl::GetVisitedUrlMD5(){	ifstream ifsMD5(URL_MD5_FILE.c_str(),ios::binary);	if(!ifsMD5){		//cerr << "did not find " << UrlMD5_FILE << " for iutput" << endl;		return;	}		string strMD5;	while( getline(ifsMD5,strMD5) ){		setVisitedUrlMD5.insert(strMD5);		}	ifsMD5.close();	cout << "got " << setVisitedUrlMD5.size() << " md5 values of visited urls" << endl;}void CCrawl::GetVisitedPageMD5(){	ifstream ifsMD5(PAGE_MD5_FILE.c_str(),ios::binary);	if(!ifsMD5){		//cerr << "did not find " << PageMD5_FILE << " for iutput" << endl;		return;	}		string strMD5;	while( getline(ifsMD5,strMD5) ){		setVisitedPageMD5.insert(strMD5);		}	ifsMD5.close();	cout << "got " << setVisitedPageMD5.size() << " md5 values of visited pages" << endl;}void CCrawl::GetIpBlock(){	ifstream ifsIpBlock(IP_BLOCK_FILE.c_str());	if (!ifsIpBlock){		//cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl;		return;	}	string strIpBlock;	while( getline(ifsIpBlock,strIpBlock) ){		if(strIpBlock[0]=='\0' || strIpBlock[0]=='#' 			|| strIpBlock[0]== '\n'){			continue;		}		char buf1[64], buf2[64];                buf1[0]='\0'; buf2[0]='\0';                sscanf( strIpBlock.c_str(), "%s %s", buf1, buf2 );		mapIpBlock.insert(valTypeIpBlock( inet_addr(buf1), inet_addr(buf2)) );		}	ifsIpBlock.close();}void CCrawl::GetUnreachHostMD5(){	//vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM);	ifstream ifsUnreachHost(UNREACH_HOST_FILE.c_str());	if (!ifsUnreachHost){		cerr << "Cannot open " << UNREACH_HOST_FILE << " for input." << endl;		return;	}		string strUnreachHost;	//int i=0;	while( getline(ifsUnreachHost,strUnreachHost) ){		if(strUnreachHost[0]=='\0' || strUnreachHost[0]=='#' 			|| strUnreachHost[0]== '\n'){			continue;		}		CStrFun::Str2Lower( strUnreachHost, strUnreachHost.size() );		//vsUnreachHost.push_back(strUnreachHost);		CMD5 iMD5;		iMD5.GenerateMD5( (unsigned char*)strUnreachHost.c_str(), strUnreachHost.size() );		string strDigest = iMD5.ToString();		setUnreachHostMD5.insert(strDigest);		//i++;		//if(i == MAX_UNREACHABLE_HOST_NUM) break;	}	ifsUnreachHost.close();}/************************************************************************************** *  Function name: SaveTianwangRawData *  Input argv: *  	--	pTianwangFile: tianwang file handle *  	--	pUrl: url *  	--	pPage: web page *  Output argv: *  	-- *  Return: *  Function Description: save raw page data as tianwang file**************************************************************************************/void CCrawl::SaveTianwangRawData(CTianwangFile *pTianwangFile,				CUrl *pUrl, CPage *pPage){	if( !pTianwangFile || !pUrl || !pPage ){		return;	}	file_arg arg;	arg.pUrl = pUrl;	arg.pPage = pPage;	// each thread writes itself, so dnnot need mutex	pTianwangFile->Write((void*)&arg);}/************************************************************************************** *  Function name: SaveLink4SERawData *  Input argv: *  	--	pLink4SEFile: link4SE file handle *  	--	pUrl: url *  	--	pPage: web page *  Output argv: *  	-- *  Return: *  Function Description: save raw page data as tianwang file**************************************************************************************/void CCrawl::SaveLink4SERawData(CLink4SEFile *pLink4SEFile,				CUrl *pUrl, CPage *pPage){	if( !pLink4SEFile || !pUrl || !pPage ){		return;	}	file_arg arg;	arg.pUrl = pUrl;	arg.pPage = pPage;	// each thread writes itself, so dnnot need mutex	pLink4SEFile->Write((void*)&arg);}/************************************************************************************** *  Function name: SaveIsamRawData *  Input argv: *  	--	pUrl: url *  	--	pPage: web page *  Output argv: *  	-- *  Return: *  Function Description: save raw page data as ISAM file**************************************************************************************/void CCrawl::SaveIsamRawData(CUrl *pUrl, CPage *pPage){	if( !pUrl || !pPage ){		return;	}	file_arg arg;	arg.pUrl = pUrl;	arg.pPage = pPage;	pthread_mutex_lock(&mutexIsamFile);	m_isamFile.Write((void *)&arg);	pthread_mutex_unlock(&mutexIsamFile);}/************************************************************************************** *  Function name: SaveVisitedUrl *  Input argv: *  	--	url: url *  Output argv: *  	-- *  Return: *  Function Description: save raw the Visited Url**************************************************************************************/void CCrawl::SaveVisitedUrl(string url){	if( m_ofsVisitedUrlFile ){		pthread_mutex_lock(&mutexVisitedUrlFile);		m_ofsVisitedUrlFile << url << endl;		pthread_mutex_unlock(&mutexVisitedUrlFile);	}}void CCrawl::SaveUnreachHost(string host){	CMD5 iMD5;	iMD5.GenerateMD5( (unsigned char*)host.c_str(), host.size() );	string strDigest = iMD5.ToString();	if(  setUnreachHostMD5.find(strDigest) == setUnreachHostMD5.end() ){		pthread_mutex_lock(&mutexUnreachHost);		setUnreachHostMD5.insert(strDigest);		if( m_ofsUnreachHostFile ){			m_ofsUnreachHostFile << host << endl;		}		pthread_mutex_unlock(&mutexUnreachHost);	}}void CCrawl::SaveLink4SE(CPage *iPage)
💿 文件大小 9 K
👤 上传用户 tiger452
📂 所属分类数据结构
📄 代码行数 1,422 行
💻 语言类型 C++
🏷️ 相关标签

#搜索引擎 #分 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -