📄 crawl.cpp

📁 搜索引擎部分代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
{	if( m_ofsLink4SEFile && iPage->m_nRefLink4SENum>0 ){		pthread_mutex_lock(&mutexLink4SEFile);		m_ofsLink4SEFile << "root_url: " << iPage->m_sUrl << endl;		m_ofsLink4SEFile << "charset: " << iPage->m_sCharset << endl;			m_ofsLink4SEFile << "number: " << iPage->m_nRefLink4SENum << endl;		m_ofsLink4SEFile << "link_anchortext: " << endl;				map<string,string>::iterator it4SE = iPage->m_mapLink4SE.begin();		for( ; it4SE!= iPage->m_mapLink4SE.end(); ++it4SE ){			m_ofsLink4SEFile << (*it4SE).first << '\t' << (*it4SE).second << endl;;		}		pthread_mutex_unlock(&mutexLink4SEFile);	}}bool CCrawl::SaveLink4SE031121(void  *arg){	if( !arg || !m_ofsLink4SEFile ) return false;	//pthread_mutex_lock(&mutexLink4SEFile);	if( vsParsedLinks.size() == 0 ) return false;	file_arg *pFile = (file_arg *)arg;	CUrl *iUrl = pFile->pUrl;	CPage *iPage = pFile->pPage;	char strDownloadTime[128];	time_t tDate;	memset(strDownloadTime, 0, 128);	time(&tDate);	strftime(strDownloadTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));	string links;	vector<string>::iterator it = vsParsedLinks.begin();	for( ; it!= vsParsedLinks.end(); ++it ){		links = links + *it + "\n";	}	m_ofsLink4SEFile << "version: 1.0\n";	if( iPage->m_sLocation.size() == 0 ){		m_ofsLink4SEFile << "url: " << iPage->m_sUrl;	}else{		m_ofsLink4SEFile << "url: " << iPage->m_sLocation;		m_ofsLink4SEFile << "\norigin: " << iUrl->m_sUrl;	}	m_ofsLink4SEFile << "\ndate: " << strDownloadTime;		if( mapCacheHostLookup.find(iUrl->m_sHost) == mapCacheHostLookup.end() ){                m_ofsLink4SEFile << "\nip: " << iUrl->m_sHost;        } else {                m_ofsLink4SEFile << "\nip: " << ( *(mapCacheHostLookup.find(iUrl->m_sHost)) ).second;        }	m_ofsLink4SEFile << "\noutdegree: " << vsParsedLinks.size();	m_ofsLink4SEFile << "\nlength: " << iPage->m_nLenHeader + links.size() + 1		  << "\n\n" << iPage->m_sHeader << "\n";	m_ofsLink4SEFile << links;	m_ofsLink4SEFile << endl;	vsParsedLinks.clear();	//pthread_mutex_unlock(&mutexLink4SEFile);	return true;}			// not wellvoid CCrawl::SaveLink4History(CPage *iPage){	if( m_ofsLink4HistoryFile && iPage->m_nRefLink4HistoryNum>0 ){		pthread_mutex_lock(&mutexLink4HistoryFile);		//m_ofsLink4HistoryFile << "root_url: " << iPage->m_sUrl << endl;		//m_ofsLink4HistoryFile << "charset: " << iPage->m_sCharset << endl;			//m_ofsLink4HistoryFile << "number: " << iPage->m_nRefLink4HistoryNum << endl;		//m_ofsLink4HistoryFile << "link: " << endl;				vector<string>::iterator it4History = iPage->m_vecLink4History.begin();		for( ; it4History!= iPage->m_vecLink4History.end(); ++it4History ){			string s = *it4History;			m_ofsLink4HistoryFile << s << endl;		}		pthread_mutex_unlock(&mutexLink4HistoryFile);	}}/************************************************************************************** *  Function name: SaveVisitedUrlMd5 *  Input argv: *  	--	md5: page md5 value *  Output argv: *  	-- *  Return: *  Function Description: save the visited url Md5**************************************************************************************/void CCrawl::SaveVisitedUrlMD5(string md5){	if( m_ofsVisitedUrlMD5File ){		m_ofsVisitedUrlMD5File << md5 << endl;	}}/************************************************************************************** *  Function name: SaveVisitedPageMd5 *  Input argv: *  	--	md5: page md5 value *  Output argv: *  	-- *  Return: *  Function Description: save the visited url Md5**************************************************************************************/void CCrawl::SaveVisitedPageMD5(string md5){	if( m_ofsVisitedPageMD5File ){		m_ofsVisitedPageMD5File << md5 << endl;	}}/************************************************************************************** *  Function name: OpenFileForOutput *  Input argv: *  	-- *  Output argv: *  	-- *  Return: *  Function Description: Open the files for output**************************************************************************************/void CCrawl::OpenFilesForOutput(){	// open isam file for output	m_isamFile.Open(DATA_FILE_NAME, INDEX_FILE_NAME);	// open visited.url file for output	m_ofsVisitedUrlFile.open(m_sOutputFileName.c_str(), ios::out|ios::app|ios::binary);		if( !m_ofsVisitedUrlFile ){		cerr << "cannot open " << VISITED_FILE << " for output\n" << endl;	}	// open link4SE.url file for output	m_ofsLink4SEFile.open(LINK4SE_FILE.c_str(), ios::out|ios::app|ios::binary);		if( !m_ofsLink4SEFile ){		cerr << "cannot open " << LINK4SE_FILE << " for output\n" << endl;	}	// open link4History.url file for output	m_ofsLink4HistoryFile.open(LINK4History_FILE.c_str(), ios::out|ios::app|ios::binary);		if( !m_ofsLink4HistoryFile ){		cerr << "cannot open " << LINK4History_FILE << " for output\n" << endl;	}	// open unreach host file for output	m_ofsUnreachHostFile.open(UNREACH_HOST_FILE.c_str(), ios::out|ios::app|ios::binary);		if( !m_ofsUnreachHostFile ){		cerr << "cannot open " << UNREACH_HOST_FILE << " for output\n" << endl;	}	// open visited url md5 file for output	m_ofsVisitedUrlMD5File.open(URL_MD5_FILE.c_str(), ios::out|ios::app|ios::binary);		if( !m_ofsVisitedUrlMD5File ){		cerr << "cannot open " << URL_MD5_FILE << " for output\n" << endl;	}		// open visited page md5 file for output	m_ofsVisitedPageMD5File.open(PAGE_MD5_FILE.c_str(), ios::out|ios::app|ios::binary);		if( !m_ofsVisitedPageMD5File ){		cerr << "cannot open " << PAGE_MD5_FILE << " for output\n" << endl;	}}/*************************************************************************************** *  Function name: DoCrawl *  Input argv: *  	-- *  Output argv: *  	-- *  Return: *  Function Description: the main function for crawl *  Be careful:***************************************************************************************/void CCrawl::DoCrawl(){	/* set the signal function */	signal(SIGTERM, SigTerm);	signal(SIGKILL, SigTerm);	signal(SIGINT, SigTerm);	signal(SIGPIPE, SIG_IGN);	signal(SIGCHLD,SIG_IGN);	// output the begin time	char strTime[128];	time_t tDate;	memset(strTime,0,128);	time(&tDate);	strftime(strTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));	cout << "\n\nBegin at: " << strTime << "\n\n";	// get the other info from file	GetVisitedUrlMD5();	GetVisitedPageMD5();	GetIpBlock();	GetUnreachHostMD5();	// open the seed url file	ifstream ifsSeed(m_sInputFileName.c_str());	if (!ifsSeed){		cerr << "Cannot open " << m_sInputFileName << " for input\n";		return;	}	// open the files for output	OpenFilesForOutput();	// Create thread ID structures. 	pthread_t *tids = (pthread_t*)malloc(NUM_WORKERS * sizeof(pthread_t)); 	if( tids == NULL){		cerr << "malloc error" << endl;	}	for(unsigned int i=0; i< NUM_WORKERS; i++){		if( pthread_create( &tids[i], NULL, start, this))			cerr << "create threads error" << endl;	}	string strUrl;	CPage iCPage;	while( getline(ifsSeed, strUrl) ){		string::size_type idx;				if(strUrl[0]=='\0' || strUrl[0]=='#' || strUrl[0]== '\n'){			continue;		}		idx = strUrl.find('\t');		if(idx != string::npos){			strUrl = strUrl.substr(0,idx);		}		//idx = strUrl.find("http");		idx = CStrFun::FindCase(strUrl, "http");		if(idx == string::npos){			//continue;			idx = strUrl.find('/');			if( idx == string::npos ){				strUrl = "http://" + strUrl + "/";			}else{				strUrl = "http://" + strUrl;			}		}		//if( strUrl.length() < 8 ) continue;		if( iCPage.IsFilterLink(strUrl) ) continue;		AddUrl(strUrl.c_str());	}	// Get the unvisited URL	ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str());	if( ifsUnvisitedUrl ){		while( getline(ifsUnvisitedUrl, strUrl) ){			string::size_type idx;			if( strUrl[0]=='\0' || strUrl[0]=='#' || strUrl[0]== '\n'){				continue;			}			idx =  strUrl.find('\t');			if(idx != string::npos){				strUrl = strUrl.substr(0,idx);			}			// filter invalid urls			if( iCPage.IsFilterLink(strUrl) ) continue;			AddUrl(strUrl.c_str());		}	}else{		//cerr << "Cannot open " << UNVISITED_FILE << " for input\n";	}	// sleep(30);	b_fOver = true;	cout << "finished to get all unvisited urls." << endl;	// Wait for the threads. 	for (unsigned int i = 0; i < NUM_WORKERS; ++i){		(void)pthread_join(tids[i], NULL);	}		cout << "closed " << NUM_WORKERS << " threads." << endl;	SaveUnvisitedUrl();	SaveReplicas("repli");	memset(strTime,0,128);	time(&tDate);	strftime(strTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));	cout << "\n\nEnd at: " << strTime << "\n\n";}/******************************************************************* Function name: AddUrl** Input argv:**      --** Output argv:**      --** Return:** Function Description: Add a parsed url into the collection** Version: 1.0** Be careful:   An important function!!!*****************************************************************/void CCrawl::AddUrl(const char * url){	string strUrl = url;	if( strUrl.empty() || strUrl.size() < 8 ){ //invalid url		cout << "!so small!" << strUrl << endl;		return;	}		CPage iCPage;        if( iCPage.NormalizeUrl(strUrl) == false ){		// cout << "!normalize fail!" << strUrl << endl;		return;	}	CUrl iUrl;	// for ImgSE, comment the paragraph	// if image/xxx url, store it to link4History.url	// begin	if (iUrl.IsImageUrl(strUrl))	{		if( m_ofsLink4HistoryFile ){			pthread_mutex_lock(&mutexLink4HistoryFile);			m_ofsLink4HistoryFile << strUrl << endl;;			pthread_mutex_unlock(&mutexLink4HistoryFile);		}		return;	}	// end	if( iUrl.ParseUrlEx(strUrl) == false ){		cout << "ParseUrlEx error in AddUrl(): " << strUrl << endl;		return;	}	// if it is an invalid host, discard it	if( iUrl.IsValidHost( iUrl.m_sHost.c_str() ) == false ){		cout << "!invalid host: " << iUrl.m_sHost << endl;    		return;	}	// filter foreign hosts	if( iUrl.IsForeignHost(iUrl.m_sHost) ){		cout << "!foreign hosts: " << iUrl.m_sHost << endl;		return;	}	// if it is a block ip, discard it	// this work is left in the CreatSocket()	// because the work of getting ip is inevitable in the CreatSocket function	// 	and this work is expensive	// if it is an unreach host, discard it	// here we only deal with numbers-and-dots notations	unsigned long inaddr = 0;	char *ip = NULL;	inaddr = (unsigned long)inet_addr( iUrl.m_sHost.c_str() );	if ( inaddr != INADDR_NONE){ // host is just ip		//pthread_mutex_lock(&mutexMemory);		ip = new char[iUrl.m_sHost.size()+1];		//pthread_mutex_unlock(&mutexMemory);		memset(ip, 0, iUrl.m_sHost.size()+1);		memcpy(ip, iUrl.m_sHost.c_str(), iUrl.m_sHost.size());		if( !iUrl.IsValidIp(ip) ){ // out of ip block			//pthread_mutex_lock(&mutexMemory);			delete [] ip; ip = NULL;			//pthread_mutex_unlock(&mutexMemory);			//cout << "!unreach hosts: " << iUrl.m_sHost << endl;			return;		}		//pthread_mutex_lock(&mutexMemory);		delete [] ip; ip = NULL;		//pthread_mutex_unlock(&mutexMemory);	}			CStrFun::Str2Lower( iUrl.m_sHost, iUrl.m_sHost.size() );	CMD5 iMD5;	iMD5.GenerateMD5( (unsigned char*)iUrl.m_sHost.c_str(), iUrl.m_sHost.size() );	string strDigest = iMD5.ToString();	if( setUnreachHostMD5.find(strDigest) != setUnreachHostMD5.end() ){		//cout << "!unreach host! " << iUrl.m_sHost << endl;    		return;	}	// if crawled, discard it	iMD5.GenerateMD5( (unsigned char*)strUrl.c_str(), strUrl.size() );	strDigest = iMD5.ToString();	if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {		// cout << "!visited! " << strUrl << endl;    		return;	}	// if already in the collection, discard it	if( setUnvisitedUrlMD5.find(strDigest) != setUnvisitedUrlMD5.end() ){		// cout << "!in collection! " << strUrl << endl;    		return;	} else {		pthread_mutex_lock(&mutexUnvisitedUrlMD5);		setUnvisitedUrlMD5.insert(strDigest);       		pthread_mutex_unlock(&mutexUnvisitedUrlMD5);	}		// add	// make sure limited threads crawling on a site	int cnt = 0;	for(;;){		//if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){		if(1) {	        	//pthread_mutex_lock(&mutexVisitedUrlMD5);			// if crawled, discard it :) double secure			//if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {				//cout << "!v! " << strUrl << endl;            			//pthread_mutex_unlock(&mutexVisitedUrlMD5);				//return;			//} else {	        		pthread_mutex_lock(&mutexVisitedUrlMD5);				mmapUrls.insert(mvalType( iUrl.m_sHost, strUrl));	        			pthread_mutex_unlock(&mutexVisitedUrlMD5);      				break;			//}		} else {			cnt++;			if( cnt % 100 == 0){				cout << "~";				//cnt = 0;			}	    	        // If we have waiting so long, we may remove it   	                if(cnt == 50000) {				cout << "romove it!!!!!!!!!!!!!!!!!!!" << endl;				break;			}			usleep(4000);		}	}}
上一页 1 23
💿 文件大小 9 K
👤 上传用户 tiger452
📂 所属分类数据结构
📄 代码行数 1,422 行
💻 语言类型 C++
🏷️ 相关标签

#搜索引擎 #分 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -