📄 crawl.cpp

📁 搜索引擎部分代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
#include "Crawl.h"
#include "Url.h"
#include "Md5.h"
#include <list.h>
#include <hlink.h>
#include <uri.h>

extern pthread_mutex_t mymutex;
extern map<string,string> mapCacheHostLookup;
extern vector<string> vsUnreachHost;
extern char **ParseRobot( char *data, char len);

set<string> setVisitedUrlMD5;
set<string> setVisitedPageMD5;
set<string> setUnvisitedUrlMD5;
set<string> setUnreachHostMD5;

multimap<string, string, less<string> > replicas;

pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER;	// unvisited urls
pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER;

pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER;
//pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER;

map<unsigned long,unsigned long> mapIpBlock;
bool b_fOver;

//multimap<string,string, less<string> > mmapUrls;
multimap<string,string > mmapUrls;

typedef map<unsigned long,unsigned long>::value_type valTypeIpBlock;
typedef map<string,string>::value_type mvalType;

void SaveReplicas(const char* filename);

// Argument bundle handed to onfind() through its void* parameter.
struct package
{
	CCrawl *crawl;
	CPage *page;
};

vector<string> vsParsedLinks;

/***********************************************************************
 * Function name: onfind
 * Input argv:
 * 	-- elem: name of the element the link was found in; only compared
 * 	         (case-insensitively) against "img"
 * 	-- attr: not used by this function
 * 	-- uri:  the link; it is destroyed and freed here before returning
 * 	-- arg:  pointer to a struct package (crawler + current page)
 * Return: always 1
 * Function Description: recombines the uri (scheme, authority, path,
 * 	query) into a string, records it in vsParsedLinks and, unless the
 * 	page filters it, either logs it to the crawler's Link4History
 * 	stream (img) or queues it with AddUrl (everything else).
 * Be careful: presumably this is the link callback of the hlink
 * 	library -- confirm against the call site.
 * 	NOTE(review): vsParsedLinks is appended to without any lock; if
 * 	this runs on several threads at once that is a data race.
***********************************************************************/
int onfind(const char *elem, const char *attr, struct uri *uri, void *arg)
{
	struct package *p=(struct package*)arg;
	char buff[URL_LEN+1];

	// Earlier revision used C_URI here instead of the explicit component mask.
	if (uri_recombine(uri, buff, URL_LEN+1, C_SCHEME| C_AUTHORITY| C_PATH| C_QUERY ) >= 0)
	{
		vsParsedLinks.push_back(buff);

		if( !p->page->IsFilterLink(buff) )
		{
			// accept "a,link,frame,iframe,img,area"
			if (strcasecmp(elem, "img") == 0)
			{
				// Image links are only logged, never queued for crawling.
				pthread_mutex_lock(&mutexLink4HistoryFile);
				if( p->crawl->m_ofsLink4HistoryFile ){
					p->crawl->m_ofsLink4HistoryFile << buff << endl;
				}
				pthread_mutex_unlock(&mutexLink4HistoryFile);
			} else {
				p->crawl->AddUrl( buff );
			}
		}
	}

	uri_destroy(uri);
	free(uri);
	return 1;
}

/***********************************************************************
 * Function name: start
 * Input argv:
 * 	-- arg: the CCrawl handle
 * Return: always NULL
 * Function Description: runs CCrawl::fetch on the given handle. The
 * 	void*(void*) signature matches a pthread start routine, so a value
 * 	must be returned; the original fell off the end of the function.
***********************************************************************/
void* start(void *arg)
{
	( (CCrawl*)arg )->fetch(arg);
	return NULL;
}

/*****************************************************************
 * Function name: SaveUnvisitedUrl
 * Function Description: writes the URL (mapped value) of every entry
 * 	in mmapUrls to UNVISITED_FILE, one per line, truncating the file
 * 	first. Exits the process with -1 if the file cannot be opened.
 * Be careful: NOTE(review): mmapUrls is read here without taking
 * 	mutexCollection, while fetch() erases from it under that lock.
 * 	Confirm that no crawler thread is still running when this is
 * 	called, or take the lock at the call site.
 ****************************************************************/
void SaveUnvisitedUrl()
{
	ofstream ofsUnvisitedUrl;
	ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
	if (!ofsUnvisitedUrl) {
		cerr << "cannot open " << UNVISITED_FILE << " for output" << endl;
		exit (-1);
	}

	multimap<string,string>::iterator it = mmapUrls.begin();
	for (; it!=mmapUrls.end(); ++it) {
		ofsUnvisitedUrl << ((*it).second).c_str() << "\n";
	}

	ofsUnvisitedUrl << endl;
	ofsUnvisitedUrl.close();
}

/***********************************************************************
 * Function name: fetch
 * Input argv:
 * 	-- arg: the CCrawl handle; DownloadFile is invoked through it
 * Function Description: worker loop. Repeatedly takes the first entry
 * 	out of mmapUrls (under mutexCollection), parses it and downloads it
 * 	with DownloadFile, writing to per-thread Tianwang and Link4SE files
 * 	whose names end in the thread id. One socket is kept between
 * 	downloads and dropped whenever the host changes.
 * 	When the collection is empty the thread sleeps 1 ms and counts the
 * 	sleep; the loop ends once b_fOver is set and at least 200 such
 * 	sleeps have been counted.
 * Be careful: b_fOver is read without synchronization.
***********************************************************************/
void CCrawl::fetch(void *arg)
{
	string strUrl;
	int	nGSock = -1;
	string	strGHost = "";

	// create a Tianwang file for output the raw page data
	string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self());
	CTianwangFile tianwangFile(ofsName);

	// create a Link4SE file for output the raw link data
	ofsName = DATA_LINK4SE_FILE + "." + CStrFun::itos(pthread_self());
	CLink4SEFile link4SEFile(ofsName);

	int iSleepCnt=0;
	for(;;){
		pthread_mutex_lock(&mutexCollection);
		int cnt = mmapUrls.size();
		if(cnt > 0){
			cout << "collection has: " << cnt << " unvisited urls" << endl;

			// The collection is non-empty and locked, so begin() is valid:
			// take the first URL and remove it from the collection.
			multimap<string,string>::iterator it=mmapUrls.begin();
			strUrl = (*it).second;
			mmapUrls.erase( it );
			pthread_mutex_unlock(&mutexCollection);

			// parse URL
			CUrl iUrl;
			if( iUrl.ParseUrlEx(strUrl) == false ){
				cout << "ParseUrlEx error in fetch(): " << strUrl << endl;
				continue;
			}

			// A different host means the kept socket cannot be reused.
			if( strGHost != iUrl.m_sHost ){
				if( nGSock != -1 ){
					close( nGSock );
					nGSock = -1;
				}
				strGHost = iUrl.m_sHost;
			}

			(( CCrawl* )arg)->DownloadFile(&tianwangFile,&link4SEFile,iUrl,nGSock);
		} else {
			pthread_mutex_unlock(&mutexCollection);
			usleep(1000);
			iSleepCnt++;
		}

		// ">=" rather than "==": the counter is never reset, so if it had
		// already passed 200 when b_fOver was set, an equality test would
		// never fire and the thread would spin forever.
		if( b_fOver == true && iSleepCnt >= 200 )
			break;
	}

	// Release the socket that was still being kept for reuse.
	if( nGSock != -1 ){
		close( nGSock );
		nGSock = -1;
	}

	tianwangFile.Close();
	link4SEFile.Close();
}

/***********************************************************************
 * Function name: DownloadFile
 * Input argv:
 * 	-- pTianwang: the CCrawl handle
 * 	-- pLink4SE: the CCrawl handle
 * 	-- iUrl: the URL for crawling
 * 	-- nGSock: the previous global socket
 * Output argv:
 * 	--
 * Return:
***********************************************************************/
void CCrawl::DownloadFile(CTianwangFile *pTianwangFile,
	CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock)
{
	char	*downloaded_file = NULL,
		*fileHead = NULL,
		*location = NULL;
	int file_length = 0;
	string
strUrlLocation = "";	int nSock = nGSock;	cout << "1. pid=" << pthread_self() << " sock = " << nGSock << endl;	CHttp http;	file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead, &location, &nSock);	#ifdef DEBUG	// just download		cout << "######file length: ######" << file_length << endl;		cout << "######head: ######" << fileHead << endl;	#endif	int nCount = 0;	while( file_length == -300 ){ // moved to an another place		if( strlen(location) > URL_LEN-1 || nCount == 3 || strlen(location)==0 ){			if( location )			{				//pthread_mutex_lock(&mutexMemory); 				free( location ); location = NULL;				//pthread_mutex_unlock(&mutexMemory);			}			file_length = -1;			break;		}		strUrlLocation = location;		if(location)		{			//pthread_mutex_lock(&mutexMemory);			free(location); location = NULL;			//pthread_mutex_unlock(&mutexMemory);		}		string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http");		if( idx1 != 0 ){			char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length()-1);			char c2 = strUrlLocation.at(0);			if( c2 == '/' ){				strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation;			}else if(  c1!='/' && c2!='/'){				string::size_type idx;                                                                                                                                        idx = iUrl.m_sUrl.rfind('/');                                if( idx != string::npos ){                                        if( idx > 6 ){ // > strlen("http://..")                                                strUrlLocation = iUrl.m_sUrl.substr(0, idx+1) + strUrlLocation;                                        } else {                                                strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;                                        }                                                                                                                                        } else {					file_length = -1;					break;                                }			} else {				if( c1=='/' 
){                                        strUrlLocation = iUrl.m_sUrl + strUrlLocation;                                } else {                                        strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;                                }                        }		}		CPage iPage;		if( iPage.IsFilterLink(strUrlLocation) ){			file_length = -1;			break;		}		cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl;		file_length = http.Fetch( strUrlLocation, &downloaded_file, &fileHead, &location, &nSock);		nCount++;	}	nGSock = nSock;	if(file_length == -1){ // unreachable, skipped.		cout << "!-: " << iUrl.m_sUrl << endl;		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		cout << "-unreach host: " << iUrl.m_sHost << endl;;		return;	}	if(file_length == -2){ // out of ip block .		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		// save unreach host                SaveUnreachHost(iUrl.m_sHost);		cout << "-out of block host: " << iUrl.m_sHost << endl;;		return;	}	if(file_length == -3) { // invalid host or ip		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		cout << "-invalid host: " << iUrl.m_sHost << endl;		return;	}	if(file_length == -4) {	// MIME is image/xxx		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		if( m_ofsLink4HistoryFile ){			pthread_mutex_lock(&mutexLink4HistoryFile);			m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;;			
pthread_mutex_unlock(&mutexLink4HistoryFile);		}		cout << "-imgage host: " << iUrl.m_sHost << endl;		return;	}	/* still experiment	char **dir;	dir =  ParseRobot( downloaded_file, file_length);	for( int i = 0; dir[i] != NULL ; i++){		cout << dir[i] << endl;		free( dir[i] );	}	exit(1);	*/	// so small, maybe some unuseful info, skipped	//if(file_length < 40){	// for ImgSE, /*	if(file_length < 256){	// for SE		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		cout << "#";		return;	}*/	// deal with normal page	if (!fileHead || !downloaded_file)	{		//pthread_mutex_lock(&mutexMemory);		if (fileHead)		{			free(fileHead); fileHead=NULL;		}		if (downloaded_file)		{			free(downloaded_file); downloaded_file=NULL;		}		//pthread_mutex_unlock(&mutexMemory);		close(nGSock);		nGSock = -1;		cout << "-size0 host: " << iUrl.m_sHost << endl;		return;	}	CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file, file_length);	//pthread_mutex_lock(&mutexMemory);	if (fileHead)	{		free(fileHead); fileHead=NULL;	}	if (downloaded_file)	{		free(downloaded_file); downloaded_file=NULL;	}	//pthread_mutex_unlock(&mutexMemory);	iPage.ParseHeaderInfo(iPage.m_sHeader);	if( iPage.m_bConnectionState == false ){		close(nGSock);		nGSock = -1;	}	// when crawling images for ImgSE, remember to comment the paragraph	// when crawling plain text for SE, remember to open the paragraph	// paragraph begin	// iPage.m_sContentType != "text/css" &&	if( iPage.m_sContentType != "text/html" && 		iPage.m_sContentType != "text/plain" &&		iPage.m_sContentType != "text/xml" &&		iPage.m_sContentType != "application/msword" &&		iPage.m_sContentType != "application/pdf" &&		iPage.m_sContentType != "text/rtf" &&		iPage.m_sContentType != "application/postscript" &&		iPage.m_sContentType != "application/vnd.ms-execl" &&		iPage.m_sContentType != 
"application/vnd.ms-powerpoint" ){
12 3 下一页
💿 文件大小 9 K
👤 上传用户 tiger452
📂 所属分类数据结构
📄 代码行数 1,422 行
💻 语言类型 C++
🏷️ 相关标签

#搜索引擎 #分 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -