⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 main.cpp

📁 小型搜索引擎,用C/C++编写,属于全文搜索引擎
💻 CPP
📖 第 1 页 / 共 2 页
字号:
#include "Tse.h"
#include "HttpTse.h"
#include "StrFun.h"
#include "Url.h"
#include "Page.h"
#include <openssl/md5.h>
#include <zlib.h>

using namespace std;

extern int IsamFile(const char* url, char* downloaded_file, int len);
//extern int FindUrl(const char* url, char **content);
extern int FindKey(const char* key);
extern int ScanPageContent(char*, int);
extern void DoSearch();
extern void DoHelp();
extern map<string,string> mapCacheHostLookup;

bool IsFilterLink(string plink);
int NormallizeUrl(string& strUrl);
void DownloadFile(CUrl,int&);
void *fetch(void *arg);
static void SigTerm(int x);
void GetVisitedUrlMd5();
void GetIpBlock();
void GetUnreachHost();
void SaveMd5();
static char *pt(unsigned char *md);

vector<string> vsUnreachHost;
bool f_bUnreachHost;
string f_strOutFile;

// Guards mmapUrls and b_fOver, which are shared between main() and the fetch() workers.
pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;

// Work queue: host -> URL waiting to be downloaded from that host.
multimap<string,string> mmapUrls;
typedef map<string,string>::value_type mvalType;

map<unsigned long,unsigned long> mapIpBlock;
typedef map<unsigned long,unsigned long>::value_type valTypeIpBlock;

set<string> setVisitedUrlMd5;

// Set by main() once every seed URL has been queued; tells the workers to stop.
bool b_fOver = false;

/*
 * Print "<label><current time in GMT>" framed by two blank lines on each side.
 * label is the text placed in front of the timestamp, e.g. "Begin at: ".
 */
static void PrintGmtTime(const char *label)
{
	char strTime[128];
	time_t tDate;

	memset(strTime, 0, sizeof(strTime));
	time(&tDate);
	strftime(strTime, sizeof(strTime), "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
	cout << "\n\n" << label << strTime << "\n\n";
}

/*
 * Program entry point.
 *
 *   <prog> -s                            run DoSearch()
 *   <prog> -c <seed file> <output file>  crawl the URLs listed in <seed file>
 *
 * Any other command line ends up in DoHelp().
 *
 * In crawl mode the seed file is read line by line; each acceptable URL is
 * pushed onto mmapUrls, from which NUM_WORKERS fetch() threads consume.
 * Returns -1 when the seed file cannot be opened or no worker could be
 * started; a completed crawl leaves through exit(0).
 */
int main(int argc, char* argv[])
{
	if( argc < 2 ){
		DoHelp();
		// Defensive: argv[1] must not be touched if DoHelp() ever returns.
		return 1;
	}

	if( !strncmp(argv[1],"-s",2) && argc==2 ){
		DoSearch();
		// Defensive: never fall through into crawl mode without its arguments.
		return 0;
	}

	if( !strncmp(argv[1],"-c",2) && argc==4 ){
		f_strOutFile = argv[3];
	}else{
		DoHelp();
		return 1;
	}

	/* set the signal function */
	// SIGKILL is deliberately absent: it cannot be caught, so installing a
	// handler for it always fails.
	signal(SIGTERM, SigTerm);
	signal(SIGINT, SigTerm);
	signal(SIGPIPE, SIG_IGN);
	signal(SIGCHLD, SIG_IGN);

	/* First --------> */
	PrintGmtTime("Begin at: ");

	// NOTE(review): presumably these fill setVisitedUrlMd5, mapIpBlock and
	// vsUnreachHost respectively - their definitions are not in this view.
	GetVisitedUrlMd5();
	GetIpBlock();
	GetUnreachHost();

	/* Second --------> get seeds and download files */
	ifstream ifsSeed(argv[2]);
	if( !ifsSeed ){
		cerr << "Cannot open " << argv[2] << " for input\n";
		return -1;
	}

	// Start the workers. Only threads that were really created are remembered,
	// so the join loop below never sees an uninitialised thread id.
	vector<pthread_t> tids;
	tids.reserve(NUM_WORKERS);
	for(unsigned int i=0; i< NUM_WORKERS; i++){
		pthread_t tid;
		if( pthread_create(&tid, NULL, fetch, NULL) ){
			cerr << "create threads error" << endl;
			continue;
		}
		tids.push_back(tid);
	}
	if( tids.empty() ){
		// Without a consumer the enqueue loop below would wait forever.
		return -1;
	}

	string strUrl;
	int nUrlNum = 0;
	while( getline(ifsSeed, strUrl) ){
		// Skip blank lines and '#' comment lines.
		if( strUrl.empty() || strUrl[0]=='\0' || strUrl[0]=='#' || strUrl[0]=='\n' ){
			continue;
		}

		// Only the first tab-separated field is the URL.
		string::size_type idx = strUrl.find('\t');
		if( idx != string::npos ){
			strUrl = strUrl.substr(0,idx);
		}

		if( strUrl.length() < 8 ) continue;

		// Cut the URL at the first embedded NUL and limit it to URL_LEN-1
		// characters, without reading or writing past any buffer.
		string strBounded( strUrl.c_str() );
		if( strBounded.length() > static_cast<string::size_type>(URL_LEN-1) ){
			strBounded.resize(URL_LEN-1);
		}
		strUrl.swap(strBounded);

		CUrl iUrl;
		iUrl.ParseUrl(strUrl);

		// If the host already has a cached address and IP ranges are
		// configured, drop the URL unless the address matches one of them.
		map<string,string>::iterator itHost = mapCacheHostLookup.find(iUrl.m_sHost);
		if( itHost != mapCacheHostLookup.end() ){
			const string strHostIp = (*itHost).second;
			const unsigned long inaddr =
				static_cast<unsigned long>( inet_addr(strHostIp.c_str()) );

			if( mapIpBlock.size() > 0 ){
				bool bInRange = false;
				map<unsigned long,unsigned long>::iterator pos;
				for(pos=mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos ){
					const unsigned long ret = inaddr & ~((*pos).second);
					if( ret == (*pos).first ){
						bInRange = true;
						break;
					}
				}
				if( !bInRange ) continue;
			}
		}

		// If crawled already, discard it.
		// NOTE(review): setVisitedUrlMd5 is read here without a lock; confirm
		// that no worker thread modifies it while the seeds are being read.
		unsigned char sMd[17];
		sMd[16] = '\0';
		MD5((const unsigned char*)strUrl.c_str(), strUrl.length(), sMd);
		char *p = pt(sMd);
		if( setVisitedUrlMd5.count((const char*)p) > 0 ){
			cout << "!";	//1.crawled already
			continue;
		}

		vector<string>::iterator itResult =
			find( vsUnreachHost.begin(), vsUnreachHost.end(), iUrl.m_sHost );
		if( itResult != vsUnreachHost.end() ){
			cout << iUrl.m_sHost << " is an unreachable site." << endl;
			continue;
		}

		// Progress indicator: report the queue length every 10 accepted URLs.
		nUrlNum ++;
		if( nUrlNum == 10 ){
			pthread_mutex_lock(&mymutex);
			const multimap<string,string>::size_type nQueued = mmapUrls.size();
			pthread_mutex_unlock(&mymutex);

			cout << "cache(" << nQueued << ")";
			nUrlNum = 0;
		}

		// Queue the URL, waiting while this host already has
		// NUM_WORKERS_ON_A_SITE entries pending. A '~' is printed after
		// every 100 waits.
		int cnt = 0;
		for(;;){
			if( cnt == 100 ){
				cout << "~";
				cnt = 0;
			}

			pthread_mutex_lock(&mymutex);
			if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){
				mmapUrls.insert(mvalType( iUrl.m_sHost, strUrl));
				pthread_mutex_unlock(&mymutex);
				break;
			}
			pthread_mutex_unlock(&mymutex);

			usleep(4000);
			cnt++;
		}
	}

	// Give the workers time to work on what is still queued, then tell them
	// to stop. The flag is written under the mutex the workers read it under.
	sleep(30);
	pthread_mutex_lock(&mymutex);
	b_fOver = true;
	pthread_mutex_unlock(&mymutex);
	cout << "finished to get all unvisited urls." << endl;

	// Wait for the threads.
	for(vector<pthread_t>::size_type i = 0; i < tids.size(); ++i){
		(void)pthread_join(tids[i], NULL);
	}
	cout << "closed " << tids.size() << " threads." << endl;

	SaveMd5();

	PrintGmtTime("End at: ");

	exit(0);
}

/*
 * Worker thread body.
 *
 * Repeatedly removes the first entry of mmapUrls and downloads it with
 * DownloadFile(), passing along one socket descriptor that is kept for as
 * long as consecutive URLs are on the same host. When the queue is empty the
 * thread sleeps 1 ms and retries, printing "w." after every 100 empty polls.
 * The loop ends once b_fOver is set.
 *
 * arg is unused. Always returns NULL.
 */
void *fetch(void *arg)
{
	(void)arg;

	string strUrl;
	int	nGSock = -1;		// socket carried over between downloads, -1 = none
	string	strGHost = "";		// host that nGSock belongs to
	int cnt = 0;			// consecutive empty polls

	for(;;){
		bool bGotUrl = false;

		pthread_mutex_lock(&mymutex);
		if( !mmapUrls.empty() ){
			multimap<string,string>::iterator it = mmapUrls.begin();
			strUrl = (*it).second;
			mmapUrls.erase( it );
			bGotUrl = true;
		}
		pthread_mutex_unlock(&mymutex);

		if( bGotUrl ){
			CUrl iUrl;
			iUrl.ParseUrl(strUrl);

			// A different host: the carried-over socket is of no use any more.
			if( strGHost != iUrl.m_sHost ){
				if( nGSock != -1 ){
					close( nGSock );
				}
				nGSock = -1;
				strGHost = iUrl.m_sHost;
			}

			DownloadFile(iUrl, nGSock);
			cnt = 0;
		}else{
			usleep(1000);
			cnt++;
		}

		// Read the stop flag under the same mutex main() sets it under.
		pthread_mutex_lock(&mymutex);
		const bool bOver = b_fOver;
		pthread_mutex_unlock(&mymutex);

		if( bOver ){
			break;
		}else if( cnt == 100 ){
			cout << "w." ;
			cnt = 0;
		}
	}

	// Do not leave the carried-over socket open when the thread ends.
	if( nGSock != -1 ){
		close( nGSock );
	}

	return NULL;
}

// NOTE(review): only the first part of DownloadFile() is visible here; the
// function continues beyond the end of this listing. Its statements are left
// exactly as found.
// NOTE(review): strlen(location) below runs before the NULL checks on
// location - confirm HttpFetch() always sets it when it returns -2.
void DownloadFile(CUrl iUrl, int& nGSock)
{
	char	*downloaded_file = NULL,
		*fileHead = NULL,
		*location = NULL;
	int file_length = 0;
	string strUrlLocation = "";
	int nSock = nGSock;

	cout << "1. " << pthread_self() << " sock = " << nGSock << endl;
	file_length = HttpFetch(iUrl.m_sUrl, &downloaded_file, &fileHead, &location, &nSock);

#ifdef DEBUG	// Just download
	cout << "#######file length: ######" << file_length << endl;
	cout << "#######head: ######" << fileHead << endl;
#endif

	int nCount = 0;
	while (file_length == -2){
		if( strlen(location) > URL_LEN-1 || nCount == 1){
			if( location ) free( location );
			location = NULL;
			file_length = -1;
			break;
		}
		strUrlLocation = location;
		if(location) free(location);
		location = NULL;
		cout << "2. " << pthread_self() << " sock = " << nGSock << endl;
		file_length = HttpFetch( strUrlLocation, &downloaded_file, &fileHead, &location, &nSock);
		nCount = 1;
	}
	nGSock = nSock;

	if(file_length == -1){ // unreachable, skipped.
		//cout << "-" << iUrl.m_sUrl << endl;
		cout << "-";
		if(fileHead) free(fileHead);
		return;
	}
	else if(file_length < 40){
		// so small, maybe some unuseful info, skipped
		if(fileHead) free(fileHead);
		if(downloaded_file) free(downloaded_file);
		cout << "#";
		return;
	}else{
		CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file, file_length);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -