⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 main.cpp

📁 小型搜索引擎,用C/C++编写,属于全文搜索引擎
💻 CPP
📖 第 1 页 / 共 2 页
字号:
		if(fileHead) free(fileHead);		if(downloaded_file) free(downloaded_file);		iPage.GetContentEncoding();		iPage.GetContentType();		#ifdef DEBUG    // content encoding			cout <<"######Content encoding: ######" << endl 				<< iPage.m_sContentEncoding << endl;		#endif		char sUnzipContent[1024000];		int  nUnzipLength = 0;		if( iPage.m_sContentEncoding == "gzip" 			&& iPage.m_sContentType == "text/html" ){			gzFile zip;  			string ofsGzipName;			ofsGzipName = CStrFun::itos(pthread_self()) + ".gz";			ofstream ofsDownloadFile(ofsGzipName.c_str(),ios::trunc | ios::binary);			cout << "file_length: " << file_length << endl;                	ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent);			ofsDownloadFile.close();			zip = gzopen(ofsGzipName.c_str(),"rb");  			if( zip == NULL ){				cout << "Open zip file " << ofsGzipName.c_str() << " error." << endl;				exit(-1);			}			nUnzipLength = gzread(zip, sUnzipContent, 1024000);			if( nUnzipLength == -1 ){				cout << "Read zip file " << ofsGzipName.c_str() << " error." << endl;				exit(-1);			}   			sUnzipContent[nUnzipLength]=0;    			gzclose(zip); 		}		unsigned char sMd[17];		char *p;		sMd[16] = '\0';		MD5((const unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length(), sMd);		p=pt(sMd);		pthread_mutex_lock(&mymutex);		if( setVisitedUrlMd5.count((const char*)p) > 0 ){			cout << "!";    //1.crawled already			pthread_mutex_unlock(&mymutex);			return;		}		setVisitedUrlMd5.insert((const char*)p);		//IsamFile(iPage.m_sUrl.c_str(), (char*)iPage.m_sContent.c_str(), iPage.m_nLenContent);		pthread_mutex_unlock(&mymutex);/*		cout << endl << "Downloading " << iPage.m_sUrl << " ... "			<< iPage.m_nLenContent << " bytes." << endl;*/		cout << "+";		// save as Tianwang format		string ofsName;		ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self());		ofstream ofsTianwangFile(ofsName.c_str(),ios::app|ios::binary);		if(!ofsTianwangFile){			cerr << "cannot open " << ofsName << "for output" << endl;			exit(-1);		}		char strDownloadTime[128];		time_t tDate;		memset(strDownloadTime,0,128);		time(&tDate);		strftime(strDownloadTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));		ofsTianwangFile << "version: 1.0\n";		//if( iPage.m_sLocation.empty() == true ){		if( iPage.m_sLocation.length() == 0 ){			 ofsTianwangFile << "url: " << iUrl.m_sUrl;		}else{			 ofsTianwangFile << "url: " << iPage.m_sLocation;			 ofsTianwangFile << "\norigin: " << iUrl.m_sUrl;		}		ofsTianwangFile << "\ndate: " << strDownloadTime;		if( mapCacheHostLookup.find(iUrl.m_sHost) == mapCacheHostLookup.end() ){			ofsTianwangFile <<"\nip: " << iUrl.m_sHost;		}else{			ofsTianwangFile <<"\nip: " << ( *(mapCacheHostLookup.find(iUrl.m_sHost)) ).second;		}		ofsTianwangFile << "\nlength: " << iPage.m_nLenContent + iPage.m_nLenHeader + 1 << "\n\n"			<< iPage.m_sHeader << "\n";		ofsTianwangFile.write( iPage.m_sContent.c_str(), iPage.m_nLenContent);		ofsTianwangFile << "\n";		ofsTianwangFile.close();				//save visited Urls		ofsName = VISITED_FILE + "." + CStrFun::itos(pthread_self());		ofstream ofsVisitedUrl(ofsName.c_str(),ios::app|ios::binary);		if(!ofsVisitedUrl){			cerr << "cannot open " << VISITED_FILE << "for output" << endl;			exit(-1);		}		//if( iPage.m_sLocation.empty() == true) {		if( iPage.m_sLocation.length() == 0 ){			ofsVisitedUrl << iUrl.m_sUrl << endl;		}else{			ofsVisitedUrl << iPage.m_sLocation << endl;		}					ofsVisitedUrl.close();		iPage.GetCharset();		// parse links		if( nUnzipLength != 0 ){			iPage.m_sContent = sUnzipContent;			iPage.m_nLenContent = nUnzipLength;		}		if( iPage.m_sContent.empty() ) return;		iPage.GetContentLinkInfo();		//cout << endl << "##ContentLinkInfo:" << endl;		//cout << iPage.m_sContentLinkInfo << endl;		if( iPage.m_sContentLinkInfo.empty() ) return;		iPage.GetLinkInfo4SE();		//cout << endl << "##LinkInfo4SE:" << endl;		//cout << iPage.m_sLinkInfo4SE << endl;		iPage.GetLinkInfo4History();		//cout << endl << "##LinkInfo4History:" << endl;		//cout << iPage.m_sLinkInfo4History << endl;		if( iPage.m_sLinkInfo4SE.empty() ) return;		iPage.FindRefLink4SE();/*		cout << endl << "4SE total: " << iPage.m_mapLink4SE.size() << " links." << endl;		map<string,string >::iterator it4SE = iPage.m_mapLink4SE.begin();		for ( ; it4SE != iPage.m_mapLink4SE.end(); ++it4SE ){			cout << (*it4SE).first << '\t' << (*it4SE).second << endl;		}					exit(0);*/		iPage.FindRefLink4History();		// save history link. such as <img ...>		ofsName = HISTORY_LINK_FILE + "." + CStrFun::itos(pthread_self());		ofstream ofsHistoryLink(ofsName.c_str(),ios::app|ios::binary);		if(!ofsHistoryLink){			cerr << "cannot open " << ofsName.c_str() << "for output" << endl;			exit(-1);		}		vector<string>::iterator it4His = iPage.m_vecLink4History.begin();		for ( ; it4His != iPage.m_vecLink4History.end(); ++it4His ){			ofsHistoryLink << *it4His << endl;		}		ofsHistoryLink.close();		ofsName = f_strOutFile + "." + CStrFun::itos(pthread_self());		ofstream ofsUnvisitUrl(ofsName.c_str(),ios::app|ios::binary);		if(!ofsUnvisitUrl){			cerr << "cannot open " << f_strOutFile << "for output" << endl;			exit(-1);		}			ofsName = VISITED_LINK_FILE + "." + CStrFun::itos(pthread_self());		ofstream ofsVisitedUrlLink(ofsName.c_str(),ios::app|ios::binary);		if(!ofsVisitedUrlLink){			cerr << "cannot open " << VISITED_LINK_FILE << "for output" << endl;			exit(-1);		}		ofsVisitedUrlLink << "########" << endl;		ofsVisitedUrlLink << iPage.m_sUrl << "\t\t" << iPage.m_sCharset << endl;		//save url links		map<string,string,less<string> >::iterator it;		for ( it = iPage.m_mapLink4SE.begin(); it != iPage.m_mapLink4SE.end(); ++it ){			string strRefLink = (*it).first;			string::size_type idx;			idx = strRefLink.find('?');			if(idx != string::npos ){				idx = strRefLink.find('/');				if(idx != string::npos ){					strRefLink = strRefLink.substr(0,idx);				}				if(strRefLink.length() > 88){					continue;				}			}			if( strRefLink.length() > 141){				continue;			}/*			if( FindUrl(strRefLink.c_str(), NULL) == -1 ){				ofsUnvisitUrl << strRefLink << endl;			}*/			unsigned char sMd[17];			char *p;			sMd[16] = '\0';			MD5((const unsigned char*)strRefLink.c_str(), strRefLink.length(), sMd);			p=pt(sMd);			if( setVisitedUrlMd5.count((const char*)p) == 0 ){				ofsUnvisitUrl << strRefLink << endl;			}			if( (*it).second == "" ){				ofsVisitedUrlLink << (*it).first << endl;			}else{				if( iPage.m_sCharset == ""){					ofsVisitedUrlLink << (*it).first << "\t" 						<< (*it).second << endl;				}else{					ofsVisitedUrlLink << (*it).first << "\t" 						<< (*it).second << "\t" << iPage.m_sCharset << endl;				}			}		}		ofsUnvisitUrl.close();		ofsVisitedUrlLink.close();	}	return;}static void SigTerm(int x){	SaveMd5();	cout << "Terminated!" << endl;	exit(0);}void SaveMd5(){	ofstream ofsMd5(MD5_FILE.c_str(),ios::trunc | ios::binary);	if(!ofsMd5){		cerr << "cannot open " << MD5_FILE << "for output" << endl;		exit(-1);	}		set<string>::iterator it=setVisitedUrlMd5.begin();	for( ; it!=setVisitedUrlMd5.end(); ++it ){		//ofsMd5.write( (*it).c_str(), 16);		ofsMd5 << (*it).c_str();		ofsMd5 << "\n";	}	cout << endl << "saved " << setVisitedUrlMd5.size() 		<< " visited url md5 values" << endl;	ofsMd5.close();}void GetVisitedUrlMd5(){	ifstream ifsMd5(MD5_FILE.c_str(),ios::binary);	if(!ifsMd5){		cerr << "did not find " << MD5_FILE << " for iutput" << endl;	}		string strMd5;	while( getline(ifsMd5,strMd5) ){		setVisitedUrlMd5.insert(strMd5);		}	ifsMd5.close();	cout << "got " << setVisitedUrlMd5.size() << " md5 values of visited urls" << endl;}void GetIpBlock(){	ifstream ifsIpBlock(IP_BLOCK_FILE.c_str());	if (!ifsIpBlock){		cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl;	}	string strIpBlock;	while( getline(ifsIpBlock,strIpBlock) ){		if(strIpBlock[0]=='\0' || strIpBlock[0]=='#' 			|| strIpBlock[0]== '\n'){			continue;		}		char buf1[64], buf2[64];                buf1[0]='\0'; buf2[0]='\0';                sscanf( strIpBlock.c_str(), "%s %s", buf1, buf2 );		mapIpBlock.insert(valTypeIpBlock( inet_addr(buf1), inet_addr(buf2)) );		}	ifsIpBlock.close();}void GetUnreachHost(){	vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM);	ifstream ifsUnreachHost(UNREACH_HOST.c_str());	if (!ifsUnreachHost){		cerr << "Cannot open " << UNREACH_HOST << " for input." << endl;	}		string strUnreachHost;	int i=0;	while( getline(ifsUnreachHost,strUnreachHost) ){		if(strUnreachHost[0]=='\0' || strUnreachHost[0]=='#' 			|| strUnreachHost[0]== '\n'){			continue;		}		vsUnreachHost.push_back(strUnreachHost);		i++;		if(i == MAX_UNREACHABLE_HOST_NUM) break;	}	ifsUnreachHost.close();}static char *pt(unsigned char *md){	int i;	static char buf[33];	for (i=0; i<16; i++)		sprintf(&(buf[i*2]),"%02x",md[i]);	//printf("len=%2d %s\n",strlen(buf),buf);	return(buf);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -