📄 crawl.cpp
字号:
cout << "-unwant type host: " << iUrl.m_sHost << endl; return; } // paragraph end #ifdef DEBUG // content encoding cout <<"######Content encoding: ######" << endl << iPage.m_sContentEncoding << endl; #endif char sUnzipContent[1024000]; int nUnzipLength = 0; if( iPage.m_sContentEncoding == "gzip" && iPage.m_sContentType == "text/html" ){ gzFile zip; string ofsGzipName; ofsGzipName = CStrFun::itos(pthread_self()) + ".gz"; ofstream ofsDownloadFile(ofsGzipName.c_str(),ios::trunc | ios::binary); cout << "file_length: " << file_length << endl; ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent); ofsDownloadFile.close(); zip = gzopen(ofsGzipName.c_str(),"rb"); if( zip == NULL ){ cout << "Open zip file " << ofsGzipName.c_str() << " error." << endl; exit(-1); } nUnzipLength = gzread(zip, sUnzipContent, 1024000); if( nUnzipLength == -1 ){ cout << "Read zip file " << ofsGzipName.c_str() << " error." << endl; exit(-1); } sUnzipContent[nUnzipLength]=0; gzclose(zip); } CMD5 iMD5; string strDigest; ///////////////////////////// // because we can make sure the url in the setVisitedUrlMd5 // is not same(we have check it before insert it to the collection), // we intert it directly. however... //iMD5.GenerateMD5( (unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length() ); iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.length() ); strDigest = iMD5.ToString(); pthread_mutex_lock(&mutexVisitedUrlMD5); if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) { cout << "!vurl: "; //1.crawled already pthread_mutex_unlock(&mutexVisitedUrlMD5); return; } setVisitedUrlMD5.insert(strDigest); SaveVisitedUrlMD5(strDigest); pthread_mutex_unlock(&mutexVisitedUrlMD5); ///////////////////////////// // whether it is a visited page // for ImgSE, should comment this paragraph // for SE, should uncomment this paragraph // begin iMD5.GenerateMD5( (unsigned char*)iPage.m_sContent.c_str(), iPage.m_sContent.length() ); strDigest = iMD5.ToString(); pthread_mutex_lock(&mutexVisitedPageMD5); replicas.insert(pair<string, string>(strDigest, iPage.m_sUrl)); if( setVisitedPageMD5.find(strDigest) != setVisitedPageMD5.end() ) { cout << "!vpage: "; // crawled already pthread_mutex_unlock(&mutexVisitedPageMD5); return; } setVisitedPageMD5.insert(strDigest); SaveVisitedPageMD5(strDigest); pthread_mutex_unlock(&mutexVisitedPageMD5); // end /////////////////////// // save as ISAM file //SaveIsamRawData(&iUrl, &iPage); cout << "+"; //////////////////// // save as Tianwang format SaveTianwangRawData(pTianwangFile, &iUrl, &iPage); //////////////////// // save visited Urls if( iPage.m_sLocation.length() < 1 ){ SaveVisitedUrl(iUrl.m_sUrl); } else { SaveVisitedUrl(iPage.m_sLocation); } //return; // just crawl seeds ///////////////////////////////////// // Parse hyperlinks if (iPage.m_sContentType != "text/html") { // we can only find links in tex/html return; }/* if (iPage.ParseHyperLinks() == false){ return; } SaveLink4SE( &iPage); SaveLink4History( &iPage); map<string,string>::iterator it4SE = iPage.m_mapLink4SE.begin(); string str; for( ; it4SE!= iPage.m_mapLink4SE.end(); ++it4SE ){ str = (*it4SE).first; AddUrl( str.c_str() ); }*/ // using XIE Han's link parser struct uri page_uri; //FILE *tmp; //tmp = tmpfile(); //fwrite(iPage.m_sContent.c_str(), iPage.m_nLenContent, 1, tmp); //fseek(tmp, 0, SEEK_SET); //fclose(tmp); pthread_mutex_lock(&mutexDetect); if (iPage.m_sLocation.empty()) { uri_parse_string(iPage.m_sUrl.c_str(), &page_uri); } else { uri_parse_string(iPage.m_sLocation.c_str(), &page_uri); } struct package p={this,&iPage}; //hlink_detect(tmp, &page_uri, onfind, &p); hlink_detect_string(iPage.m_sContent.c_str(), &page_uri, onfind, &p); struct file_arg pLinks = {&iUrl, &iPage}; SaveLink4SE031121( &pLinks ); // save as Link4SE format //SaveLink4SERawData(pLink4SEFile, &iUrl, &iPage); pthread_mutex_unlock(&mutexDetect); uri_destroy(&page_uri); cout << "Parse End......" << endl; return;}voidSaveReplicas(const char* filename){ //ofstream ofs(filename, ios::out|ios::app); ofstream ofs(filename, ios::out|ios::binary|ios::app); if( !ofs ){ cout << "error open file " << endl; } string md5; pthread_mutex_lock(&mutexReplicas); multimap<string, string, less<string> >::const_iterator it; ostringstream *oss = 0; int i = 0; for ( it=replicas.begin(); it != replicas.end(); it ++) { if (!md5.empty() && md5 != it->first) { if (i>=2) ofs<<(*oss).str()<<endl; //pthread_mutex_lock(&mutexMemory); delete(oss); oss = new ostringstream; //pthread_mutex_unlock(&mutexMemory); (*oss)<<it->first<<endl; i = 0; md5 = it->first; } else if (md5.empty()) { md5 = it->first; //pthread_mutex_lock(&mutexMemory); oss = new ostringstream; //pthread_mutex_unlock(&mutexMemory); (*oss)<<it->first<<endl; i = 0; } if (oss != 0) (*oss)<<it->second<<endl; i++; } pthread_mutex_unlock(&mutexReplicas);}////////////////////////////////////////////////////////////////////////////// Construction/Destruction////////////////////////////////////////////////////////////////////////////CCrawl::CCrawl(){}CCrawl::CCrawl(string inputFileName, string outputFileName){ m_sInputFileName = inputFileName; m_sOutputFileName = outputFileName; // + ".txt"}CCrawl::~CCrawl(){ m_ofsVisitedUrlFile.close(); m_ofsLink4SEFile.close(); m_ofsLink4HistoryFile.close(); m_isamFile.Close(); m_ofsVisitedUrlMD5File.close(); m_ofsVisitedPageMD5File.close();}/***************************************************************** ** Function name: SigTerm ** Input argv: ** -- ** Output argv: ** -- ** Return: ** Function Description: signal function ** Version: 1.0 ** Be careful: *****************************************************************/static void SigTerm(int x){ SaveUnvisitedUrl(); SaveReplicas("repli"); cout << "Terminated!" << endl; exit(0);}void CCrawl::GetVisitedUrlMD5(){ ifstream ifsMD5(URL_MD5_FILE.c_str(),ios::binary); if(!ifsMD5){ //cerr << "did not find " << UrlMD5_FILE << " for iutput" << endl; return; } string strMD5; while( getline(ifsMD5,strMD5) ){ setVisitedUrlMD5.insert(strMD5); } ifsMD5.close(); cout << "got " << setVisitedUrlMD5.size() << " md5 values of visited urls" << endl;}void CCrawl::GetVisitedPageMD5(){ ifstream ifsMD5(PAGE_MD5_FILE.c_str(),ios::binary); if(!ifsMD5){ //cerr << "did not find " << PageMD5_FILE << " for iutput" << endl; return; } string strMD5; while( getline(ifsMD5,strMD5) ){ setVisitedPageMD5.insert(strMD5); } ifsMD5.close(); cout << "got " << setVisitedPageMD5.size() << " md5 values of visited pages" << endl;}void CCrawl::GetIpBlock(){ ifstream ifsIpBlock(IP_BLOCK_FILE.c_str()); if (!ifsIpBlock){ //cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl; return; } string strIpBlock; while( getline(ifsIpBlock,strIpBlock) ){ if(strIpBlock[0]=='\0' || strIpBlock[0]=='#' || strIpBlock[0]== '\n'){ continue; } char buf1[64], buf2[64]; buf1[0]='\0'; buf2[0]='\0'; sscanf( strIpBlock.c_str(), "%s %s", buf1, buf2 ); mapIpBlock.insert(valTypeIpBlock( inet_addr(buf1), inet_addr(buf2)) ); } ifsIpBlock.close();}void CCrawl::GetUnreachHostMD5(){ //vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM); ifstream ifsUnreachHost(UNREACH_HOST_FILE.c_str()); if (!ifsUnreachHost){ cerr << "Cannot open " << UNREACH_HOST_FILE << " for input." << endl; return; } string strUnreachHost; //int i=0; while( getline(ifsUnreachHost,strUnreachHost) ){ if(strUnreachHost[0]=='\0' || strUnreachHost[0]=='#' || strUnreachHost[0]== '\n'){ continue; } CStrFun::Str2Lower( strUnreachHost, strUnreachHost.size() ); //vsUnreachHost.push_back(strUnreachHost); CMD5 iMD5; iMD5.GenerateMD5( (unsigned char*)strUnreachHost.c_str(), strUnreachHost.size() ); string strDigest = iMD5.ToString(); setUnreachHostMD5.insert(strDigest); //i++; //if(i == MAX_UNREACHABLE_HOST_NUM) break; } ifsUnreachHost.close();}/************************************************************************************** * Function name: SaveTianwangRawData * Input argv: * -- pTianwangFile: tianwang file handle * -- pUrl: url * -- pPage: web page * Output argv: * -- * Return: * Function Description: save raw page data as tianwang file**************************************************************************************/void CCrawl::SaveTianwangRawData(CTianwangFile *pTianwangFile, CUrl *pUrl, CPage *pPage){ if( !pTianwangFile || !pUrl || !pPage ){ return; } file_arg arg; arg.pUrl = pUrl; arg.pPage = pPage; // each thread writes itself, so dnnot need mutex pTianwangFile->Write((void*)&arg);}/************************************************************************************** * Function name: SaveLink4SERawData * Input argv: * -- pLink4SEFile: link4SE file handle * -- pUrl: url * -- pPage: web page * Output argv: * -- * Return: * Function Description: save raw page data as tianwang file**************************************************************************************/void CCrawl::SaveLink4SERawData(CLink4SEFile *pLink4SEFile, CUrl *pUrl, CPage *pPage){ if( !pLink4SEFile || !pUrl || !pPage ){ return; } file_arg arg; arg.pUrl = pUrl; arg.pPage = pPage; // each thread writes itself, so dnnot need mutex pLink4SEFile->Write((void*)&arg);}/************************************************************************************** * Function name: SaveIsamRawData * Input argv: * -- pUrl: url * -- pPage: web page * Output argv: * -- * Return: * Function Description: save raw page data as ISAM file**************************************************************************************/void CCrawl::SaveIsamRawData(CUrl *pUrl, CPage *pPage){ if( !pUrl || !pPage ){ return; } file_arg arg; arg.pUrl = pUrl; arg.pPage = pPage; pthread_mutex_lock(&mutexIsamFile); m_isamFile.Write((void *)&arg); pthread_mutex_unlock(&mutexIsamFile);}/************************************************************************************** * Function name: SaveVisitedUrl * Input argv: * -- url: url * Output argv: * -- * Return: * Function Description: save raw the Visited Url**************************************************************************************/void CCrawl::SaveVisitedUrl(string url){ if( m_ofsVisitedUrlFile ){ pthread_mutex_lock(&mutexVisitedUrlFile); m_ofsVisitedUrlFile << url << endl; pthread_mutex_unlock(&mutexVisitedUrlFile); }}void CCrawl::SaveUnreachHost(string host){ CMD5 iMD5; iMD5.GenerateMD5( (unsigned char*)host.c_str(), host.size() ); string strDigest = iMD5.ToString(); if( setUnreachHostMD5.find(strDigest) == setUnreachHostMD5.end() ){ pthread_mutex_lock(&mutexUnreachHost); setUnreachHostMD5.insert(strDigest); if( m_ofsUnreachHostFile ){ m_ofsUnreachHostFile << host << endl; } pthread_mutex_unlock(&mutexUnreachHost); }}void CCrawl::SaveLink4SE(CPage *iPage)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -