📄 main.cpp
字号:
if(fileHead) free(fileHead); if(downloaded_file) free(downloaded_file); iPage.GetContentEncoding(); iPage.GetContentType(); #ifdef DEBUG // content encoding cout <<"######Content encoding: ######" << endl << iPage.m_sContentEncoding << endl; #endif char sUnzipContent[1024000]; int nUnzipLength = 0; if( iPage.m_sContentEncoding == "gzip" && iPage.m_sContentType == "text/html" ){ gzFile zip; string ofsGzipName; ofsGzipName = CStrFun::itos(pthread_self()) + ".gz"; ofstream ofsDownloadFile(ofsGzipName.c_str(),ios::trunc | ios::binary); cout << "file_length: " << file_length << endl; ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent); ofsDownloadFile.close(); zip = gzopen(ofsGzipName.c_str(),"rb"); if( zip == NULL ){ cout << "Open zip file " << ofsGzipName.c_str() << " error." << endl; exit(-1); } nUnzipLength = gzread(zip, sUnzipContent, 1024000); if( nUnzipLength == -1 ){ cout << "Read zip file " << ofsGzipName.c_str() << " error." << endl; exit(-1); } sUnzipContent[nUnzipLength]=0; gzclose(zip); } unsigned char sMd[17]; char *p; sMd[16] = '\0'; MD5((const unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length(), sMd); p=pt(sMd); pthread_mutex_lock(&mymutex); if( setVisitedUrlMd5.count((const char*)p) > 0 ){ cout << "!"; //1.crawled already pthread_mutex_unlock(&mymutex); return; } setVisitedUrlMd5.insert((const char*)p); //IsamFile(iPage.m_sUrl.c_str(), (char*)iPage.m_sContent.c_str(), iPage.m_nLenContent); pthread_mutex_unlock(&mymutex);/* cout << endl << "Downloading " << iPage.m_sUrl << " ... " << iPage.m_nLenContent << " bytes." << endl;*/ cout << "+"; // save as Tianwang format string ofsName; ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self()); ofstream ofsTianwangFile(ofsName.c_str(),ios::app|ios::binary); if(!ofsTianwangFile){ cerr << "cannot open " << ofsName << "for output" << endl; exit(-1); } char strDownloadTime[128]; time_t tDate; memset(strDownloadTime,0,128); time(&tDate); strftime(strDownloadTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); ofsTianwangFile << "version: 1.0\n"; //if( iPage.m_sLocation.empty() == true ){ if( iPage.m_sLocation.length() == 0 ){ ofsTianwangFile << "url: " << iUrl.m_sUrl; }else{ ofsTianwangFile << "url: " << iPage.m_sLocation; ofsTianwangFile << "\norigin: " << iUrl.m_sUrl; } ofsTianwangFile << "\ndate: " << strDownloadTime; if( mapCacheHostLookup.find(iUrl.m_sHost) == mapCacheHostLookup.end() ){ ofsTianwangFile <<"\nip: " << iUrl.m_sHost; }else{ ofsTianwangFile <<"\nip: " << ( *(mapCacheHostLookup.find(iUrl.m_sHost)) ).second; } ofsTianwangFile << "\nlength: " << iPage.m_nLenContent + iPage.m_nLenHeader + 1 << "\n\n" << iPage.m_sHeader << "\n"; ofsTianwangFile.write( iPage.m_sContent.c_str(), iPage.m_nLenContent); ofsTianwangFile << "\n"; ofsTianwangFile.close(); //save visited Urls ofsName = VISITED_FILE + "." + CStrFun::itos(pthread_self()); ofstream ofsVisitedUrl(ofsName.c_str(),ios::app|ios::binary); if(!ofsVisitedUrl){ cerr << "cannot open " << VISITED_FILE << "for output" << endl; exit(-1); } //if( iPage.m_sLocation.empty() == true) { if( iPage.m_sLocation.length() == 0 ){ ofsVisitedUrl << iUrl.m_sUrl << endl; }else{ ofsVisitedUrl << iPage.m_sLocation << endl; } ofsVisitedUrl.close(); iPage.GetCharset(); // parse links if( nUnzipLength != 0 ){ iPage.m_sContent = sUnzipContent; iPage.m_nLenContent = nUnzipLength; } if( iPage.m_sContent.empty() ) return; iPage.GetContentLinkInfo(); //cout << endl << "##ContentLinkInfo:" << endl; //cout << iPage.m_sContentLinkInfo << endl; if( iPage.m_sContentLinkInfo.empty() ) return; iPage.GetLinkInfo4SE(); //cout << endl << "##LinkInfo4SE:" << endl; //cout << iPage.m_sLinkInfo4SE << endl; iPage.GetLinkInfo4History(); //cout << endl << "##LinkInfo4History:" << endl; //cout << iPage.m_sLinkInfo4History << endl; if( iPage.m_sLinkInfo4SE.empty() ) return; iPage.FindRefLink4SE();/* cout << endl << "4SE total: " << iPage.m_mapLink4SE.size() << " links." << endl; map<string,string >::iterator it4SE = iPage.m_mapLink4SE.begin(); for ( ; it4SE != iPage.m_mapLink4SE.end(); ++it4SE ){ cout << (*it4SE).first << '\t' << (*it4SE).second << endl; } exit(0);*/ iPage.FindRefLink4History(); // save history link. such as <img ...> ofsName = HISTORY_LINK_FILE + "." + CStrFun::itos(pthread_self()); ofstream ofsHistoryLink(ofsName.c_str(),ios::app|ios::binary); if(!ofsHistoryLink){ cerr << "cannot open " << ofsName.c_str() << "for output" << endl; exit(-1); } vector<string>::iterator it4His = iPage.m_vecLink4History.begin(); for ( ; it4His != iPage.m_vecLink4History.end(); ++it4His ){ ofsHistoryLink << *it4His << endl; } ofsHistoryLink.close(); ofsName = f_strOutFile + "." + CStrFun::itos(pthread_self()); ofstream ofsUnvisitUrl(ofsName.c_str(),ios::app|ios::binary); if(!ofsUnvisitUrl){ cerr << "cannot open " << f_strOutFile << "for output" << endl; exit(-1); } ofsName = VISITED_LINK_FILE + "." + CStrFun::itos(pthread_self()); ofstream ofsVisitedUrlLink(ofsName.c_str(),ios::app|ios::binary); if(!ofsVisitedUrlLink){ cerr << "cannot open " << VISITED_LINK_FILE << "for output" << endl; exit(-1); } ofsVisitedUrlLink << "########" << endl; ofsVisitedUrlLink << iPage.m_sUrl << "\t\t" << iPage.m_sCharset << endl; //save url links map<string,string,less<string> >::iterator it; for ( it = iPage.m_mapLink4SE.begin(); it != iPage.m_mapLink4SE.end(); ++it ){ string strRefLink = (*it).first; string::size_type idx; idx = strRefLink.find('?'); if(idx != string::npos ){ idx = strRefLink.find('/'); if(idx != string::npos ){ strRefLink = strRefLink.substr(0,idx); } if(strRefLink.length() > 88){ continue; } } if( strRefLink.length() > 141){ continue; }/* if( FindUrl(strRefLink.c_str(), NULL) == -1 ){ ofsUnvisitUrl << strRefLink << endl; }*/ unsigned char sMd[17]; char *p; sMd[16] = '\0'; MD5((const unsigned char*)strRefLink.c_str(), strRefLink.length(), sMd); p=pt(sMd); if( setVisitedUrlMd5.count((const char*)p) == 0 ){ ofsUnvisitUrl << strRefLink << endl; } if( (*it).second == "" ){ ofsVisitedUrlLink << (*it).first << endl; }else{ if( iPage.m_sCharset == ""){ ofsVisitedUrlLink << (*it).first << "\t" << (*it).second << endl; }else{ ofsVisitedUrlLink << (*it).first << "\t" << (*it).second << "\t" << iPage.m_sCharset << endl; } } } ofsUnvisitUrl.close(); ofsVisitedUrlLink.close(); } return;}static void SigTerm(int x){ SaveMd5(); cout << "Terminated!" << endl; exit(0);}void SaveMd5(){ ofstream ofsMd5(MD5_FILE.c_str(),ios::trunc | ios::binary); if(!ofsMd5){ cerr << "cannot open " << MD5_FILE << "for output" << endl; exit(-1); } set<string>::iterator it=setVisitedUrlMd5.begin(); for( ; it!=setVisitedUrlMd5.end(); ++it ){ //ofsMd5.write( (*it).c_str(), 16); ofsMd5 << (*it).c_str(); ofsMd5 << "\n"; } cout << endl << "saved " << setVisitedUrlMd5.size() << " visited url md5 values" << endl; ofsMd5.close();}void GetVisitedUrlMd5(){ ifstream ifsMd5(MD5_FILE.c_str(),ios::binary); if(!ifsMd5){ cerr << "did not find " << MD5_FILE << " for iutput" << endl; } string strMd5; while( getline(ifsMd5,strMd5) ){ setVisitedUrlMd5.insert(strMd5); } ifsMd5.close(); cout << "got " << setVisitedUrlMd5.size() << " md5 values of visited urls" << endl;}void GetIpBlock(){ ifstream ifsIpBlock(IP_BLOCK_FILE.c_str()); if (!ifsIpBlock){ cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl; } string strIpBlock; while( getline(ifsIpBlock,strIpBlock) ){ if(strIpBlock[0]=='\0' || strIpBlock[0]=='#' || strIpBlock[0]== '\n'){ continue; } char buf1[64], buf2[64]; buf1[0]='\0'; buf2[0]='\0'; sscanf( strIpBlock.c_str(), "%s %s", buf1, buf2 ); mapIpBlock.insert(valTypeIpBlock( inet_addr(buf1), inet_addr(buf2)) ); } ifsIpBlock.close();}void GetUnreachHost(){ vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM); ifstream ifsUnreachHost(UNREACH_HOST.c_str()); if (!ifsUnreachHost){ cerr << "Cannot open " << UNREACH_HOST << " for input." << endl; } string strUnreachHost; int i=0; while( getline(ifsUnreachHost,strUnreachHost) ){ if(strUnreachHost[0]=='\0' || strUnreachHost[0]=='#' || strUnreachHost[0]== '\n'){ continue; } vsUnreachHost.push_back(strUnreachHost); i++; if(i == MAX_UNREACHABLE_HOST_NUM) break; } ifsUnreachHost.close();}static char *pt(unsigned char *md){ int i; static char buf[33]; for (i=0; i<16; i++) sprintf(&(buf[i*2]),"%02x",md[i]); //printf("len=%2d %s\n",strlen(buf),buf); return(buf);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -