📄 crawl.cpp
字号:
{ if( m_ofsLink4SEFile && iPage->m_nRefLink4SENum>0 ){ pthread_mutex_lock(&mutexLink4SEFile); m_ofsLink4SEFile << "root_url: " << iPage->m_sUrl << endl; m_ofsLink4SEFile << "charset: " << iPage->m_sCharset << endl; m_ofsLink4SEFile << "number: " << iPage->m_nRefLink4SENum << endl; m_ofsLink4SEFile << "link_anchortext: " << endl; map<string,string>::iterator it4SE = iPage->m_mapLink4SE.begin(); for( ; it4SE!= iPage->m_mapLink4SE.end(); ++it4SE ){ m_ofsLink4SEFile << (*it4SE).first << '\t' << (*it4SE).second << endl;; } pthread_mutex_unlock(&mutexLink4SEFile); }}bool CCrawl::SaveLink4SE031121(void *arg){ if( !arg || !m_ofsLink4SEFile ) return false; //pthread_mutex_lock(&mutexLink4SEFile); if( vsParsedLinks.size() == 0 ) return false; file_arg *pFile = (file_arg *)arg; CUrl *iUrl = pFile->pUrl; CPage *iPage = pFile->pPage; char strDownloadTime[128]; time_t tDate; memset(strDownloadTime, 0, 128); time(&tDate); strftime(strDownloadTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); string links; vector<string>::iterator it = vsParsedLinks.begin(); for( ; it!= vsParsedLinks.end(); ++it ){ links = links + *it + "\n"; } m_ofsLink4SEFile << "version: 1.0\n"; if( iPage->m_sLocation.size() == 0 ){ m_ofsLink4SEFile << "url: " << iPage->m_sUrl; }else{ m_ofsLink4SEFile << "url: " << iPage->m_sLocation; m_ofsLink4SEFile << "\norigin: " << iUrl->m_sUrl; } m_ofsLink4SEFile << "\ndate: " << strDownloadTime; if( mapCacheHostLookup.find(iUrl->m_sHost) == mapCacheHostLookup.end() ){ m_ofsLink4SEFile << "\nip: " << iUrl->m_sHost; } else { m_ofsLink4SEFile << "\nip: " << ( *(mapCacheHostLookup.find(iUrl->m_sHost)) ).second; } m_ofsLink4SEFile << "\noutdegree: " << vsParsedLinks.size(); m_ofsLink4SEFile << "\nlength: " << iPage->m_nLenHeader + links.size() + 1 << "\n\n" << iPage->m_sHeader << "\n"; m_ofsLink4SEFile << links; m_ofsLink4SEFile << endl; vsParsedLinks.clear(); //pthread_mutex_unlock(&mutexLink4SEFile); return true;} // not wellvoid CCrawl::SaveLink4History(CPage *iPage){ if( m_ofsLink4HistoryFile && iPage->m_nRefLink4HistoryNum>0 ){ pthread_mutex_lock(&mutexLink4HistoryFile); //m_ofsLink4HistoryFile << "root_url: " << iPage->m_sUrl << endl; //m_ofsLink4HistoryFile << "charset: " << iPage->m_sCharset << endl; //m_ofsLink4HistoryFile << "number: " << iPage->m_nRefLink4HistoryNum << endl; //m_ofsLink4HistoryFile << "link: " << endl; vector<string>::iterator it4History = iPage->m_vecLink4History.begin(); for( ; it4History!= iPage->m_vecLink4History.end(); ++it4History ){ string s = *it4History; m_ofsLink4HistoryFile << s << endl; } pthread_mutex_unlock(&mutexLink4HistoryFile); }}/************************************************************************************** * Function name: SaveVisitedUrlMd5 * Input argv: * -- md5: page md5 value * Output argv: * -- * Return: * Function Description: save the visited url Md5**************************************************************************************/void CCrawl::SaveVisitedUrlMD5(string md5){ if( m_ofsVisitedUrlMD5File ){ m_ofsVisitedUrlMD5File << md5 << endl; }}/************************************************************************************** * Function name: SaveVisitedPageMd5 * Input argv: * -- md5: page md5 value * Output argv: * -- * Return: * Function Description: save the visited url Md5**************************************************************************************/void CCrawl::SaveVisitedPageMD5(string md5){ if( m_ofsVisitedPageMD5File ){ m_ofsVisitedPageMD5File << md5 << endl; }}/************************************************************************************** * Function name: OpenFileForOutput * Input argv: * -- * Output argv: * -- * Return: * Function Description: Open the files for output**************************************************************************************/void CCrawl::OpenFilesForOutput(){ // open isam file for output m_isamFile.Open(DATA_FILE_NAME, INDEX_FILE_NAME); // open visited.url file for output m_ofsVisitedUrlFile.open(m_sOutputFileName.c_str(), ios::out|ios::app|ios::binary); if( !m_ofsVisitedUrlFile ){ cerr << "cannot open " << VISITED_FILE << " for output\n" << endl; } // open link4SE.url file for output m_ofsLink4SEFile.open(LINK4SE_FILE.c_str(), ios::out|ios::app|ios::binary); if( !m_ofsLink4SEFile ){ cerr << "cannot open " << LINK4SE_FILE << " for output\n" << endl; } // open link4History.url file for output m_ofsLink4HistoryFile.open(LINK4History_FILE.c_str(), ios::out|ios::app|ios::binary); if( !m_ofsLink4HistoryFile ){ cerr << "cannot open " << LINK4History_FILE << " for output\n" << endl; } // open unreach host file for output m_ofsUnreachHostFile.open(UNREACH_HOST_FILE.c_str(), ios::out|ios::app|ios::binary); if( !m_ofsUnreachHostFile ){ cerr << "cannot open " << UNREACH_HOST_FILE << " for output\n" << endl; } // open visited url md5 file for output m_ofsVisitedUrlMD5File.open(URL_MD5_FILE.c_str(), ios::out|ios::app|ios::binary); if( !m_ofsVisitedUrlMD5File ){ cerr << "cannot open " << URL_MD5_FILE << " for output\n" << endl; } // open visited page md5 file for output m_ofsVisitedPageMD5File.open(PAGE_MD5_FILE.c_str(), ios::out|ios::app|ios::binary); if( !m_ofsVisitedPageMD5File ){ cerr << "cannot open " << PAGE_MD5_FILE << " for output\n" << endl; }}/*************************************************************************************** * Function name: DoCrawl * Input argv: * -- * Output argv: * -- * Return: * Function Description: the main function for crawl * Be careful:***************************************************************************************/void CCrawl::DoCrawl(){ /* set the signal function */ signal(SIGTERM, SigTerm); signal(SIGKILL, SigTerm); signal(SIGINT, SigTerm); signal(SIGPIPE, SIG_IGN); signal(SIGCHLD,SIG_IGN); // output the begin time char strTime[128]; time_t tDate; memset(strTime,0,128); time(&tDate); strftime(strTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); cout << "\n\nBegin at: " << strTime << "\n\n"; // get the other info from file GetVisitedUrlMD5(); GetVisitedPageMD5(); GetIpBlock(); GetUnreachHostMD5(); // open the seed url file ifstream ifsSeed(m_sInputFileName.c_str()); if (!ifsSeed){ cerr << "Cannot open " << m_sInputFileName << " for input\n"; return; } // open the files for output OpenFilesForOutput(); // Create thread ID structures. pthread_t *tids = (pthread_t*)malloc(NUM_WORKERS * sizeof(pthread_t)); if( tids == NULL){ cerr << "malloc error" << endl; } for(unsigned int i=0; i< NUM_WORKERS; i++){ if( pthread_create( &tids[i], NULL, start, this)) cerr << "create threads error" << endl; } string strUrl; CPage iCPage; while( getline(ifsSeed, strUrl) ){ string::size_type idx; if(strUrl[0]=='\0' || strUrl[0]=='#' || strUrl[0]== '\n'){ continue; } idx = strUrl.find('\t'); if(idx != string::npos){ strUrl = strUrl.substr(0,idx); } //idx = strUrl.find("http"); idx = CStrFun::FindCase(strUrl, "http"); if(idx == string::npos){ //continue; idx = strUrl.find('/'); if( idx == string::npos ){ strUrl = "http://" + strUrl + "/"; }else{ strUrl = "http://" + strUrl; } } //if( strUrl.length() < 8 ) continue; if( iCPage.IsFilterLink(strUrl) ) continue; AddUrl(strUrl.c_str()); } // Get the unvisited URL ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str()); if( ifsUnvisitedUrl ){ while( getline(ifsUnvisitedUrl, strUrl) ){ string::size_type idx; if( strUrl[0]=='\0' || strUrl[0]=='#' || strUrl[0]== '\n'){ continue; } idx = strUrl.find('\t'); if(idx != string::npos){ strUrl = strUrl.substr(0,idx); } // filter invalid urls if( iCPage.IsFilterLink(strUrl) ) continue; AddUrl(strUrl.c_str()); } }else{ //cerr << "Cannot open " << UNVISITED_FILE << " for input\n"; } // sleep(30); b_fOver = true; cout << "finished to get all unvisited urls." << endl; // Wait for the threads. for (unsigned int i = 0; i < NUM_WORKERS; ++i){ (void)pthread_join(tids[i], NULL); } cout << "closed " << NUM_WORKERS << " threads." << endl; SaveUnvisitedUrl(); SaveReplicas("repli"); memset(strTime,0,128); time(&tDate); strftime(strTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); cout << "\n\nEnd at: " << strTime << "\n\n";}/******************************************************************* Function name: AddUrl** Input argv:** --** Output argv:** --** Return:** Function Description: Add a parsed url into the collection** Version: 1.0** Be careful: An important function!!!*****************************************************************/void CCrawl::AddUrl(const char * url){ string strUrl = url; if( strUrl.empty() || strUrl.size() < 8 ){ //invalid url cout << "!so small!" << strUrl << endl; return; } CPage iCPage; if( iCPage.NormalizeUrl(strUrl) == false ){ // cout << "!normalize fail!" << strUrl << endl; return; } CUrl iUrl; // for ImgSE, comment the paragraph // if image/xxx url, store it to link4History.url // begin if (iUrl.IsImageUrl(strUrl)) { if( m_ofsLink4HistoryFile ){ pthread_mutex_lock(&mutexLink4HistoryFile); m_ofsLink4HistoryFile << strUrl << endl;; pthread_mutex_unlock(&mutexLink4HistoryFile); } return; } // end if( iUrl.ParseUrlEx(strUrl) == false ){ cout << "ParseUrlEx error in AddUrl(): " << strUrl << endl; return; } // if it is an invalid host, discard it if( iUrl.IsValidHost( iUrl.m_sHost.c_str() ) == false ){ cout << "!invalid host: " << iUrl.m_sHost << endl; return; } // filter foreign hosts if( iUrl.IsForeignHost(iUrl.m_sHost) ){ cout << "!foreign hosts: " << iUrl.m_sHost << endl; return; } // if it is a block ip, discard it // this work is left in the CreatSocket() // because the work of getting ip is inevitable in the CreatSocket function // and this work is expensive // if it is an unreach host, discard it // here we only deal with numbers-and-dots notations unsigned long inaddr = 0; char *ip = NULL; inaddr = (unsigned long)inet_addr( iUrl.m_sHost.c_str() ); if ( inaddr != INADDR_NONE){ // host is just ip //pthread_mutex_lock(&mutexMemory); ip = new char[iUrl.m_sHost.size()+1]; //pthread_mutex_unlock(&mutexMemory); memset(ip, 0, iUrl.m_sHost.size()+1); memcpy(ip, iUrl.m_sHost.c_str(), iUrl.m_sHost.size()); if( !iUrl.IsValidIp(ip) ){ // out of ip block //pthread_mutex_lock(&mutexMemory); delete [] ip; ip = NULL; //pthread_mutex_unlock(&mutexMemory); //cout << "!unreach hosts: " << iUrl.m_sHost << endl; return; } //pthread_mutex_lock(&mutexMemory); delete [] ip; ip = NULL; //pthread_mutex_unlock(&mutexMemory); } CStrFun::Str2Lower( iUrl.m_sHost, iUrl.m_sHost.size() ); CMD5 iMD5; iMD5.GenerateMD5( (unsigned char*)iUrl.m_sHost.c_str(), iUrl.m_sHost.size() ); string strDigest = iMD5.ToString(); if( setUnreachHostMD5.find(strDigest) != setUnreachHostMD5.end() ){ //cout << "!unreach host! " << iUrl.m_sHost << endl; return; } // if crawled, discard it iMD5.GenerateMD5( (unsigned char*)strUrl.c_str(), strUrl.size() ); strDigest = iMD5.ToString(); if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) { // cout << "!visited! " << strUrl << endl; return; } // if already in the collection, discard it if( setUnvisitedUrlMD5.find(strDigest) != setUnvisitedUrlMD5.end() ){ // cout << "!in collection! " << strUrl << endl; return; } else { pthread_mutex_lock(&mutexUnvisitedUrlMD5); setUnvisitedUrlMD5.insert(strDigest); pthread_mutex_unlock(&mutexUnvisitedUrlMD5); } // add // make sure limited threads crawling on a site int cnt = 0; for(;;){ //if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){ if(1) { //pthread_mutex_lock(&mutexVisitedUrlMD5); // if crawled, discard it :) double secure //if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) { //cout << "!v! " << strUrl << endl; //pthread_mutex_unlock(&mutexVisitedUrlMD5); //return; //} else { pthread_mutex_lock(&mutexVisitedUrlMD5); mmapUrls.insert(mvalType( iUrl.m_sHost, strUrl)); pthread_mutex_unlock(&mutexVisitedUrlMD5); break; //} } else { cnt++; if( cnt % 100 == 0){ cout << "~"; //cnt = 0; } // If we have waiting so long, we may remove it if(cnt == 50000) { cout << "romove it!!!!!!!!!!!!!!!!!!!" << endl; break; } usleep(4000); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -