📄 crawl.cpp
字号:
#include "Crawl.h"
#include "Url.h"
#include "Md5.h"
#include <list.h>
#include <hlink.h>
#include <uri.h>

extern pthread_mutex_t mymutex;
extern map<string,string> mapCacheHostLookup;
extern vector<string> vsUnreachHost;
extern char **ParseRobot( char *data, char len);

set<string> setVisitedUrlMD5;
set<string> setVisitedPageMD5;
set<string> setUnvisitedUrlMD5;
set<string> setUnreachHostMD5;

multimap<string, string, less<string> > replicas;

pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER;	// unvisited urls
pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER;
//pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER;

map<unsigned long,unsigned long> mapIpBlock;

// Read by CCrawl::fetch() as part of its loop-exit test; it is set elsewhere.
bool b_fOver;

// The pending-URL collection. fetch() takes entries from it while holding
// mutexCollection; SaveUnvisitedUrl() dumps the mapped values.
//multimap<string,string, less<string> > mmapUrls;
multimap<string,string > mmapUrls;

typedef map<unsigned long,unsigned long>::value_type valTypeIpBlock;
typedef map<string,string>::value_type mvalType;

void SaveReplicas(const char* filename);

// Context handed to onfind() through its void* argument.
struct package
{
	CCrawl *crawl;
	CPage *page;
};

vector<string> vsParsedLinks;

/***********************************************************************
 * Function name: onfind
 * Input argv:
 * 	-- elem: element name the link was found in; compared
 * 	         case-insensitively against "img"
 * 	-- attr: attribute name (not used here)
 * 	-- uri: the parsed URI; it is destroyed and freed before returning
 * 	-- arg: a struct package* carrying the CCrawl and the CPage
 * Output argv:
 * 	--
 * Return: always 1
 * Function Description: Recombines the URI (scheme, authority, path and
 * 	query) into a string and records it in vsParsedLinks. Links that
 * 	the page does not filter out are either written to the
 * 	Link4History stream (img elements) or passed to CCrawl::AddUrl.
 * Be careful: NOTE(review): vsParsedLinks is appended to without a lock;
 * 	confirm whether this callback can run on several threads at once.
***********************************************************************/
int onfind(const char *elem, const char *attr, struct uri *uri, void *arg)
{
	struct package *p = (struct package*)arg;
	char buff[URL_LEN+1];

	// if (uri_recombine(uri, buff, URL_LEN+1, C_URI) >= 0)
	if (uri_recombine(uri, buff, URL_LEN+1, C_SCHEME| C_AUTHORITY| C_PATH| C_QUERY ) >= 0)
	{
		vsParsedLinks.push_back(buff);

		if( !p->page->IsFilterLink(buff) )
		{
			// accept "a,link,frame,iframe,img,area"
			if (strcasecmp(elem, "img") == 0)
			{
				// image links are only logged, never queued for crawling
				pthread_mutex_lock(&mutexLink4HistoryFile);
				if( p->crawl->m_ofsLink4HistoryFile ){
					p->crawl->m_ofsLink4HistoryFile << buff << endl;
				}
				pthread_mutex_unlock(&mutexLink4HistoryFile);
			} else {
				p->crawl->AddUrl( buff );
			}
/*
			else if (strcasecmp(elem, "img") == 0)
			{
				pthread_mutex_lock(&mutexLink4HistoryFile);
				if( p->crawl->m_ofsLink4HistoryFile ){
					p->crawl->m_ofsLink4HistoryFile << p->page->m_sUrl << endl;;
				}
				pthread_mutex_unlock(&mutexLink4HistoryFile);
			}
*/
		}
	}

	uri_destroy(uri);
	free(uri);
	return 1;
}

/***********************************************************************
 * Function name: start
 * Input argv:
 * 	-- arg: the CCrawl handle
 * Output argv:
 * 	--
 * Return: always NULL
 * Function Description: Runs CCrawl::fetch on the given handle. The
 * 	void*(void*) signature matches a pthread start routine.
***********************************************************************/
void* start(void *arg)
{
	( (CCrawl*)arg )->fetch(arg);

	// A function returning void* must return a value; flowing off the
	// end is undefined behaviour.
	return NULL;
}

/*****************************************************************
 * Function name: SaveUnvisitedUrl
 * Input argv:
 * 	--
 * Output argv:
 * 	--
 * Return:
 * Function Description: Save the unvisited URLs. Truncates
 * 	UNVISITED_FILE and writes the mapped value of every mmapUrls
 * 	entry, one per line, followed by one empty line.
 * Version: 1.0
 * Be careful: calls exit(-1) when the file cannot be opened.
 * 	mmapUrls is read here without taking mutexCollection.
 ****************************************************************/
void SaveUnvisitedUrl()
{
	ofstream ofsUnvisitedUrl;
	ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
	if (!ofsUnvisitedUrl)
	{
		cerr << "cannot open " << UNVISITED_FILE << "for output" << endl;
		exit (-1);
	}

	multimap<string,string>::iterator it = mmapUrls.begin();
	for (; it != mmapUrls.end(); ++it)
	{
		ofsUnvisitedUrl << ((*it).second).c_str() << "\n";
	}

	ofsUnvisitedUrl << endl;
	ofsUnvisitedUrl.close();
}

/***********************************************************************
 * Function name: fetch
 * Input argv:
 * 	-- arg: the CCrawl handle
 * Output argv:
 * 	--
 * Return:
 * Function Description: Worker loop. Repeatedly takes the first entry
 * 	of mmapUrls (under mutexCollection), parses it and hands it to
 * 	DownloadFile together with this thread's Tianwang/Link4SE output
 * 	files and the socket kept from the previous download. When the
 * 	collection is empty the thread sleeps for 1000 microseconds.
 * 	The loop ends once b_fOver is set and the collection has been
 * 	found empty on at least 200 polls in a row.
 * Be careful: the socket is reused only while consecutive URLs share
 * 	the same host; it is closed on a host change and on exit.
***********************************************************************/
void CCrawl::fetch(void *arg)
{
	string strUrl;
	int nGSock = -1;
	string strGHost = "";

	// create a Tianwang file for output the raw page data
	string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self());
	CTianwangFile tianwangFile(ofsName);

	// create a Link4SE file for output the raw link data
	ofsName = DATA_LINK4SE_FILE + "." + CStrFun::itos(pthread_self());
	CLink4SEFile link4SEFile(ofsName);

	int iSleepCnt = 0;

	for(;;){
		bool bGotUrl = false;

		pthread_mutex_lock(&mutexCollection);
		int cnt = mmapUrls.size();
		if(cnt > 0){
			cout << "collection has: " << cnt << " unvisited urls" << endl;

			// get an URL and remove it from the collection
			multimap<string,string>::iterator it = mmapUrls.begin();
			strUrl = (*it).second;
			mmapUrls.erase( it );
			bGotUrl = true;
		}
		pthread_mutex_unlock(&mutexCollection);

		if( bGotUrl ){
			// Work was found, so the idle streak starts over.
			iSleepCnt = 0;

			// parse URL
			CUrl iUrl;
			if( iUrl.ParseUrlEx(strUrl) == false ){
				cout << "ParseUrlEx error in fetch(): " << strUrl << endl;
				continue;
			}

			// A socket kept from the previous download is only valid for
			// the same host; drop it when the host changes.
			if( strGHost != iUrl.m_sHost ){
				if( nGSock != -1 ){
					close( nGSock );
				}
				nGSock = -1;
				strGHost = iUrl.m_sHost;
			}

			(( CCrawl* )arg)->DownloadFile(&tianwangFile,&link4SEFile,iUrl,nGSock);
		} else {
			usleep(1000);
			iSleepCnt++;
		}

		// ">=" rather than "==": the counter may already be past 200 when
		// b_fOver becomes true, and an equality test would then never fire.
		if( b_fOver == true && iSleepCnt >= 200 )
			break;

		/*
		if( b_fOver == true ){
			break;
		} else if( cnt == 100 ) {
			cout << "w.";
			cnt = 0;
		}
		*/
	}

	// Release the socket still held from the last download, if any.
	if( nGSock != -1 ){
		close( nGSock );
		nGSock = -1;
	}

	tianwangFile.Close();
	link4SEFile.Close();
}

/***********************************************************************
 * Function name: DownloadFile
 * Input argv:
 * 	-- pTianwang: the CCrawl handle
 * 	-- pLink4SE: the CCrawl handle
 * 	-- iUrl: the URL for crawling
 * 	-- nGSock: the previous global socket
 * Output argv:
 * 	--
 * Return:
***********************************************************************/
void CCrawl::DownloadFile(CTianwangFile *pTianwangFile, CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock)
{
	char *downloaded_file = NULL, *fileHead = NULL, *location = NULL;
	int file_length = 0;
	string strUrlLocation = "";
	int nSock = nGSock;
	cout << "1. 
pid=" << pthread_self() << " sock = " << nGSock << endl; CHttp http; file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead, &location, &nSock); #ifdef DEBUG // just download cout << "######file length: ######" << file_length << endl; cout << "######head: ######" << fileHead << endl; #endif int nCount = 0; while( file_length == -300 ){ // moved to an another place if( strlen(location) > URL_LEN-1 || nCount == 3 || strlen(location)==0 ){ if( location ) { //pthread_mutex_lock(&mutexMemory); free( location ); location = NULL; //pthread_mutex_unlock(&mutexMemory); } file_length = -1; break; } strUrlLocation = location; if(location) { //pthread_mutex_lock(&mutexMemory); free(location); location = NULL; //pthread_mutex_unlock(&mutexMemory); } string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http"); if( idx1 != 0 ){ char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length()-1); char c2 = strUrlLocation.at(0); if( c2 == '/' ){ strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation; }else if( c1!='/' && c2!='/'){ string::size_type idx; idx = iUrl.m_sUrl.rfind('/'); if( idx != string::npos ){ if( idx > 6 ){ // > strlen("http://..") strUrlLocation = iUrl.m_sUrl.substr(0, idx+1) + strUrlLocation; } else { strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation; } } else { file_length = -1; break; } } else { if( c1=='/' ){ strUrlLocation = iUrl.m_sUrl + strUrlLocation; } else { strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation; } } } CPage iPage; if( iPage.IsFilterLink(strUrlLocation) ){ file_length = -1; break; } cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl; file_length = http.Fetch( strUrlLocation, &downloaded_file, &fileHead, &location, &nSock); nCount++; } nGSock = nSock; if(file_length == -1){ // unreachable, skipped. 
cout << "!-: " << iUrl.m_sUrl << endl; //pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "-unreach host: " << iUrl.m_sHost << endl;; return; } if(file_length == -2){ // out of ip block . //pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); // save unreach host SaveUnreachHost(iUrl.m_sHost); cout << "-out of block host: " << iUrl.m_sHost << endl;; return; } if(file_length == -3) { // invalid host or ip //pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "-invalid host: " << iUrl.m_sHost << endl; return; } if(file_length == -4) { // MIME is image/xxx //pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); if( m_ofsLink4HistoryFile ){ pthread_mutex_lock(&mutexLink4HistoryFile); m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;; pthread_mutex_unlock(&mutexLink4HistoryFile); } cout << "-imgage host: " << iUrl.m_sHost << endl; return; } /* still experiment char **dir; dir = ParseRobot( downloaded_file, file_length); for( int i = 0; dir[i] != NULL ; i++){ cout << dir[i] << endl; free( dir[i] ); } exit(1); */ // so small, maybe some unuseful info, skipped //if(file_length < 40){ // for ImgSE, /* if(file_length < 256){ // for SE //pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); cout << "#"; return; }*/ // deal with normal page if (!fileHead || !downloaded_file) { 
//pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); close(nGSock); nGSock = -1; cout << "-size0 host: " << iUrl.m_sHost << endl; return; } CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file, file_length); //pthread_mutex_lock(&mutexMemory); if (fileHead) { free(fileHead); fileHead=NULL; } if (downloaded_file) { free(downloaded_file); downloaded_file=NULL; } //pthread_mutex_unlock(&mutexMemory); iPage.ParseHeaderInfo(iPage.m_sHeader); if( iPage.m_bConnectionState == false ){ close(nGSock); nGSock = -1; } // when crawling images for ImgSE, remember to comment the paragraph // when crawling plain text for SE, remember to open the paragraph // paragraph begin // iPage.m_sContentType != "text/css" && if( iPage.m_sContentType != "text/html" && iPage.m_sContentType != "text/plain" && iPage.m_sContentType != "text/xml" && iPage.m_sContentType != "application/msword" && iPage.m_sContentType != "application/pdf" && iPage.m_sContentType != "text/rtf" && iPage.m_sContentType != "application/postscript" && iPage.m_sContentType != "application/vnd.ms-execl" && iPage.m_sContentType != "application/vnd.ms-powerpoint" ){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -