📄 main.cpp
字号:
#include "Tse.h"
#include "HttpTse.h"
#include "StrFun.h"
#include "Url.h"
#include "Page.h"
#include <openssl/md5.h>
#include <zlib.h>

using namespace std;

extern int IsamFile(const char* url, char* downloaded_file, int len);
//extern int FindUrl(const char* url, char **content);
extern int FindKey(const char* key);
extern int ScanPageContent(char*, int);
extern void DoSearch();
extern void DoHelp();
extern map<string,string> mapCacheHostLookup;

bool IsFilterLink(string plink);
int NormallizeUrl(string& strUrl);
void DownloadFile(CUrl,int&);
void *fetch(void *arg);
static void SigTerm(int x);
void GetVisitedUrlMd5();
void GetIpBlock();
void GetUnreachHost();
void SaveMd5();
static char *pt(unsigned char *md);

vector<string> vsUnreachHost;
bool f_bUnreachHost;
string f_strOutFile;
pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;
multimap<string,string> mmapUrls;
typedef map<string,string>::value_type mvalType;
map<unsigned long,unsigned long> mapIpBlock;
typedef map<unsigned long,unsigned long>::value_type valTypeIpBlock;
set<string> setVisitedUrlMd5;
bool b_fOver = false;

/**
 * Program entry point.
 *
 * Command lines handled here:
 *   <prog> -s                        calls DoSearch()
 *   <prog> -c <seed file> <out file> crawl mode: argv[2] is opened as the seed
 *                                    list, argv[3] is stored in f_strOutFile
 *   anything else                    calls DoHelp()
 *
 * In crawl mode the function starts up to NUM_WORKERS threads running fetch(),
 * reads the seed file line by line, filters each URL (IP range table, visited
 * MD5 set, unreachable host list) and queues the survivors in mmapUrls, keyed
 * by host, never holding more than NUM_WORKERS_ON_A_SITE queued entries per
 * host.  After the seed file is exhausted it waits 30 seconds, raises
 * b_fOver, joins the workers and calls SaveMd5().
 *
 * @return -1 on a usage or start-up error (seed file cannot be opened, thread
 *         table cannot be allocated, no worker thread could be started);
 *         otherwise the process ends through exit(0).
 */
int main(int argc, char* argv[])
{
    if (argc < 2) {
        DoHelp();
        // Reached only if DoHelp() returns: argv[1] is NULL, so stop here.
        return -1;
    }

    if (!strncmp(argv[1], "-s", 2) && argc == 2)
        DoSearch();

    if (!strncmp(argv[1], "-c", 2) && argc == 4) {
        f_strOutFile = argv[3];
    } else {
        DoHelp();
        // Reached only if DoHelp() returns: argv[2]/argv[3] are not usable.
        return -1;
    }

    /* set the signal function */
    // SIGKILL cannot be caught, so no handler is registered for it.
    signal(SIGTERM, SigTerm);
    signal(SIGINT, SigTerm);
    signal(SIGPIPE, SIG_IGN);
    signal(SIGCHLD, SIG_IGN);

    /* First
     * --------> print the start time and load the persisted state
     */
    char strTime[128];
    time_t tDate;

    memset(strTime, 0, 128);
    time(&tDate);
    strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
    cout << "\n\nBegin at: " << strTime << "\n\n";

    GetVisitedUrlMd5();
    GetIpBlock();
    GetUnreachHost();

    /* Second
     * --------> get seeds and download files
     */
    ifstream ifsSeed(argv[2]);
    if (!ifsSeed) {
        cerr << "Cannot open " << argv[2] << " for input\n";
        return -1;
    }

    // Create thread ID structures.
    pthread_t *tids = (pthread_t*)malloc(NUM_WORKERS * sizeof(pthread_t));
    if (tids == NULL) {
        cerr << "malloc error" << endl;
        return -1;
    }

    // Successfully created threads are packed at the front of tids so that
    // only valid handles are joined later.
    unsigned int nThreads = 0;
    for (unsigned int i = 0; i < NUM_WORKERS; i++) {
        if (pthread_create(&tids[nThreads], NULL, fetch, NULL))
            cerr << "create threads error" << endl;
        else
            nThreads++;
    }
    if (nThreads == 0) {
        // Nobody would ever drain mmapUrls; the queueing loop below would spin.
        free(tids);
        return -1;
    }

    const string::size_type nMaxUrlLen = URL_LEN - 1;
    string strUrl;
    int nUrlNum = 0;

    while (getline(ifsSeed, strUrl)) {
        // Skip blank lines and '#' comment lines.
        if (strUrl.empty() || strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n')
            continue;

        // Only the text before the first tab is the URL.
        string::size_type idx = strUrl.find('\t');
        if (idx != string::npos)
            strUrl = strUrl.substr(0, idx);

        if (strUrl.length() < 8)
            continue;

        // Limit the URL to what fits a URL_LEN-byte C buffer (terminator
        // included) and cut it at an embedded NUL, without reading past the
        // end of the string or producing an unterminated buffer.
        if (strUrl.length() > nMaxUrlLen)
            strUrl.resize(nMaxUrlLen);
        string::size_type nulPos = strUrl.find('\0');
        if (nulPos != string::npos)
            strUrl.erase(nulPos);

        CUrl iUrl;
        iUrl.ParseUrl(strUrl);

        /*##########*/
        // IP range filter.  It applies only to hosts already present in the
        // lookup cache: the URL is kept when (address & ~value) == key for at
        // least one mapIpBlock entry, and skipped otherwise.
        map<string,string>::iterator it = mapCacheHostLookup.find(iUrl.m_sHost);
        if (it != mapCacheHostLookup.end()) {   // find in host lookup cache
            const string strHostIp = (*it).second;
            unsigned long inaddr = (unsigned long)inet_addr(strHostIp.c_str());

            if (mapIpBlock.size() > 0) {
                bool bInRange = false;
                map<unsigned long,unsigned long>::iterator pos;
                for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) {
                    if ((inaddr & ~((*pos).second)) == (*pos).first) {
                        bInRange = true;
                        break;
                    }
                }
                if (!bInRange)
                    continue;
            }
/*
            //save blocked Urls
            string ofsName = IP_BLOCKED_URL + "." + CStrFun::itos(pthread_self());
            ofstream ofsBlockUrl(ofsName.c_str(),ios::app|ios::binary);
            if(!ofsBlockUrl){
                cerr << "cannot open " << ofsName.c_str() << "for output" << endl;
                exit(-1);
            }
            ofsBlockUrl << host << endl;
            ofsBlockUrl.close();
*/
        }
        /*###########*/

        //if crawled, discard it.
        unsigned char sMd[17];
        sMd[16] = '\0';
        MD5((const unsigned char*)strUrl.c_str(), strUrl.length(), sMd);
        char *p = pt(sMd);
        //if( setVisitedUrlMd5.count((const char*)sMd) == 1 ){
        if (setVisitedUrlMd5.count((const char*)p) > 0) {
            cout << "!";    //1.crawled already
            continue;
        }
/*
        pthread_mutex_lock(&mymutex);
        setVisitedUrlMd5.insert((const char*)p);
        pthread_mutex_unlock(&mymutex);
*/

        vector<string>::iterator itResult =
            find(vsUnreachHost.begin(), vsUnreachHost.end(), iUrl.m_sHost);
        if (itResult != vsUnreachHost.end()) {
            cout << iUrl.m_sHost << " is an unreachable site." << endl;
            continue;
        }

        // Progress output every 10 accepted URLs.  The queue size is read
        // under mymutex because the workers erase from mmapUrls concurrently.
        nUrlNum++;
        if (nUrlNum == 10) {
            pthread_mutex_lock(&mymutex);
            const multimap<string,string>::size_type nQueued = mmapUrls.size();
            pthread_mutex_unlock(&mymutex);
            cout << "cache(" << nQueued << ")";
            nUrlNum = 0;
        }

        // if crawled, just skip
        //if( FindUrl(strUrl.c_str(),NULL) == 0 ) continue;

        // Download pages: wait until this host has fewer than
        // NUM_WORKERS_ON_A_SITE queued entries, then queue the URL.
        int cnt = 0;
        for (;;) {
            if (cnt == 100) {
                //break;
                cout << "~";
                cnt = 0;
            }

            pthread_mutex_lock(&mymutex);
            if (mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE) {
                mmapUrls.insert(mvalType(iUrl.m_sHost, strUrl));
                pthread_mutex_unlock(&mymutex);
                break;
            }
            pthread_mutex_unlock(&mymutex);
            usleep(4000);
            cnt++;
        }
    }

    // Grace period for the workers before they are told to stop.
    sleep(30);
    // NOTE(review): b_fOver is a plain bool written here and read in fetch()
    // without synchronization; confirm whether other translation units also
    // access it before changing its type or guarding it.
    b_fOver = true;
    cout << "finished to get all unvisited urls." << endl;

    // Wait for the threads.
    for (unsigned int i = 0; i < nThreads; ++i) {
        (void)pthread_join(tids[i], NULL);
    }
    free(tids);

    cout << "closed " << nThreads << " threads." << endl;

    SaveMd5();

    memset(strTime, 0, 128);
    time(&tDate);
    strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
    cout << "\n\nEnd at: " << strTime << "\n\n";

    exit(0);
}

/**
 * Worker thread body.
 *
 * Repeatedly removes the first entry of mmapUrls (under mymutex) and passes it
 * to DownloadFile().  The socket descriptor handed to DownloadFile() is kept
 * between iterations while consecutive URLs have the same host; when the host
 * changes, the previous descriptor is closed and -1 is passed instead.
 * When the queue is empty the thread sleeps 1 ms; after 100 consecutive empty
 * polls it prints "w.".  The loop ends once b_fOver is true.
 *
 * @param arg unused
 * @return always NULL
 */
void *fetch(void *arg)
{
    (void)arg;

    string strUrl;
    string strGHost = "";
    int nGSock = -1;
    int cnt = 0;

    for (;;) {
        // Take one URL from the shared queue, holding the lock only for that.
        bool bGotUrl = false;
        pthread_mutex_lock(&mymutex);
        if (!mmapUrls.empty()) {
            multimap<string,string>::iterator it = mmapUrls.begin();
            strUrl = (*it).second;
            mmapUrls.erase(it);
            bGotUrl = true;
        }
        pthread_mutex_unlock(&mymutex);

        if (bGotUrl) {
            // if crawled, just skip
/*
            if( FindUrl(strUrl.c_str(),NULL) == 0 ){
                cnt = 0;
                continue;
            }
*/
            CUrl iUrl;
            iUrl.ParseUrl(strUrl);

            if (strGHost != iUrl.m_sHost) {
                // Different host: drop the previous connection, if there is one.
                if (nGSock != -1)
                    close(nGSock);
                nGSock = -1;
                strGHost = iUrl.m_sHost;
            }

            DownloadFile(iUrl, nGSock);
            cnt = 0;
        } else {
            usleep(1000);
            cnt++;
        }

/*
        if(cnt == 100) {
            //cout << endl << "found 100 times empty, maybe no tasks" << endl;
            cout << "@100";
            break;
        }
*/
        if (b_fOver == true) {
            break;
        } else if (cnt == 100) {
            cout << "w.";
            cnt = 0;
        }
    }

    return NULL;
}

// NOTE(review): only the beginning of DownloadFile is inside the reviewed
// excerpt; the visible statements below are kept unchanged.
void DownloadFile(CUrl iUrl, int& nGSock)
{
    char *downloaded_file = NULL, *fileHead = NULL, *location = NULL;
    int file_length = 0;
    string strUrlLocation = "";
    int nSock = nGSock;

    cout << "1. " << pthread_self() << " sock = " << nGSock << endl;
    file_length = HttpFetch(iUrl.m_sUrl, &downloaded_file, &fileHead, &location, &nSock);

#ifdef DEBUG    // Just download
    cout << "#######file length: ######" << file_length << endl;
    cout << "#######head: ######" << fileHead << endl;
#endif

    int nCount = 0;
    while (file_length == -2){
        if( strlen(location) > URL_LEN-1 || nCount == 1){
            if( location ) free( location );
            location = NULL;
            file_length = -1;
            break;
        }
        strUrlLocation = location;
        if(location) free(location);
        location = NULL;
        cout << "2. " << pthread_self() << " sock = " << nGSock << endl;
        file_length = HttpFetch( strUrlLocation, &downloaded_file, &fileHead, &location, &nSock);
        nCount = 1;
    }
    nGSock = nSock;

    if(file_length == -1){ // unreachable, skipped.
        //cout << "-" << iUrl.m_sUrl << endl;
        cout << "-";
        if(fileHead) free(fileHead);
        return;
    } else if(file_length < 40){ // so small, maybe some unuseful info, skipped
        if(fileHead) free(fileHead);
        if(downloaded_file) free(downloaded_file);
        cout << "#";
        return;
    }else{
        CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file, file_length);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -