📄 thread.h
字号:
/* OpenWebSpider * * Authors: Stefano Alimonti AND Stefano Fantin * Version: 0.7 * E-Mails: shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it * * * This file is part of OpenWebSpider * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */#ifndef __THREAD#define __THREAD#ifdef WIN32 unsigned thrdML[MAXTHREAD]; HANDLE thrdhML[MAXTHREAD]; unsigned thrdServer; HANDLE thrdhServer;#else pthread_t thrdML[MAXTHREAD]; pthread_t thrdServer;#endif#ifdef WIN32 unsigned __stdcall #else void* #endifmainThread(LPVOID pthrdNum){struct sHost currentHst;char packet[MAXPACKETSIZE];char html[MAXPACKETSIZE];DWORD tStart=0;SOCKET sock;int snd;int maxbytes2recive;int recvdbytes;int condition=1;int thrdNum=(int)pthrdNum;char sStdOutTmp[10000];char sStdOut[10000];SOCKADDR_IN mSaddr;NODE* nCur=NULL;char httpStatus[MAXHTTPSTATUSSIZE];int HttpRequestRet;char sLocation[MAXURLSIZE]; while(condition) { UnBlockAll(); if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1) { UnBlockAll(); ExitThread(0); } if(iStop) { Sleep(500); continue; } thrdBlock(BLOCKTHRDHST); if( checkLimits() == 1 || iDoNextHost==1) //switch to the next host { /* set the status of the pages to be indexed as indexed */ lstSetNodeStatus(lstFirst,0,1); /*Un-block all mutexes owned by this thread (only BLOCKTHRDHST) and...*/ UnBlockAll(); /* exit */ ExitThread(0); } if((nCur=lstGetNodeByVal(lstFirst,0))!=NULL) { if(nCur==NULL || nCur->field==NULL) { thrdUnBlock(BLOCKTHRDHST); continue; } /* robots.txt checked?!? */ if(bRobotsOK==0) { /* is this page robots.txt */ if(!(stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0)) { /* if not: please wait robots.txt */ thrdUnBlock(BLOCKTHRDHST); Sleep(1000); continue; } } if(CheckRobotExclusion(((struct sHost*)nCur->field)->Page)==0) { ((struct sHost*)nCur->field)->viewed = 1; thrdUnBlock(BLOCKTHRDHST); bRobotsOK=1; continue; } ((struct sHost*)nCur->field)->viewed = 2; memcpy(¤tHst,((struct sHost*)nCur->field),sizeof(struct sHost)); } else { thrdUnBlock(BLOCKTHRDHST); Sleep(1000); continue; } //TESTING (before: after Unblockall() ) thrdStatus[thrdNum]=GetTickCount(); if(currentHst.type == 3) //current url is not a html page or a plain text file { if(nCur==NULL || nCur->field==NULL) continue; ((struct sHost*)nCur->field)->viewed = 1; thrdUnBlock(BLOCKTHRDHST); bRobotsOK=1; continue; } //(1 - Crawl Delay) Lock the mutex if(iRobCrawlDelay>0 || iCrawlDelay>0) { thrdBlock(BLOCKEXCRAWL); if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1) { UnBlockAll(); ExitThread(0); } Sleep( (iRobCrawlDelay>0) ? iRobCrawlDelay*1000 : iCrawlDelay ); } thrdUnBlock(BLOCKTHRDHST); tStart= GetTickCount(); if(!LoadSocket(&sock,¤tHst,&mSaddr)) { closesocket(sock); fprintf(stderr,"\r\n(%i) Socket(%s) error\r\n\r\n",thrdNum, currentHst.Host); ((struct sHost*)nCur->field)->viewed = 1; bRobotsOK=1; continue; } if (connect(sock, (LPSOCKADDR) &mSaddr, sizeof(mSaddr)) == SOCKET_ERROR) { fprintf(stderr,"\r\n(%i) Connect(%s) error\r\n\r\n",thrdNum,currentHst.Host); closesocket(sock); ((struct sHost*)nCur->field)->viewed = 1; bRobotsOK=1; continue; } memset(packet,0,MAXPACKETSIZE); memset(html,0,MAXPACKETSIZE); sStdOut[0]=0; ForgeHTTPPacket(currentHst,packet); snd=SEND(sock,packet); if(currentHst.port!=PORT) sprintf(sStdOut,"(%i) Current -> http://%s:%i%s (%s)",thrdNum,currentHst.Host,currentHst.port, currentHst.Page,currentHst.Description); else sprintf(sStdOut,"(%i) Current -> http://%s%s (%s)",thrdNum,currentHst.Host,currentHst.Page,currentHst.Description); if(snd<10) { closesocket(sock); strcat(sStdOut,"\t\t[SEND ERROR]\n\n"); printf("%s",sStdOut); ((struct sHost*)nCur->field)->viewed = 1; bRobotsOK=1; continue; } maxbytes2recive=sizeof(packet); //set the bytes to recive recvdbytes=RecvPackets(&sock,packet,maxbytes2recive); closesocket(sock); //(2 - Crawl Delay) File recived unlock the mutex if(iRobCrawlDelay>0 || iCrawlDelay>0) { thrdUnBlock(BLOCKEXCRAWL); } if(recvdbytes<=10) { ((struct sHost*)nCur->field)->viewed = 1; strcat(sStdOut,"\t\t[RECV ERROR]\n\n"); printf("%s",sStdOut); bRobotsOK=1; continue; } bytesDownloaded+=recvdbytes; if((HttpRequestRet=ParseHTTPRequest(packet,html,recvdbytes,httpStatus,sLocation,currentHst.level))!=0) { if(strnicmp(httpStatus,"HTTP/1.1 302",12)==0 || strnicmp(httpStatus,"HTTP/1.0 302",12)==0) sprintf(sStdOutTmp,"\n - HTTP header: %s\n - Location: %s\n - Downloaded %i Kb (%i bytes) in %i ms\n" ,httpStatus,sLocation,recvdbytes/1024,recvdbytes,(int)(GetTickCount()-tStart)); else sprintf(sStdOutTmp,"\n - HTTP header: %s\n - Downloaded %i Kb (%i bytes) in %i ms\n" ,httpStatus,recvdbytes/1024,recvdbytes,(int)(GetTickCount()-tStart)); strcat(sStdOut,sStdOutTmp); if(strnicmp(httpStatus,"HTTP/1.1 4",10)==0 || strnicmp(httpStatus,"HTTP/1.0 4",10)==0 || strnicmp(httpStatus,"HTTP/1.1 5",10)==0 || strnicmp(httpStatus,"HTTP/1.0 5",10)==0) { nErrorPages++; } } else { ((struct sHost*)nCur->field)->viewed = 1; strcat(sStdOut,"\r\n"); printf("%s",sStdOut); bRobotsOK=1; continue; } if(bRobotsOK==0 && stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0) { printf("%s",sStdOut); if(HttpRequestRet==2) ParseRobotsTxt(html,currentHst); else { printf(" - Nothing to do with robots.txt\n\n"); } ((struct sHost*)nCur->field)->viewed = 1; bRobotsOK=1; continue; } else bRobotsOK=1; //Index only HTML(1),plain text files(2)and custom handled files(4) if(currentHst.type <= 2 || currentHst.type == 4) { /* Check the number of pages indexed or if we are switching to the next host */ if( checkLimits() == 1 || iDoNextHost==1) //switch to the next host { ((struct sHost*)nCur->field)->viewed = 1; continue; } nPagesViewed++; if(currentHst.type == 1) //Looks for urls only in html page { tStart=GetTickCount(); sprintf(sStdOutTmp," - Checked in %i ms (%i URL found)\n",(int)(GetTickCount()-tStart),LookForUrls(html,currentHst)); strcat(sStdOut,sStdOutTmp); } tStart=GetTickCount(); if(HttpRequestRet==2) //Index only 200 OK { if(bUseRegularExpressionA==1) //are we using a regular expression filter? { //yes if(regexec(®exPageFilter, currentHst.Page, 0, 0, 0) == 0) { //match...index tStart=GetTickCount(); if(IndexPage(html,currentHst, recvdbytes)==1) sprintf(sStdOutTmp," - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart)); else sprintf(sStdOutTmp,"\n"); strcat(sStdOut,sStdOutTmp); } else { //discard sprintf(sStdOutTmp,"\n"); strcat(sStdOut,sStdOutTmp); } } else { //index tStart=GetTickCount(); if(IndexPage(html,currentHst, recvdbytes)==1) sprintf(sStdOutTmp," - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart)); else sprintf(sStdOutTmp,"\n"); strcat(sStdOut,sStdOutTmp); } } printf("%s",sStdOut); } ((struct sHost*)nCur->field)->viewed = 1; }/*while(condition)*/return 0;}void KillThreads(){int i; printf("Killing Threads...\r\n\r\n"); for(i=0;i<nThread;i++) {#ifdef WIN32 WaitForSingleObject(thrdhML[i],50000); TerminateThread(thrdhML[i],0); CloseHandle(thrdhML[i]);#else if(thrdML[i]!=0) pthread_join(thrdML[i],NULL);#endif } init_mutex(); printf("Threads killed\r\n\r\n"); //set all nodes with status==2(indexing) with status=1(indexed) lstSetNodeStatus(lstFirst, 2, 1); /* if the downloading of the robots.txt timeouts the line above set the file as indexed * but bRobotsOK is set as 0 (un-parsed) and blocks the spider * so we have to manually set it a 1 (parsed) */ bRobotsOK = 1; /* debug */ //lstDebugNodes(lstFirst,0); //lstDebugNodes(lstFirst,2); //lstDebugNodes(lstFirst,1); bKillThread=0;}void CreateThreads(){int i;int errorCode; init_mutex(); printf("\r\n"); for(i=0;i<nThread;i++) { printf("\rCreating thread %i of %i ",i+1,nThread); fflush(stdout);#ifdef WIN32 thrdhML[i] = (HANDLE)_beginthreadex(NULL,0,mainThread,(void*)i,0,&thrdML[i]);#else if( (errorCode=pthread_create(&thrdML[i], NULL, mainThread, (void*)i)) != 0 ) { printf("\r\nThread error (%i):\r\n",errorCode); perror(" - pthread_create() "); exit(0); }#endif thrdStatus[i]=GetTickCount(); } printf("\r\n");return;}void CreateServerThread(int port){#ifdef WIN32 thrdhServer = (HANDLE)_beginthreadex(NULL,0,StartOWSServer,(void*)port,0,&thrdServer);#elseint errorCode; if( (errorCode=pthread_create(&thrdServer, NULL, StartOWSServer, (void*)port)) != 0 ) { printf("\r\nThread error (%i):\r\n",errorCode); perror(" - pthread_create() "); exit(0); }#endif}void CreateHandleConnectionThread(struct sHandleConnection* struct_connection){#ifdef WIN32 _beginthreadex(NULL,0,HandleConnection,(void*)struct_connection,0,NULL);#elseint errorCode;pthread_t ptTmp; if( (errorCode=pthread_create(&ptTmp, NULL, HandleConnection, (void*)struct_connection)) != 0 ) { printf("\r\nThread error (%i):\r\n",errorCode); perror(" - pthread_create() "); exit(0); }#endif}void CheckThreads(){DWORD curTickCount;int i;int avgSec; if(iDoNextHost==0) { avgSec=0; curTickCount=GetTickCount(); for(i=0;i<nThread;i++) //Check the status of the threads { if(curTickCount>thrdStatus[i]) avgSec+=(curTickCount-thrdStatus[i]); else avgSec++; }//for(i=0;i<nThread;i++) avgSec/=nThread; if(avgSec>AVGTHREADDELAY) { ERROR_LOG("Notice: Killing thrads avgSec>100000"); bKillThread=1; } }//if(iDoNextHost==0)return;}#endif/*EOF*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -