⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 thread.h

📁 网页抓取程序
💻 H
字号:
/* OpenWebSpider * *  Authors:     Stefano Alimonti AND Stefano Fantin *  Version:     0.7 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it * * * This file is part of OpenWebSpider * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * */#ifndef __THREAD#define __THREAD#ifdef WIN32  unsigned thrdML[MAXTHREAD];  HANDLE thrdhML[MAXTHREAD];  unsigned thrdServer;  HANDLE thrdhServer;#else  pthread_t thrdML[MAXTHREAD];  pthread_t thrdServer;#endif#ifdef WIN32  unsigned __stdcall #else  void* #endifmainThread(LPVOID pthrdNum){struct sHost currentHst;char         packet[MAXPACKETSIZE];char         html[MAXPACKETSIZE];DWORD        tStart=0;SOCKET       sock;int          snd;int          maxbytes2recive;int          recvdbytes;int          condition=1;int          thrdNum=(int)pthrdNum;char         sStdOutTmp[10000];char         sStdOut[10000];SOCKADDR_IN  mSaddr;NODE*        nCur=NULL;char         httpStatus[MAXHTTPSTATUSSIZE];int          HttpRequestRet;char         sLocation[MAXURLSIZE];	while(condition)	{				UnBlockAll();				if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1)		{			UnBlockAll();			ExitThread(0);		}					if(iStop)		{			Sleep(500);			continue;		}		thrdBlock(BLOCKTHRDHST);        if( checkLimits() == 1            || iDoNextHost==1)	//switch to the next host		{            /* set the status of the pages to be indexed as indexed */			lstSetNodeStatus(lstFirst,0,1);			/*Un-block all mutexes owned by this thread (only BLOCKTHRDHST) and...*/			UnBlockAll();			/* exit */			ExitThread(0);		}			if((nCur=lstGetNodeByVal(lstFirst,0))!=NULL)		{			if(nCur==NULL || nCur->field==NULL)			{				thrdUnBlock(BLOCKTHRDHST);				continue;			}						/* robots.txt checked?!? */			if(bRobotsOK==0)			{				/* is this page robots.txt */				if(!(stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0))				{					/* if not: please wait robots.txt */					thrdUnBlock(BLOCKTHRDHST);					Sleep(1000);					continue;				}			}			if(CheckRobotExclusion(((struct sHost*)nCur->field)->Page)==0)			{				((struct sHost*)nCur->field)->viewed = 1;				thrdUnBlock(BLOCKTHRDHST);				bRobotsOK=1;				continue; 			}			((struct sHost*)nCur->field)->viewed = 2;			memcpy(&currentHst,((struct sHost*)nCur->field),sizeof(struct sHost));		}		else		{			thrdUnBlock(BLOCKTHRDHST);			Sleep(1000);			continue;		}		//TESTING (before: after Unblockall() )		thrdStatus[thrdNum]=GetTickCount();		if(currentHst.type == 3)  //current url is not a html page or a plain text file		{			if(nCur==NULL || nCur->field==NULL)				continue;			((struct sHost*)nCur->field)->viewed = 1;			thrdUnBlock(BLOCKTHRDHST);			bRobotsOK=1;			continue;		}		//(1 - Crawl Delay) Lock the mutex		if(iRobCrawlDelay>0 || iCrawlDelay>0)		{			thrdBlock(BLOCKEXCRAWL);			if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1)			{				UnBlockAll();				ExitThread(0);			}						Sleep(  (iRobCrawlDelay>0) ? iRobCrawlDelay*1000 : iCrawlDelay );					}		thrdUnBlock(BLOCKTHRDHST);		tStart= GetTickCount();		if(!LoadSocket(&sock,&currentHst,&mSaddr))		{			closesocket(sock);			fprintf(stderr,"\r\n(%i) Socket(%s) error\r\n\r\n",thrdNum, currentHst.Host);			((struct sHost*)nCur->field)->viewed = 1;			bRobotsOK=1;						continue;		}		if (connect(sock, (LPSOCKADDR) &mSaddr, sizeof(mSaddr)) == SOCKET_ERROR)		{			fprintf(stderr,"\r\n(%i) Connect(%s) error\r\n\r\n",thrdNum,currentHst.Host);			closesocket(sock);			((struct sHost*)nCur->field)->viewed = 1;			bRobotsOK=1;			continue;		}		memset(packet,0,MAXPACKETSIZE);		memset(html,0,MAXPACKETSIZE);		sStdOut[0]=0;		ForgeHTTPPacket(currentHst,packet);		snd=SEND(sock,packet);		if(currentHst.port!=PORT)				sprintf(sStdOut,"(%i) Current -> http://%s:%i%s (%s)",thrdNum,currentHst.Host,currentHst.port, currentHst.Page,currentHst.Description);		else				sprintf(sStdOut,"(%i) Current -> http://%s%s (%s)",thrdNum,currentHst.Host,currentHst.Page,currentHst.Description);		if(snd<10)		{			closesocket(sock);			strcat(sStdOut,"\t\t[SEND ERROR]\n\n");			printf("%s",sStdOut);			((struct sHost*)nCur->field)->viewed = 1;			bRobotsOK=1;			continue;		}		maxbytes2recive=sizeof(packet);	//set the bytes to recive		recvdbytes=RecvPackets(&sock,packet,maxbytes2recive);		closesocket(sock);		//(2 - Crawl Delay) File recived unlock the mutex		if(iRobCrawlDelay>0 || iCrawlDelay>0)		{			thrdUnBlock(BLOCKEXCRAWL);		}		if(recvdbytes<=10)		{			((struct sHost*)nCur->field)->viewed = 1;			strcat(sStdOut,"\t\t[RECV ERROR]\n\n");			printf("%s",sStdOut);			bRobotsOK=1;			continue;		}		bytesDownloaded+=recvdbytes;		if((HttpRequestRet=ParseHTTPRequest(packet,html,recvdbytes,httpStatus,sLocation,currentHst.level))!=0)		{			if(strnicmp(httpStatus,"HTTP/1.1 302",12)==0 || strnicmp(httpStatus,"HTTP/1.0 302",12)==0)				sprintf(sStdOutTmp,"\n - HTTP header: %s\n - Location: %s\n - Downloaded %i Kb (%i bytes) in %i ms\n" ,httpStatus,sLocation,recvdbytes/1024,recvdbytes,(int)(GetTickCount()-tStart));			else				sprintf(sStdOutTmp,"\n - HTTP header: %s\n - Downloaded %i Kb (%i bytes) in %i ms\n" ,httpStatus,recvdbytes/1024,recvdbytes,(int)(GetTickCount()-tStart));			strcat(sStdOut,sStdOutTmp);			if(strnicmp(httpStatus,"HTTP/1.1 4",10)==0 || strnicmp(httpStatus,"HTTP/1.0 4",10)==0 || strnicmp(httpStatus,"HTTP/1.1 5",10)==0 || strnicmp(httpStatus,"HTTP/1.0 5",10)==0)			{				nErrorPages++;			}		}		else		{			((struct sHost*)nCur->field)->viewed = 1;			strcat(sStdOut,"\r\n");			printf("%s",sStdOut);			bRobotsOK=1;			continue;		}				if(bRobotsOK==0 && stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0)		{			printf("%s",sStdOut);			if(HttpRequestRet==2)				ParseRobotsTxt(html,currentHst);			else			{				printf(" - Nothing to do with robots.txt\n\n");			}			((struct sHost*)nCur->field)->viewed = 1;			bRobotsOK=1;			continue;		}		else			bRobotsOK=1;				        //Index only HTML(1),plain text files(2)and custom handled files(4)		if(currentHst.type <= 2 || currentHst.type == 4)		{			/* Check the number of pages indexed or if we are switching to the next host */            if( checkLimits() == 1                || iDoNextHost==1)	//switch to the next host			{				((struct sHost*)nCur->field)->viewed = 1;				continue;			}						nPagesViewed++;			if(currentHst.type == 1)					//Looks for urls only in html page			{				tStart=GetTickCount();				sprintf(sStdOutTmp," - Checked in %i ms (%i URL found)\n",(int)(GetTickCount()-tStart),LookForUrls(html,currentHst));				strcat(sStdOut,sStdOutTmp);			}			tStart=GetTickCount();			if(HttpRequestRet==2)	//Index only 200 OK			{				if(bUseRegularExpressionA==1)	//are we using a regular expression filter?				{	//yes					if(regexec(&regexPageFilter, currentHst.Page, 0, 0, 0) == 0)					{	//match...index						tStart=GetTickCount();												if(IndexPage(html,currentHst, recvdbytes)==1)							sprintf(sStdOutTmp," - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart));						else							sprintf(sStdOutTmp,"\n");						strcat(sStdOut,sStdOutTmp);					}					else					{	//discard						sprintf(sStdOutTmp,"\n");						strcat(sStdOut,sStdOutTmp);					}								}				else				{	//index					tStart=GetTickCount();					if(IndexPage(html,currentHst, recvdbytes)==1)						sprintf(sStdOutTmp," - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart));					else						sprintf(sStdOutTmp,"\n");					strcat(sStdOut,sStdOutTmp);				}			}			printf("%s",sStdOut);		}		((struct sHost*)nCur->field)->viewed = 1;	}/*while(condition)*/return 0;}void KillThreads(){int i;	printf("Killing Threads...\r\n\r\n");	for(i=0;i<nThread;i++)	{#ifdef WIN32		WaitForSingleObject(thrdhML[i],50000);		TerminateThread(thrdhML[i],0);		CloseHandle(thrdhML[i]);#else		if(thrdML[i]!=0)			pthread_join(thrdML[i],NULL);#endif	}	init_mutex();	printf("Threads killed\r\n\r\n");	//set all nodes with status==2(indexing) with status=1(indexed)	lstSetNodeStatus(lstFirst, 2, 1);	/* if the downloading of the robots.txt timeouts the line above set the file as indexed 	 * but bRobotsOK is set as 0 (un-parsed) and blocks the spider 	 * so we have to manually set it a 1 (parsed)	 */	bRobotsOK = 1;		/* debug */	//lstDebugNodes(lstFirst,0);	//lstDebugNodes(lstFirst,2);	//lstDebugNodes(lstFirst,1);	bKillThread=0;}void CreateThreads(){int i;int errorCode;	init_mutex();    printf("\r\n");    for(i=0;i<nThread;i++)    {            printf("\rCreating thread %i of %i     ",i+1,nThread);            fflush(stdout);#ifdef WIN32            thrdhML[i] = (HANDLE)_beginthreadex(NULL,0,mainThread,(void*)i,0,&thrdML[i]);#else            if( (errorCode=pthread_create(&thrdML[i], NULL, mainThread, (void*)i)) != 0 )            {                    printf("\r\nThread error (%i):\r\n",errorCode);                    perror(" -    pthread_create() ");                    exit(0);            }#endif            thrdStatus[i]=GetTickCount();    }    printf("\r\n");return;}void CreateServerThread(int port){#ifdef WIN32	thrdhServer = (HANDLE)_beginthreadex(NULL,0,StartOWSServer,(void*)port,0,&thrdServer);#elseint errorCode;    if( (errorCode=pthread_create(&thrdServer, NULL, StartOWSServer, (void*)port)) != 0 )    {            printf("\r\nThread error (%i):\r\n",errorCode);            perror(" -    pthread_create() ");            exit(0);    }#endif}void CreateHandleConnectionThread(struct sHandleConnection* struct_connection){#ifdef WIN32	_beginthreadex(NULL,0,HandleConnection,(void*)struct_connection,0,NULL);#elseint errorCode;pthread_t ptTmp;    if( (errorCode=pthread_create(&ptTmp, NULL, HandleConnection, (void*)struct_connection)) != 0 )    {            printf("\r\nThread error (%i):\r\n",errorCode);            perror(" -    pthread_create() ");            exit(0);    }#endif}void CheckThreads(){DWORD curTickCount;int i;int avgSec;	if(iDoNextHost==0)	{		avgSec=0;		curTickCount=GetTickCount();		for(i=0;i<nThread;i++)	//Check the status of the threads		{			if(curTickCount>thrdStatus[i])				avgSec+=(curTickCount-thrdStatus[i]);			else				avgSec++;		}//for(i=0;i<nThread;i++)		avgSec/=nThread;		if(avgSec>AVGTHREADDELAY)		{			ERROR_LOG("Notice: Killing thrads avgSec>100000");			bKillThread=1;		}	}//if(iDoNextHost==0)return;}#endif/*EOF*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -