⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 misc.h

📁 网页抓取程序
💻 H
📖 第 1 页 / 共 2 页
字号:
	    //puts current hostname in the db as "Scanning host in progress.." (viewed==2)		sprintf(sqlQuery,"INSERT INTO hostlist (hostname, port, status, lastvisit) VALUES('%s', %i, 2, curdate());", currentHst.Host, currentHst.port);	else    //yes        sprintf(sqlQuery,"UPDATE hostlist SET port=%i, status = 2, lastvisit=curdate() WHERE hostname =\'%s\' limit 1", currentHst.port, currentHst.Host);				my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);	    /* 8legs mod */    if( currentHst.host_id == 0 )	    currentHst.host_id	= GetHostId( currentHst );    robots_txt=(struct sHost*)malloc(sizeof(struct sHost));		if(robots_txt==NULL)		MemoryCorruptedHandler("InitIndexing");		currentHst.viewed = 0;	memcpy(robots_txt,&currentHst,sizeof(struct sHost));	strcpy(robots_txt->Page, "/robots.txt");	robots_txt->level = 1;	robots_txt->type  = 1;    robots_txt->host_id = currentHst.host_id;	lstFirst = lstInit(*robots_txt);		FREE(robots_txt);		currentHst.level = 1;	lstAddHost(&lstFirst,currentHst);		memcpy(&IndexingHost,&currentHst,sizeof(struct sHost));		{		int (*modInitFilter)(char*, char*);		char sError[MAXDESCRIPTIONSIZE];		int ret;				if( ( modInitFilter = GetInitModFunctionHandlerByName("modFilter")) )		{				ret=modInitFilter(currentHst.Host,sError);			if(ret==0)			{				FREE(sqlQuery);				printf("\nmodInitFilter(): %s\n\n",sError);				ERROR_LOG(sError);				return 0;			}		}	}		SetConsoleTitle("...");    setHostExtras( currentHst.host_id );    printf(" --- This site Limits ---\r\n");    printf("Max pages         : %i\r\n", (EXTRA_LIMITS.nMaxPagesPerSite==0) ? CRAWLER_LIMITS.nMaxPagesPerSite : EXTRA_LIMITS.nMaxPagesPerSite);    printf("Max depth level   : %i\r\n", (EXTRA_LIMITS.nMaxDepthLevel==0) ? CRAWLER_LIMITS.nMaxDepthLevel : EXTRA_LIMITS.nMaxDepthLevel);    printf("Max seconds       : %i\r\n", (EXTRA_LIMITS.nMaxSecondsPerSite==0) ? CRAWLER_LIMITS.nMaxSecondsPerSite : EXTRA_LIMITS.nMaxSecondsPerSite);    printf("Max bytes         : %i\r\n", (EXTRA_LIMITS.nMaxBytesPerSite==0) ? CRAWLER_LIMITS.nMaxBytesPerSite : EXTRA_LIMITS.nMaxBytesPerSite);    printf(" -----------------------\r\n");	if(bUpdate==0)	{		printf("Deleting old index for %s...",currentHst.Host);		fflush(stdout);				sprintf(sqlQuery,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND ii.pageid = pagelist.id ",currentHst.Host);		my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);        sprintf(sqlQuery,"DELETE FROM pagelist WHERE hostname =\'%s\' ",currentHst.Host);		my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);				printf("OK\r\n");				printf("Deleting old rels for %s...",currentHst.Host);				sprintf(sqlQuery,"DELETE FROM rels WHERE host_id = %d",currentHst.host_id);		my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);				printf("OK\r\n");			}	    /* set startTimeMS before creating threads */    startTimeMS=GetTickCount();	SetConsoleTitle("Creating threads...");		CreateThreads();	/**************************MT********************************/		printf("\r\n");		time( &long_time ); 	newtime=localtime(&long_time);		sprintf(startTime,"%i:%i:%i",newtime->tm_hour ,newtime->tm_min ,newtime->tm_sec );		while(condition)	{#ifdef WIN32		sprintf(strTitle,"OpenWebSpiderV%s | Pages: %i | Time: %i sec | host: %s",VERSION,nPagesViewed,(int)((GetTickCount()-startTimeMS)/1000),currentHst.Host);		SetConsoleTitle(strTitle);#endif		CheckThreads();				Sleep(300);				if(iQuit==1)		{			printf("\r\n\r\nQuitting: Killing threads...\n\n");						KillThreads();						iQuit=0;			bKillThread=0;						sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,bytes_downloaded=%d, error_pages=%d WHERE hostname = \'%s\' limit 1" ,nPagesViewed, (int)((GetTickCount()-startTimeMS)/1000), bytesDownloaded, nErrorPages ,currentHst.Host);						printStats(&currentHst,0);						my_mysql_ping(&gMysqlDB1,NO_BLOCK);			my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);			FREE(sqlQuery);						FlushTempTable(gTable);						if( bBuildOwsOwnIndex == 1 )            {			    /* all pages are swapped to the table pagelist */			    /* are we using ows own index? */			    /* if so: build the index for the current hostname */			    BuildOwsOwnIndex(&currentHst, 1);            }				CalcPageRank( currentHst );						DoQuit();					}/*if(iQuit==1)*/				if(bKillThread==1)		{			SetConsoleTitle("Killing threads");			KillThreads();			CreateThreads();		}/*if(bKillThread==1)*/						thrdBlock(BLOCKTHRDHST);		if(/*iDoNextHost==1 ||*/						/*Switching to the next host*/			(lstGetNodeByVal(lstFirst,0)==NULL &&			lstGetNodeByVal(lstFirst,2)==NULL))		{			/* set the status of the pages to be indexed and of that in indexing as indexed */			/*lstSetNodeStatus(lstFirst,0,1);			lstSetNodeStatus(lstFirst,2,1);*/						thrdUnBlock(BLOCKTHRDHST);						SetConsoleTitle("Killing threads");						bKillThread=1;						KillThreads();						if(iDoNextHost==1)			{				//sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%i WHERE hostname =\'%s\' limit 1",nPagesViewed,currentHst.Host);				sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,bytes_downloaded=%d, error_pages=%d WHERE hostname = \'%s\' limit 1" ,nPagesViewed, (int)((GetTickCount()-startTimeMS)/1000), bytesDownloaded, nErrorPages ,currentHst.Host);								my_mysql_ping(&gMysqlDB1,NO_BLOCK);				my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);								iDoNextHost=0;							}						FlushTempTable(gTable);			            if( bBuildOwsOwnIndex == 1 )            {			    /* all pages are swapped to the table pagelist */			    /* are we using ows own index? */			    /* if so: build the index for the current hostname */			    BuildOwsOwnIndex(&currentHst, 1);            }						CalcPageRank(currentHst);			            /* this host has been indexed! Proceed to the next? */			break;								}//if(iDoNextHost==1 || (lstGetNodeByVal(lstFirst,0)==NULL && lstGetNodeByVal(lstFirst,2)==NULL))				thrdUnBlock(BLOCKTHRDHST);			}/*while(condition)*/	FREE(sqlQuery);		return 1;}int checkLimits(){    if(        (EXTRA_LIMITS.nMaxPagesPerSite == 0 && CRAWLER_LIMITS.nMaxPagesPerSite>0 && nPagesViewed >= CRAWLER_LIMITS.nMaxPagesPerSite)                                         /* Check the number of pages indexed */    || (EXTRA_LIMITS.nMaxSecondsPerSite == 0 && CRAWLER_LIMITS.nMaxSecondsPerSite>0 && (int)((GetTickCount()-startTimeMS)/1000) >= CRAWLER_LIMITS.nMaxSecondsPerSite)         /* Check the number of seconds */    || (EXTRA_LIMITS.nMaxBytesPerSite == 0 && CRAWLER_LIMITS.nMaxBytesPerSite>0 && bytesDownloaded >= CRAWLER_LIMITS.nMaxBytesPerSite)                                      /* Check the number of bytes downloaded */    || (EXTRA_LIMITS.nMaxPagesPerSite>0 && nPagesViewed >= EXTRA_LIMITS.nMaxPagesPerSite)                                         /* Check the number of pages indexed */    || (EXTRA_LIMITS.nMaxSecondsPerSite>0 && (int)((GetTickCount()-startTimeMS)/1000) >= EXTRA_LIMITS.nMaxSecondsPerSite)         /* Check the number of seconds */    || (EXTRA_LIMITS.nMaxBytesPerSite>0 && bytesDownloaded >= EXTRA_LIMITS.nMaxBytesPerSite) 	|| (CRAWLER_LIMITS.nMaxErrorPerSite>0 && nErrorPages >= CRAWLER_LIMITS.nMaxErrorPerSite)                                         /* Check the number of error pages */    )        return 1;return 0;}/** flag=0 -> complete stats* flag=1 -> in-complete stats* flag=2 -> switched to the next host*/void printStats(struct sHost* Host,int flag){	time_t long_time;	struct tm *newtime;	FILE* file;		time( &long_time ); 	newtime=localtime(&long_time);		if(flag==1)		printf("\r\n + STATS(*)\r\n");	else if(flag==2)		printf("\r\n + STATS(2)\r\n");	else		printf("\r\n + STATS\r\n");		printf("  - Host:\t\t%s\r\n",Host->Host );	printf("  - Pages:\t\t%i\r\n",nPagesViewed);	printf("  - Downloaded:\t\t%i Kb\r\n",(int)bytesDownloaded/1024);	printf("  - Scan time: %is (%s - %i:%i:%i)\r\n\r\n",(int)((GetTickCount()-startTimeMS)/1000),startTime,newtime->tm_hour ,newtime->tm_min ,newtime->tm_sec  );		if((file = fopen("stats.log","a"))!=NULL)	{		if(flag==1)			fprintf(file," + STATS(*)\r\n");		else if(flag==2)			fprintf(file," + STATS(S)\r\n");		else			fprintf(file," + STATS\r\n");				fprintf(file,"  - %i\\%i\\%i %i:%i:%i -- OpenWebSpider version: %s --\r\n",newtime->tm_mday ,newtime->tm_mon +1, newtime->tm_year +1900,newtime->tm_hour ,newtime->tm_min ,newtime->tm_sec,VERSION);		fprintf(file,"  - Host:\t\t\t%s\r\n",Host->Host );		fprintf(file,"  - Pages:\t\t%i\r\n",nPagesViewed);		fprintf(file,"  - Downloaded:\t\t%i Kb\r\n",(int)bytesDownloaded/1024);		fprintf(file,"  - Scan time: %is (%s - %i:%i:%i) \r\n",(int)((GetTickCount()-startTimeMS)/1000),startTime,newtime->tm_hour ,newtime->tm_min ,newtime->tm_sec);		fprintf(file,"============================================================\r\n\r\n");		fclose(file);	}}void MemoryCorruptedHandler(char* funct){	printf("\r\n\r\nMemory corrupted\r\n");		if(funct)		printf("Function: %s\r\n",funct);		printf("Exiting...\r\n\r\n");	exit(0);}void DoQuit(){	if(actAsAServerPort)	{		printf("\n\nFreeing Sockets...");				closesocket(OWS_Server_fd);				printf("OK\n\n");	}		iQuit=0;	bKillThread=0;		DropTempTable(gTable);		mysql_close(&gMysqlDB1);	mysql_close(&gMysqlDB2);		Sleep(200);		printf("Bye\n\n");	//getchar();getchar();getchar();getchar();getchar();	SetConsoleTitle("Bye byE");	exit(0);		return;}#endif/*EOF*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -