⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 misc.h

📁 网页抓取程序
💻 H
📖 第 1 页 / 共 2 页
字号:
/* OpenWebSpider* *  Authors:     Stefano Alimonti AND Stefano Fantin *  Version:     0.7 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it*** This file is part of OpenWebSpider** This program is free software; you can redistribute it and/or modify* it under the terms of the GNU General Public License as published by* the Free Software Foundation; either version 2 of the License, or* (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU General Public License for more details.** You should have received a copy of the GNU General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA**/#ifndef __MISC#define __MISC#ifndef WIN32int GetTickCount(){	struct timeval tv;	gettimeofday(&tv, NULL);	return tv.tv_sec*1000L+tv.tv_usec/1000L;}void SetConsoleTitle(char* msg){	return;}int stricmp(char*a,char*b){	return strcasecmp(a,b);}int strnicmp(char*a,char*b,int c){	return strncasecmp(a,b,c);}void Sleep(int n){	usleep((unsigned)n*1000);		return;}char* _strupr(char*a){	int m,i;	m=strlen(a);		for(i=0;i<m;i++)		a[i]=(char)toupper(a[i]);		return a;}char* _strlwr(char*a){	int m,i;	m=strlen(a);		for(i=0;i<m;i++)		a[i]=(char)tolower(a[i]);		return a;}int closesocket(int s){	return close(s);}int ExitThread(int a){	pthread_exit(&a);}int TerminateThread(pthread_t thread,int nothing){	return pthread_cancel(thread);}void CloseHandle(HANDLE a){	return;}#endifint InitMysql(){	SetConsoleTitle("Connecting to mysql...");		printf("Connecting to Mysql server n.1 (%s)...",MYSQLSERVER1);                 //Hosts	if(sqlConnect(MYSQLSERVER1, USERDB1, PASSDB1, DB1,&gMysqlDB1, MYSQLSERVER_PORT1)==0)	{		fprintf(stderr, "ERROR\r\nFailed to connect to database(%s): Error: %s\r\n",DB1,mysql_error(&gMysqlDB1));				ERROR_LOG(mysql_error(&gMysqlDB1))			return -1;	}		printf("OK\r\nConnecting to Mysql server n.2 (%s)...",MYSQLSERVER2);           //Pages	if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2,&gMysqlDB2, MYSQLSERVER_PORT2)==0)	{		fprintf(stderr, "ERROR\r\nFailed to connect to database(%s): Error: %s\r\n",DB2,mysql_error(&gMysqlDB2));				ERROR_LOG(mysql_error(&gMysqlDB2))						mysql_close(&gMysqlDB2);		return -1;	}		printf("OK\r\n");	SetConsoleTitle("Connecting to mysql...OK");return 1;}int InitCrawler(struct sHost currentHst){	memset(iLastPing,0,sizeof(iLastPing));	printf("\r\n");	printf("Start Host        : %s\r\n",currentHst.Host);	printf("Start Page        : %s\r\n", currentHst.Page);	printf("Scan Mode         : Index\r\n");	printf("Mode              : %s\r\n",(starthostonly==1)?"Single Host":"Recursive");	printf("Mysql server n.1  : %s\r\n",MYSQLSERVER1);	printf("Mysql server n.2  : %s\r\n",MYSQLSERVER2);    printf(" ---  Global Limits  ---\r\n");    printf("Max pages         : %i\r\n",CRAWLER_LIMITS.nMaxPagesPerSite);	printf("Max depth level   : %i\r\n",CRAWLER_LIMITS.nMaxDepthLevel);    printf("Max seconds       : %i\r\n",CRAWLER_LIMITS.nMaxSecondsPerSite);    printf("Max bytes         : %i\r\n",CRAWLER_LIMITS.nMaxBytesPerSite);    printf(" -----------------------\r\n");	printf("Surfing the net... (press CTRL+C to exit)\r\n");			if(actAsAServerPort)	{		CreateServerThread(actAsAServerPort);		Sleep(200);	}        if(!StartUpWinsock())	{		fprintf(stderr,"WSAStartup() error\r\n");		ERROR_LOG("WSAStartup() error")			return -1;	}		/* connect to mysql servers */	if(InitMysql()==-1)		return -1;		SetConsoleTitle("Creating temp table...");		do	{		RandomTable(gTable);	}	while(!CreateTmpTable(gTable));    //Loop until creates a new tmp table!!!   	signal(SIGINT,  sigdie);	signal(SIGTERM, sigdie);return 1;}int setHostExtras(int host_id){char sqlQuery[MAXQUERYSIZE];MYSQL_ROW row;MYSQL_RES gRes;MYSQL_RES** tmpRes=NULL;    tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));		if(tmpRes==NULL)		MemoryCorruptedHandler("setHostExtras");    sprintf(sqlQuery,"select hostlist_extras.max_pages,hostlist_extras.max_level,hostlist_extras.max_seconds, hostlist_extras.max_bytes from hostlist left outer join hostlist_extras on hostlist.id = hostlist_extras.host_id WHERE hostlist.id = %d ", host_id);		my_mysql_query_and_store_results(&gMysqlDB1, sqlQuery, tmpRes, &gRes, BLOCKDB1);    if((row = mysql_fetch_row(&gRes)))	{		if(row[0])  /* max_pages */        {            if( atoi(row[0]) > 0 )            {                EXTRA_LIMITS.nMaxPagesPerSite = atoi(row[0]);            }        }        if(row[1])  /* max_level */        {            if( atoi(row[1]) > 0 )            {                EXTRA_LIMITS.nMaxDepthLevel = atoi(row[1]);            }        }        if(row[2])  /* max_seconds */        {            if( atoi(row[2]) > 0 )            {                EXTRA_LIMITS.nMaxSecondsPerSite = atoi(row[2]);            }        }        if(row[3])  /* max_bytes */        {            if( atoi(row[3]) > 0 )            {                EXTRA_LIMITS.nMaxBytesPerSite = atoi(row[3]);            }        }	}    if(*tmpRes)	{		mysql_free_result(*tmpRes);	}		FREE(tmpRes);return 1;}int CrawlerMainLoop(struct sHost currentHst){    /* this is the first URL */    InitIndexing(currentHst);    while(1)    {        /* set the current host as indexed and try to get another host to be indexed */        if((ReturnFirstUrl(&currentHst))==-1)		{			fprintf(stderr,"\nBuffer empty\n");			break;		}        /* check whether iQuit as been set by ReturnFirstUrl */        if(iQuit==1)		{			break;		}        InitIndexing(currentHst);    }        DoQuit();return 1;}int InitIndexing(struct sHost currentHst){	int condition = 1;	char* sqlQuery;	DWORD avgSec;	time_t long_time;	struct tm *newtime;	struct sHost *robots_txt;	#ifdef WIN32	char strTitle[3000];#endif	    iRobCrawlDelay  = 0;    bRobotsOK       = 0;    nPagesViewed    = 0;	bytesDownloaded = 0;	nErrorPages     = 0;	startTimeMS     = 0;    bKillThread     = 0;    avgSec          = 0;    EXTRA_LIMITS.nMaxBytesPerSite   = 0;    EXTRA_LIMITS.nMaxDepthLevel     = 0;    EXTRA_LIMITS.nMaxPagesPerSite   = 0;    EXTRA_LIMITS.nMaxSecondsPerSite = 0;	memset(lstRobotsExclusions,0,sizeof(lstRobotsExclusions));	sqlQuery = malloc(MAXQUERYSIZE);	if(sqlQuery==NULL)		MemoryCorruptedHandler("InitIndexing");    /* try to free the memory used */    lstFreeAll(lstFirst);	    /* does this host exist? */    if( currentHst.host_id == 0 )        currentHst.host_id = GetHostId( currentHst );    if( currentHst.host_id == 0)   //no

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -