⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 urlfunct.h

📁 网页抓取程序
💻 H
📖 第 1 页 / 共 2 页
字号:
/* OpenWebSpider**  Authors:     Stefano Alimonti AND Stefano Fantin*  Version:     0.7*  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it*** This file is part of OpenWebSpider** This program is free software; you can redistribute it and/or modify* it under the terms of the GNU General Public License as published by* the Free Software Foundation; either version 2 of the License, or* (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU General Public License for more details.** You should have received a copy of the GNU General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA**/#ifndef __URLFUNCT#define __URLFUNCT/* ReturnFirstUrl* Host <-* Set current Host as indexed and return the first host found in the list if available*/int ReturnFirstUrl(struct sHost* Host){	MYSQL_ROW row;	char sqlQuery[MAXQUERYSIZE];	MYSQL_RES gRes;	MYSQL_RES** tmpRes=NULL;		if(Host==NULL)	{		printf("Critical error\r\n\r\n");		return -1;	}		my_mysql_ping(&gMysqlDB1,BLOCKDB1);		//insert current host as viewed (hostlist.status=1)	if(Host->Host[0]!=0)	{		sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,bytes_downloaded=%d, error_pages=%d WHERE hostname = \'%s\' limit 1",nPagesViewed, (int)((GetTickCount()-startTimeMS)/1000), bytesDownloaded, nErrorPages ,Host->Host);				my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);	}		printStats(Host,(iDoNextHost==1)?2:0);		if(starthostonly==1)	{		iQuit=1;		return 1;	}		/* we have to crawl the current host (ows server switch to an input-defined host) */    if(nextHost)    {        AddExternalHost(*nextHost, NULL);        sprintf(sqlQuery,"(select CONCAT('http://', hostname),port, id from hostlist where hostname='%s' and port = %i) union all (select CONCAT('http://', hostname),port, id from hostlist where status=0 ORDER BY priority DESC) limit 1",nextHost->Host, nextHost->port);		        FREE(nextHost);        nextHost = NULL;    }    else        sprintf(sqlQuery,"select CONCAT('http://', hostname) ,port, id from hostlist where status=0 ORDER BY priority DESC, id limit 1");			tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));		if(tmpRes==NULL)		MemoryCorruptedHandler("ReturnFirstUrl");		my_mysql_query_and_store_results(&gMysqlDB1, sqlQuery,tmpRes,&gRes,BLOCKDB1);	//May return null!!!		if(mysql_affected_rows(&gMysqlDB1)==0)			//all buffers empty	{		if(*tmpRes)		{			mysql_free_result(*tmpRes);		}				FREE(tmpRes);				return -1;	}	else	{			if((row = mysql_fetch_row(&gRes))==NULL)	//?!?		{			if(*tmpRes)			{				mysql_free_result(*tmpRes);			}						FREE(tmpRes);						return -1;					//there's no results (it seems so...)		}				if(ParseUrl(row[0],Host,NULL)==-1)		//Wrong URL???		{			if(*tmpRes)			{				mysql_free_result(*tmpRes);			}						FREE(tmpRes);						return 0;		}				Host->port = atoi(row[1]);        Host->host_id = atoi(row[3]);				if(*tmpRes)		{			mysql_free_result(*tmpRes);		}				FREE(tmpRes);						return 1;	}}/* AddUrl*/int AddUrl(struct sHost hst, unsigned int level,struct sHost* from){	char* sqlQuery; 			/* if the host of the current page is the same of that we are indexing and Free Indexing Mode is off*/	/* bFreeIndexingMode == 1 == Index all pages of the current host and not */	if(stricmp(IndexingHost.Host,hst.Host)!=0 && bFreeIndexingMode==0)		AddExternalHost(hst,from);	else	{				/* if we are in the free indexing mode we will index this page as it was of the current indexing host but we must add this host to the table hostlist */		/* and we must delete the current page from the Index (pagelist) */		if(bFreeIndexingMode==1)		{			AddExternalHost(hst,from);						sqlQuery = malloc(MAXQUERYSIZE);			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"DELETE FROM pagelist WHERE hostname =\'%s\' AND page=\'%s\'",hst.Host, hst.Page);			my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX);			FREE(sqlQuery);		}        else        {            /* we are in the same host */				if(from)	            hst.host_id = from->host_id;        }        if(hst.host_id==0)            hst.host_id = GetHostId(hst);				if(nRelationships==2)		{			pRelationships(from,&hst,nRelationships);		}				/* Check the current page against the robots.txt,             the maximum level of depth and the maximum number of pages to be indexed,            the number of seconds,            the number of bytes downloaded            or if we are switching to the next host */		if(            CheckRobotExclusion(hst.Page)==0             || checkLimits() == 1            || (EXTRA_LIMITS.nMaxDepthLevel == 0 && CRAWLER_LIMITS.nMaxDepthLevel>0 && level >= CRAWLER_LIMITS.nMaxDepthLevel)            || (EXTRA_LIMITS.nMaxDepthLevel>0 && level >= EXTRA_LIMITS.nMaxDepthLevel)            || iDoNextHost==1)			return -1;				if(lstGetNodeByHost(lstFirst,hst)==NULL)	//Host is not in list		{			hst.level = level+1;			lstAddHost(&lstFirst,hst);		}		else			return -1;	}		return 1;}int AddExternalHost(struct sHost Host,struct sHost* from){	char* sqlQuery;	char sError[MAXHOSTSIZE+50];		if(bTesting==1 || bAddExternalHost==1)		return 1;		if(iQuit==1 || bKillThread==1)		return 1;		if(strchr(Host.Host ,' ')>Host.Host)	{		sprintf(sError,"AddExternalHost(): Found wrong url: %s",Host.Host);		printf("\r\n %s \r\n",sError);		ERROR_LOG(sError);				thrdUnBlock(BLOCKEXH);		return -1;	}		sqlQuery = malloc(MAXQUERYSIZE);	    snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO hostlist (hostname,port,status) VALUES('%s',%d, 0);",Host.Host, Host.port);	my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);		FREE(sqlQuery);	/* "from" could be NULL (ows server switch to an input-defined host) */	if(from)		pRelationships(from,&Host,nRelationships);	return 1;}/* GetDir* Page -> dir <-* Page = "/dir1/dir2/page.htm" => dir = "/dir1/dir2/"*/int GetDir(char* Page,char* dir){int i;int last=0;char* tmpPage;char* tmpP;    tmpPage = malloc(strlen(Page)+5);    strcpy(tmpPage, Page);        tmpP = strchr(tmpPage,'?');    if( tmpP > tmpPage)        tmpPage[tmpP-tmpPage]=0;	    for(i=0;i<(signed)strlen(tmpPage);i++)		if(tmpPage[i]=='/')			last=i;        	strncpy(dir,(last==0) ? "/" : tmpPage,(last==0) ? 1 : last);	dir[(last==0) ? 1 : last]=0;		if(dir[strlen(dir)-1]!='/')		strcat(dir,"/");	    FREE(tmpPage);return 1;}int CheckPage(char* page){	int c=0;	int b;	char tmpPage[MAXPAGESIZE+2000];	char rTmpPage[MAXPAGESIZE];	char *rPos;	int i;		memset(rTmpPage,0,sizeof(rTmpPage));		strncpy(rTmpPage,page,MIN(strlen(page),MAXPAGESIZE-1));		if(strlen(page)<2)		return 1;		if(page[0]==' ')		return -1;		if(page[0]=='.' && page[1]=='/')		strcpy(rTmpPage,rTmpPage+2);		for(i=1;rTmpPage[i]!=0 && i<MAXPAGESIZE-1;i++)	{		if(rTmpPage[i-1] != '.' && rTmpPage[i] == '.' && rTmpPage[i+1]=='/')		{			rTmpPage[i]=0;			strcat(rTmpPage,rTmpPage+i+2);			i-=2;		}	}		if(rTmpPage[i-1]=='.')		rTmpPage[i-1]=0;	else		rTmpPage[i]=0;		if(strstr(rTmpPage,"..")==0)	{		strcpy(page,rTmpPage);		return 1;	}		c=0;		rPos=rTmpPage;		if(page[0]=='/')	{		tmpPage[0]='/';		tmpPage[1]=0;	}	else		tmpPage[0]=0;		while(rPos[0]!=0)	{		c=strchr(rPos,'/')-rPos;				if((unsigned)c>strlen(rPos) || c<0)		{			strcat(tmpPage,rPos);			break;		}				if(rPos[0]==' ')			return -1;				if(strncmp(rPos,"..",c)!=0)		{			strncat(tmpPage,rPos,c+1);			tmpPage[strlen(tmpPage)+c+1]=0;		}		else		{			for(b=strlen(tmpPage)-2;b>0;b--)			{				if(tmpPage[b]=='/')				{					tmpPage[b+1]=0;					break;				}			}			if(b==0)			{				tmpPage[0] = (tmpPage[0]=='/') ? '/' : '\0';				tmpPage[1] = '\0';			}		}		rPos+=c+1;	}		if(tmpPage[0]==0)	{		tmpPage[0]='/';		tmpPage[1]=0;	}		strcpy(page,tmpPage);		return 1;}/* PageType* Host <-* Host->Page = "/test.htm" Host->type = 1 (type htm/html)*/int PageType(struct sHost* Host){	int i;	char rPage[MAXPAGESIZE];	int bArgs=0;		/*bArgs=1 == the page contains a '?'*/	int slHP;			if(Host==NULL)		return -1;		memset(rPage,0,MAXPAGESIZE);		strncpy(rPage,Host->Page,MAXPAGESIZE-1);		if(strchr(rPage,'?')>rPage)				//does this page contain a '?'	{		rPage[strchr(rPage,'?')-rPage]=0;	//cut it		bArgs=1;	}		if(Host->Page[strlen(Host->Page)-1]=='/')	{		Host->type = 1;			// Html file		return 1;	}		slHP = MIN(strlen(Host->Page),MAXPAGESIZE);	for(i=0;i<slHP;i++)	{		if(Host->Page[i]=='.')			break;	}		if(i==(signed)strlen(Host->Page) && bArgs==0)    //Maybe a directory (no '.' found)    {		if(strlen(Host->Page)>=MAXPAGESIZE-1)			return -1;				strcat(Host->Page,"/");		Host->type = 1;            // Html file

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -