⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 indexer.h

📁 网页抓取程序
💻 H
📖 第 1 页 / 共 2 页
字号:
/* OpenWebSpider* *  Authors:     Stefano Alimonti AND Stefano Fantin *  Version:     0.7 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it*** This file is part of OpenWebSpider** This program is free software; you can redistribute it and/or modify* it under the terms of the GNU General Public License as published by* the Free Software Foundation; either version 2 of the License, or* (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU General Public License for more details.** You should have received a copy of the GNU General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA**/#ifndef __INDEXER#define __INDEXER/* DEFAULT: MySQL FULL-TEXT Index */int IndexPage(char* html, struct sHost host, unsigned int htmlLength){	char *cTmp;	char *pureText;	char tmpTitle[MAXDESCRIPTIONSIZE], title[MAXDESCRIPTIONSIZE];	char sanHostname[MAXHOSTSIZE];	char sanPage[MAXPAGESIZE];	int usetitle=0;	char* sqlQuery;	char *htmlcache=NULL;	unsigned int textLength;    int (*modFilter)(struct functArg*);		if( bTesting==1 || bDontIndexPages==1)		return 1;		if( bUpdate==1 )	//-u ?		if( IsPageIndexed(&host)==1 )	//Is this page Indexed ?			return 0;		//Yes, don't re-index			cTmp = (char*)malloc(MAXPACKETSIZE);	pureText = (char*)malloc(MAXPACKETSIZE);			// are we saving a cache? yes: prepare it	htmlcache = NULL;	if(xCacheHtml==1)	{		htmlcache=(char*)malloc( ( strlen(html) + 1 ) * 2 );				mysql_real_escape_string(&gMysqlDB2, htmlcache, html, strlen(html));	}		if(cTmp==NULL || pureText==NULL)		MemoryCorruptedHandler("IndexPage");		if(host.type==1)    //HTML page	{		if(BetweenTag(html, "title",tmpTitle ,1,MAXDESCRIPTIONSIZE)>0)		{			memset(title,0,MAXDESCRIPTIONSIZE);			snprintf(title,MAXDESCRIPTIONSIZE-1,"%s",tmpTitle+1);			usetitle=1;		}		textLength=UnHtml(html,cTmp,MAXPACKETSIZE);				/* if sqlTextToUTF8 doesn't complete its work (for example for text too long)		it store the text as returned by UnHtml */		if(sqlTextToUTF8(cTmp,pureText,MAXPACKETSIZE)==0)            strcpy(pureText, cTmp);	}	else    if(host.type==2)            //Plain text files	{		RemoveShit(html);		OnlyOneSpace(html,pureText,MAXPACKETSIZE);		textLength=strlen(pureText);	}    else    if(host.type==4)            //Custom handled files	{        //we empty pureText so the module can fill it with the text of the page		memset(pureText,0,MAXPACKETSIZE);		textLength=0;	}    else    {        /* impossible :-) */	    FREE(cTmp);	    FREE(pureText);	    FREE(htmlcache);			return 0;    }	//are we using a regular expression filter?	if( bUseRegularExpressionB == 1 )	{	//yes		if(regexec(&regexContentFilter, pureText, 0, 0, 0) != 0)		{			FREE(cTmp);			FREE(pureText);			FREE(htmlcache);						return 0;		}	}	//else continue    if( (modFilter = GetModFunctionHandlerByName("modFilter")) )	{	//we are using a custom function as filter		struct functArg tmpModArg;				tmpModArg.hostInfo = &host;		tmpModArg.html = html;				tmpModArg.htmlLength = htmlLength;				tmpModArg.text = pureText;				tmpModArg.textLength = textLength;				tmpModArg.mysqlDB1 = &gMysqlDB1;		tmpModArg.mysqlDB2 = &gMysqlDB2;				thrdBlock(BLOCKDB1);		thrdBlock(BLOCKINDEX);				if(modFilter(&tmpModArg)==0)		{			thrdUnBlock(BLOCKDB1);			thrdUnBlock(BLOCKINDEX);			FREE(cTmp);			FREE(pureText);			FREE(htmlcache);						return 0;		}		/*else index*/				thrdUnBlock(BLOCKDB1);		thrdUnBlock(BLOCKINDEX);	}    /* here we have text that could be dirty so we must clean it*/    if(host.type==4)        RemoveShit(pureText);	sqlQuery = malloc(MAXQUERYSIZE);		if(sqlQuery==NULL)		MemoryCorruptedHandler("IndexPage");			(usetitle==1) ? RemoveShit(title):RemoveShit(host.Description);		mysql_real_escape_string(&gMysqlDB2, sanHostname, host.Host, strlen(host.Host));	mysql_real_escape_string(&gMysqlDB2, sanPage, host.Page, strlen(host.Page));		memset(sqlQuery,0,MAXQUERYSIZE);	snprintf(sqlQuery,MAXQUERYSIZE,"INSERT DELAYED INTO %s SET host_id = %d, hostname = \'%s\',page=\'%s\',title=TRIM(\'%s\'),date=curdate(),time=curtime(),version=%i,level=%i,`text`=TRIM(CONCAT(\'%s\')) ;",gTable, GetHostId( host ), sanHostname,sanPage,(usetitle==1) ? title: host.Description,DBVERSION,host.level,pureText);		if(sqlQuery[MAXQUERYSIZE-3]!=0)	{		sqlQuery[MAXQUERYSIZE-3]='\'';		sqlQuery[MAXQUERYSIZE-2]=';';		sqlQuery[MAXQUERYSIZE-1]=0;	}		my_mysql_ping(&gMysqlDB2,BLOCKINDEX);	if(my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX))	{		ERROR_LOG(mysql_error(&gMysqlDB2))			ERROR_LOG(sqlQuery)			printf("\r\nQuery Error in function IndexPage(): %s\r\n",mysql_error(&gMysqlDB2));		printf("Trying to reconnect to server...");		printf("OK\r\nConnecting to Mysql server n.2 (%s)...",MYSQLSERVER2);		if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2,&gMysqlDB2, MYSQLSERVER_PORT2)==0)		{			printf("ERROR\r\n");			iQuit=1;						ERROR_LOG(mysql_error(&gMysqlDB2))								FREE(cTmp);			FREE(pureText);			FREE(sqlQuery);			FREE(htmlcache);						return -1;		}		printf("OK\r\n");	}		/* this page is indexed correctly */		FREE(cTmp);	FREE(pureText);		if(htmlcache && xCacheHtml==1)	//saves html cache	{		if(htmlcache==NULL)			MemoryCorruptedHandler("IndexPage");				if(xCacheHtmlCompressed==1)			sprintf(sqlQuery,"UPDATE %s SET `cache`=COMPRESS('%s') WHERE hostname='%s' and page='%s';",gTable,htmlcache,sanHostname,sanPage);		else			sprintf(sqlQuery,"UPDATE %s SET `cache`='%s' WHERE hostname='%s' and page='%s';",gTable,htmlcache,sanHostname,sanPage);				if(my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX))		{			ERROR_LOG(mysql_error(&gMysqlDB2))			printf("\r\nQuery Error in function IndexPage(): %s\r\n",mysql_error(&gMysqlDB2));		}				FREE(htmlcache);	}		FREE(sqlQuery);		return 1;}/* IsPageIndexed*  if the page exists returns its id*  else returns 0*/int IsPageIndexed(struct sHost* host){	char* sqlQuery;	MYSQL_RES gRes;	MYSQL_RES** tmpRes=NULL;	int ret=1;		tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));		sqlQuery = malloc(MAXQUERYSIZE);		if(tmpRes==NULL || sqlQuery==NULL)		MemoryCorruptedHandler("IsPageIndexed");		sprintf(sqlQuery,"SELECT id FROM pagelist WHERE host_id = %d AND page='%s' LIMIT 1",GetHostId(*host), host->Page);	my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX);		if(mysql_affected_rows(&gMysqlDB2)==0)	//Page is not indexed -> return 0		ret = 0;		if(*tmpRes)	{		mysql_free_result(*tmpRes);	}		FREE(tmpRes);	FREE(sqlQuery);		return ret;}/* Takes a Text and convert all special characters to UTF-8 */int sqlTextToUTF8(char* text, char* out, int maxout){	int x,y;	int textLen;	unsigned char curC;	char* aass;	char ssaa[10];	int cont;	int bAscFound;		textLen=strlen(text);		memset(out,0,maxout);		y=0;		//out[y++]='\'';		for(x=0; x<textLen && y<maxout ;x++)	{		curC=text[x];		if(curC=='&')		{			bAscFound=0;						aass=strchr(text+x,';');			if(aass && aass-(text+x) < 10)			{				memset(ssaa,0,10);				strncpy(ssaa,text+x+1, (aass-(text+x))-1 );								if(ssaa[0]=='#')				{					char val[10];					if(ssaa[1]=='x')						strcpy(val,ssaa+2);					else						sprintf(val,"%X",atoi(ssaa+1));										if(y+21+strlen(val)+26<maxout)					{						strcat(out,"', CONVERT(CONVERT(0x");		/* 21           + */						strcat(out,val);							/* strlen(utf8) + */						strcat(out," using UCS2) using UTF8),'");	/* 26           = */						y+=21+strlen(val)+26;												bAscFound=1;					}					else						return 0;										x+=strlen(ssaa)+1;					continue;									}				

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -