📄 indexer.h
字号:
/* OpenWebSpider* * Authors: Stefano Alimonti AND Stefano Fantin * Version: 0.7 * E-Mails: shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it*** This file is part of OpenWebSpider** This program is free software; you can redistribute it and/or modify* it under the terms of the GNU General Public License as published by* the Free Software Foundation; either version 2 of the License, or* (at your option) any later version.** This program is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the* GNU General Public License for more details.** You should have received a copy of the GNU General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA**/#ifndef __INDEXER#define __INDEXER/* DEFAULT: MySQL FULL-TEXT Index */int IndexPage(char* html, struct sHost host, unsigned int htmlLength){ char *cTmp; char *pureText; char tmpTitle[MAXDESCRIPTIONSIZE], title[MAXDESCRIPTIONSIZE]; char sanHostname[MAXHOSTSIZE]; char sanPage[MAXPAGESIZE]; int usetitle=0; char* sqlQuery; char *htmlcache=NULL; unsigned int textLength; int (*modFilter)(struct functArg*); if( bTesting==1 || bDontIndexPages==1) return 1; if( bUpdate==1 ) //-u ? if( IsPageIndexed(&host)==1 ) //Is this page Indexed ? return 0; //Yes, don't re-index cTmp = (char*)malloc(MAXPACKETSIZE); pureText = (char*)malloc(MAXPACKETSIZE); // are we saving a cache? yes: prepare it htmlcache = NULL; if(xCacheHtml==1) { htmlcache=(char*)malloc( ( strlen(html) + 1 ) * 2 ); mysql_real_escape_string(&gMysqlDB2, htmlcache, html, strlen(html)); } if(cTmp==NULL || pureText==NULL) MemoryCorruptedHandler("IndexPage"); if(host.type==1) //HTML page { if(BetweenTag(html, "title",tmpTitle ,1,MAXDESCRIPTIONSIZE)>0) { memset(title,0,MAXDESCRIPTIONSIZE); snprintf(title,MAXDESCRIPTIONSIZE-1,"%s",tmpTitle+1); usetitle=1; } textLength=UnHtml(html,cTmp,MAXPACKETSIZE); /* if sqlTextToUTF8 doesn't complete its work (for example for text too long) it store the text as returned by UnHtml */ if(sqlTextToUTF8(cTmp,pureText,MAXPACKETSIZE)==0) strcpy(pureText, cTmp); } else if(host.type==2) //Plain text files { RemoveShit(html); OnlyOneSpace(html,pureText,MAXPACKETSIZE); textLength=strlen(pureText); } else if(host.type==4) //Custom handled files { //we empty pureText so the module can fill it with the text of the page memset(pureText,0,MAXPACKETSIZE); textLength=0; } else { /* impossible :-) */ FREE(cTmp); FREE(pureText); FREE(htmlcache); return 0; } //are we using a regular expression filter? if( bUseRegularExpressionB == 1 ) { //yes if(regexec(®exContentFilter, pureText, 0, 0, 0) != 0) { FREE(cTmp); FREE(pureText); FREE(htmlcache); return 0; } } //else continue if( (modFilter = GetModFunctionHandlerByName("modFilter")) ) { //we are using a custom function as filter struct functArg tmpModArg; tmpModArg.hostInfo = &host; tmpModArg.html = html; tmpModArg.htmlLength = htmlLength; tmpModArg.text = pureText; tmpModArg.textLength = textLength; tmpModArg.mysqlDB1 = &gMysqlDB1; tmpModArg.mysqlDB2 = &gMysqlDB2; thrdBlock(BLOCKDB1); thrdBlock(BLOCKINDEX); if(modFilter(&tmpModArg)==0) { thrdUnBlock(BLOCKDB1); thrdUnBlock(BLOCKINDEX); FREE(cTmp); FREE(pureText); FREE(htmlcache); return 0; } /*else index*/ thrdUnBlock(BLOCKDB1); thrdUnBlock(BLOCKINDEX); } /* here we have text that could be dirty so we must clean it*/ if(host.type==4) RemoveShit(pureText); sqlQuery = malloc(MAXQUERYSIZE); if(sqlQuery==NULL) MemoryCorruptedHandler("IndexPage"); (usetitle==1) ? RemoveShit(title):RemoveShit(host.Description); mysql_real_escape_string(&gMysqlDB2, sanHostname, host.Host, strlen(host.Host)); mysql_real_escape_string(&gMysqlDB2, sanPage, host.Page, strlen(host.Page)); memset(sqlQuery,0,MAXQUERYSIZE); snprintf(sqlQuery,MAXQUERYSIZE,"INSERT DELAYED INTO %s SET host_id = %d, hostname = \'%s\',page=\'%s\',title=TRIM(\'%s\'),date=curdate(),time=curtime(),version=%i,level=%i,`text`=TRIM(CONCAT(\'%s\')) ;",gTable, GetHostId( host ), sanHostname,sanPage,(usetitle==1) ? title: host.Description,DBVERSION,host.level,pureText); if(sqlQuery[MAXQUERYSIZE-3]!=0) { sqlQuery[MAXQUERYSIZE-3]='\''; sqlQuery[MAXQUERYSIZE-2]=';'; sqlQuery[MAXQUERYSIZE-1]=0; } my_mysql_ping(&gMysqlDB2,BLOCKINDEX); if(my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX)) { ERROR_LOG(mysql_error(&gMysqlDB2)) ERROR_LOG(sqlQuery) printf("\r\nQuery Error in function IndexPage(): %s\r\n",mysql_error(&gMysqlDB2)); printf("Trying to reconnect to server..."); printf("OK\r\nConnecting to Mysql server n.2 (%s)...",MYSQLSERVER2); if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2,&gMysqlDB2, MYSQLSERVER_PORT2)==0) { printf("ERROR\r\n"); iQuit=1; ERROR_LOG(mysql_error(&gMysqlDB2)) FREE(cTmp); FREE(pureText); FREE(sqlQuery); FREE(htmlcache); return -1; } printf("OK\r\n"); } /* this page is indexed correctly */ FREE(cTmp); FREE(pureText); if(htmlcache && xCacheHtml==1) //saves html cache { if(htmlcache==NULL) MemoryCorruptedHandler("IndexPage"); if(xCacheHtmlCompressed==1) sprintf(sqlQuery,"UPDATE %s SET `cache`=COMPRESS('%s') WHERE hostname='%s' and page='%s';",gTable,htmlcache,sanHostname,sanPage); else sprintf(sqlQuery,"UPDATE %s SET `cache`='%s' WHERE hostname='%s' and page='%s';",gTable,htmlcache,sanHostname,sanPage); if(my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX)) { ERROR_LOG(mysql_error(&gMysqlDB2)) printf("\r\nQuery Error in function IndexPage(): %s\r\n",mysql_error(&gMysqlDB2)); } FREE(htmlcache); } FREE(sqlQuery); return 1;}/* IsPageIndexed* if the page exists returns its id* else returns 0*/int IsPageIndexed(struct sHost* host){ char* sqlQuery; MYSQL_RES gRes; MYSQL_RES** tmpRes=NULL; int ret=1; tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES)); sqlQuery = malloc(MAXQUERYSIZE); if(tmpRes==NULL || sqlQuery==NULL) MemoryCorruptedHandler("IsPageIndexed"); sprintf(sqlQuery,"SELECT id FROM pagelist WHERE host_id = %d AND page='%s' LIMIT 1",GetHostId(*host), host->Page); my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX); if(mysql_affected_rows(&gMysqlDB2)==0) //Page is not indexed -> return 0 ret = 0; if(*tmpRes) { mysql_free_result(*tmpRes); } FREE(tmpRes); FREE(sqlQuery); return ret;}/* Takes a Text and convert all special characters to UTF-8 */int sqlTextToUTF8(char* text, char* out, int maxout){ int x,y; int textLen; unsigned char curC; char* aass; char ssaa[10]; int cont; int bAscFound; textLen=strlen(text); memset(out,0,maxout); y=0; //out[y++]='\''; for(x=0; x<textLen && y<maxout ;x++) { curC=text[x]; if(curC=='&') { bAscFound=0; aass=strchr(text+x,';'); if(aass && aass-(text+x) < 10) { memset(ssaa,0,10); strncpy(ssaa,text+x+1, (aass-(text+x))-1 ); if(ssaa[0]=='#') { char val[10]; if(ssaa[1]=='x') strcpy(val,ssaa+2); else sprintf(val,"%X",atoi(ssaa+1)); if(y+21+strlen(val)+26<maxout) { strcat(out,"', CONVERT(CONVERT(0x"); /* 21 + */ strcat(out,val); /* strlen(utf8) + */ strcat(out," using UCS2) using UTF8),'"); /* 26 = */ y+=21+strlen(val)+26; bAscFound=1; } else return 0; x+=strlen(ssaa)+1; continue; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -