📄 regexfilter.c
字号:
/* OpenWebSpider * * Author: Stefano Alimonti aka Shen139 * Version: 0.5.1 * Mail: shen139 [at] openwebspider (dot) org * * * This file is part of OpenWebSpider * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* *OpenWebSpider option '-T' will ignore this module *//* *$ gcc -g -c regexFilter.c$ gcc -g -shared -W1,-soname,regexFilter.so.0 -o regexFilter.so regexFilter.o -lc||$ make mod_regexfilter * */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "../modHeader.h"#include "../../regex.h"typedef struct __filter{ regex_t hostFilter; unsigned short int bHost; regex_t pageFilter; unsigned short int bPage; regex_t htmlFilter; unsigned short int bHtml; regex_t textFilter; unsigned short int bText;}_FILTER;struct __filter rFilter;#ifndef WIN32int stricmp(char*a,char*b){return strcasecmp(a,b);}int strnicmp(char*a,char*b,int c){return strncasecmp(a,b,c);}#endifint UnToken(char* str,char* Tokens,char* out,int len){int c,i,x=0,tokenfound,y;y=MIN(len,(signed)strlen(str)); for(c=0;c<y;c++) { tokenfound=0; for(i=0;i<(signed)strlen(Tokens);i++) if(str[c]==Tokens[i]) tokenfound=1; if(tokenfound==0) out[x++]=str[c]; } out[x]=0;return 1;}/* return: 0 -> OK : > 0 -> error while parsing file at line $iLine*/int ParseRegexConf(FILE* pF,char* hostname){char sLine[500];char sString[500];int iLine=0;int bOpenEntry=0;int bSpecificHostname=0; while(!feof(pF)) { memset(sLine,0,sizeof(sLine)); fgets(sLine,499,pF); iLine++; if(strnicmp(sLine,"[]",2)==0) { //OK parse the global path bOpenEntry=1; /*global regex*/ if(bSpecificHostname==1) return 0; else rFilter.bHost=rFilter.bPage=rFilter.bHtml=rFilter.bText=0; } else if(sLine[0]=='[' && strchr(sLine+1,']')>sLine) { char currentHostname[500]; memset(currentHostname,0,500); strncpy( currentHostname, sLine+1, strchr(sLine+1,']')-sLine-1 ); if(strcmp(currentHostname,hostname)==0) { bSpecificHostname=1; bOpenEntry=1; rFilter.bHost=rFilter.bPage=rFilter.bHtml=rFilter.bText=0; } else bOpenEntry=0; } else if(sLine[0]=='#' || sLine[0]=='\r' || sLine[0]=='\n' || sLine[0]==0) continue; else if(bOpenEntry==0) continue; else if(strnicmp(sLine,"hostname=",9)==0) { UnToken(sLine+9,"\r\n",sString,499); if(strlen(sString)==0) continue; if(regcomp(&rFilter.hostFilter,sString ,REG_EXTENDED) != 0) return iLine; rFilter.bHost=1; } else if(strnicmp(sLine,"page=",5)==0) { UnToken(sLine+5,"\r\n",sString,499); if(strlen(sString)==0) continue; if(regcomp(&rFilter.pageFilter,sString ,REG_EXTENDED) != 0) return iLine; rFilter.bPage=1; } else if(strnicmp(sLine,"html=",5)==0) { UnToken(sLine+5,"\r\n",sString,499); if(strlen(sString)==0) continue; if(regcomp(&rFilter.htmlFilter,sString ,REG_EXTENDED) != 0) return iLine; rFilter.bHtml=1; } else if(strnicmp(sLine,"text=",5)==0) { UnToken(sLine+5,"\r\n",sString,499); if(strlen(sString)==0) continue; if(regcomp(&rFilter.textFilter,sString ,REG_EXTENDED) != 0) return iLine; rFilter.bText=1; } else return iLine; }return 0;}/* modFilter should return 1 if the current page must be indexed 0 if discarded*/int modFilter (struct functArg* arg){ if(arg) { if(rFilter.bHost==1) if(regexec(&rFilter.hostFilter, arg->hostInfo->Host, 0, 0, 0) != 0) //do not match? => don't index return 0; if(rFilter.bPage==1) if(regexec(&rFilter.pageFilter, arg->hostInfo->Page, 0, 0, 0) != 0) //do not match? => don't index return 0; if(rFilter.bHtml==1) if(regexec(&rFilter.htmlFilter, arg->html, 0, 0, 0) != 0) //do not match? => don't index return 0; if(rFilter.bText==1) if(regexec(&rFilter.textFilter, arg->text, 0, 0, 0) != 0) //do not match? => don't index return 0; strcpy(arg->html,"prova"); strcpy(arg->text,"prova testo"); return 1; /*OK... index*/ }return 0; /*don't index*/}int modInitFilter (char* hostname, char* error){FILE* pF;int errLine; pF=fopen("mod_regex.conf","r"); if(pF==NULL) { strcpy(error,"File not found(in the current directory): mod_regex.conf"); return 0; } if( (errLine=ParseRegexConf(pF, hostname))>0 ) { sprintf(error,"Error while parsing mod_regex.conf at line: %i for hostname: %s",errLine, hostname); return 0; } fclose(pF);return 1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -