/* openwebspider-0.7.c */
/*
 * OpenWebSpider
 *
 * Authors: Stefano Alimonti AND Stefano Fantin
 * Version: 0.7
 * E-Mails: shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 * Compile with
 *  + Linux:   $ gcc openwebspider-0.7.c -o openwebspider `mysql_config --cflags --libs` -lpthread -ldl -rdynamic -Wall
 *             - mysql-devel needed
 *  + Windows: Microsoft Visual C++ 6.0
 *
 * Web Site: http://www.openwebspider.org/
 *
 * FAQ about robots and search engines: http://www.robotstxt.org/wc/faq.html
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#define AUTHOR    Shen139 AND Fantin
#define VERSION   "0.7"
#define DBVERSION 1

#define USE_REGEX

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <time.h>
#include <malloc.h>
#include <ctype.h>
#include <stdarg.h>

#include "regex.h"

#ifdef WIN32
    #include <process.h>
    #include <windows.h>
    #include "snprintf.c"
    #pragma comment(lib, "libmySQL.lib")
    #include "mysql/mysql.h"
#else /* Linux */
    #define _MULTI_THREADED
    #include <pthread.h>
    #include <sched.h>
    #include <sys/time.h>
    #include <unistd.h>
    #include <mysql/mysql.h>
    #include <sys/types.h>
    #include <netinet/in.h>
    #include <netdb.h>
    #include <dlfcn.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>
#endif

#include "functions.h"
#include "mymutex.h"
#include "list.h"
#include "hstlist.h"
#include "htmlfnct.h"
#include "socket.h"
#include "sqlfnct.h"
#include "getopt.h"
#include "thread.h"
#include "misc.h"
#include "robots.h"
#include "rank.h"
#include "urlfunct.h"
#include "temptable.h"
#include "modules.h"
#include "strfnct.h"
#include "server.h"
#include "search.h"
#include "parse_conf.h"
#include "indexer.h"

int usage(char *txt)
{
    printf("\n\nUsage: openwebspider [Arguments]\r\n");
    printf("Arguments:\r\n");
    printf("-I [Search string] (*) (Search for a word or words (between \"double-quotes\") in the database)\r\n");
    printf("   or\r\n");
    printf("-O [hostname (optional)] (Build the OWS own index on all un-indexed pages, or on a single hostname if specified)\r\n");
    printf("   or\r\n");
    printf("-i [start url] (*) (Start indexing pages from the given URL)\r\n");
    printf("-t [Number of threads] (Default: 20)\r\n");
    printf("-s (Single Host Mode)\r\n");
    printf("-m (Limits the maximum level of depth in the tree of pages) [Default: 0 (No limit)]\r\n");
    printf("-l (Limits the maximum number of pages indexed per site) [Default: 0 (No limit)]\r\n");
    printf("-c (Limits the maximum number of seconds per site) [Default: 0 (No limit)]\r\n");
    printf("-b (Limits the maximum number of bytes downloaded per site) [Default: 0 (No limit)]\r\n");
    printf("-E (Limits the maximum number of error codes received while downloading pages) [Default: 0 (No limit)]\r\n");
    printf("-e (Doesn't add external hosts)\r\n");
    printf("-F (Free indexing mode)\r\n");
    printf("-x (Saves a cache of the HTML page (full HTML)) (slow)\r\n");
    printf("-z (Saves a compressed cache of the HTML page (full HTML)) (slow)\r\n");
    printf("-f [module] (Import loadable functions from the library)\r\n");
    printf("-X [eXtension(s)] (Set all the extensions that openwebspider must consider (eg. -X pdf,swf))\r\n");
    printf("-u (Index only new pages (Update))\r\n");
    printf("-T (Testing Mode) No data (pages and rels) will be written into the DB\r\n");
    printf("-r [0-1-2] (Saves relationships between pages (Default: 1))\r\n"
           "   0: doesn't save relationships\r\n"
           "   1: saves only relationships between hosts\r\n"
           "   2: saves all relationships (between hosts and pages)\r\n");
    printf("-n (Doesn't index pages)\r\n");
    printf("-d [0-%i ms (Crawl Delay)] (Default: 0)\r\n", MAXCRAWLDELAY * 1000);
    printf("-S [TCP PORT] (Act as a server to get commands)\r\n");
    printf("-o [OpenWebSpider Own Index] (Build the OWS own index)\r\n");
    printf("--\r\n");
    printf("-p [path] (Specify the full path of the configuration file (eg.: \"/etc/openwebspider/openwebspider.conf\"))\r\n");
    printf("\r\n(*) Required arguments\r\n");

    fprintf(stderr, "\r\n\r\nERROR: %s\r\n\r\n", txt);
    exit(0);
}
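/*
 * Example invocations (illustrative only; hostnames and values are
 * placeholders, not part of the original source):
 *
 *   $ openwebspider -i www.example.com -s -t 10
 *       crawl www.example.com only (single-host mode) with 10 threads
 *
 *   $ openwebspider -i http://www.example.com/ -m 3 -l 500
 *       crawl at most 3 levels deep and at most 500 pages per site
 *
 *   $ openwebspider -I "some words"
 *       search the database for the given words
 *
 *   $ openwebspider -O www.example.com
 *       build the OWS own index for a single host
 */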
void sigdie(int a)
{
    printf("\r\n\r\nCaught signal n.%i\r\n\r\n", a);

    if(a == 15) /* SIGTERM: exit immediately */
    {
        printf("\r\nExiting...\r\n");
        exit(0);
    }

    iQuit = 1; /* any other signal: ask the crawler to stop gracefully */
    return;
}

int main(int argc, char *argv[])
{
    struct sHost currentHst;
    char starturl[MAXURLSIZE], *starturlTmp;
    int c;
    extern int optind;
    char sUserQuery[MAXUSERQUERYSIZE];
    char sConfFilePath[MAXURLSIZE];

    printf("OpenWebSpider(v%s)\r\n Developed by Stefano Alimonti And Stefano Fantin\r\n\r\n", VERSION);

    if(argc < 2)
        usage("Too few arguments");

    memset(starturl, 0, MAXURLSIZE);
    memset(sConfFilePath, 0, MAXURLSIZE);
    memset(&CustomExtensions, 0, sizeof(CustomExtensions));

    CRAWLER_LIMITS.nMaxPagesPerSite   = 0;
    CRAWLER_LIMITS.nMaxDepthLevel     = 0;
    CRAWLER_LIMITS.nMaxSecondsPerSite = 0;
    CRAWLER_LIMITS.nMaxBytesPerSite   = 0;
    CRAWLER_LIMITS.nMaxErrorPerSite   = 0;

    while((c = getopt(argc, argv, "IisrtmTelxRfXuzdFnSpocbEO")) != -1)
        switch(c)
        {
        case 'I': /* indexed search */
            if(scan_mode != 0)
                usage("(-I): Scan Mode redefinition");
            scan_mode = 2;
            if(optind >= argc)
                usage("(-I): Not enough arguments");
            if(strlen(argv[optind]) > MAXUSERQUERYSIZE - 1)
                usage("(-I): Query too long");
            else
            {
                strncpy(sUserQuery, argv[optind], MAXUSERQUERYSIZE - 1);
                optind++;
            }
            break;

        case 'O': /* build OOI (expected argc = 2 or 3) */
            if(argc > 3)
                usage("(-O): Too many arguments");
            if(scan_mode != 0)
                usage("(-O): Scan Mode redefinition");
            if(argc == 3)
            {
                if(strlen(argv[2]) > MAXHOSTSIZE - 1)
                    usage("(-O): Hostname too long");
                else
                    strncpy(starturl, argv[2], MAXHOSTSIZE - 1);
            }
            scan_mode = 3;
            break;

        case 'i': /* index pages */
            if(scan_mode != 0) /* at startup scan_mode == 0 => uninitialized */
                usage("(-i): Scan Mode redefinition");
            scan_mode = 1;
            if(optind >= argc)
                usage("(-i): Not enough arguments");
            if(strlen(argv[optind]) > MAXURLSIZE - 1)
                usage("(-i): Url too long");
            else
            {
                strncpy(starturl, argv[optind], MAXURLSIZE - 1);
                optind++;
            }
            break;

        case 'f': /* load library */
            if(optind >= argc)
                usage("(-f): Not enough arguments");
            if(strlen(argv[optind]) > MAXPAGESIZE - 1)
                usage("(-f): File name too long");
            else
            {
                myLoadModules(argv[optind], modHandler);
                optind++;
            }
            break;

        case 's': /* single host mode */
            starthostonly = 1;
            break;

        case 'r': /* relationships */
            if(optind >= argc)
                usage("(-r): Not enough arguments");
            if(strcmp(argv[optind], "0") == 0 || strcmp(argv[optind], "1") == 0 || strcmp(argv[optind], "2") == 0)
                nRelationships = atoi(argv[optind]);
            else
                usage("(-r): Range value 0,1,2");
            optind++;
            break;

        case 't': /* number of threads */
            if(optind >= argc)
                usage("(-t): Not enough arguments");
            nThread = atoi(argv[optind]);
            if(nThread > MAXTHREAD)
                usage("(-t): Too many threads");
            if(nThread < 1)
                usage("(-t): At least one thread");
            optind++;
            break;

        case 'm': /* maximum level of depth */
            if(optind >= argc)
                usage("(-m): Not enough arguments");
            CRAWLER_LIMITS.nMaxDepthLevel = atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxDepthLevel < 0)
                usage("(-m): Wrong level of depth");
            optind++;
            break;

        case 'l': /* maximum pages per site */
            if(optind >= argc)
                usage("(-l): Not enough arguments");
            CRAWLER_LIMITS.nMaxPagesPerSite = atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxPagesPerSite < 0)
                usage("(-l): Wrong value for maximum number of pages per site");
            optind++;
            break;

        case 'c': /* maximum seconds per site */
            if(optind >= argc)
                usage("(-c): Not enough arguments");
            CRAWLER_LIMITS.nMaxSecondsPerSite = atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxSecondsPerSite < 0)
                usage("(-c): Wrong value for maximum number of seconds per site");
            optind++;
            break;

        case 'b': /* maximum bytes downloaded per site */
            if(optind >= argc)
                usage("(-b): Not enough arguments");
            CRAWLER_LIMITS.nMaxBytesPerSite = atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxBytesPerSite < 0)
                usage("(-b): Wrong value for maximum number of bytes per site");
            optind++;
            break;

        case 'E': /* maximum errors per site */
            if(optind >= argc)
                usage("(-E): Not enough arguments");
            CRAWLER_LIMITS.nMaxErrorPerSite = atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxErrorPerSite < 0)
                usage("(-E): Wrong value for maximum number of error codes");
            optind++;
            break;

        case 'x': /* save HTML cache */
        case 'z': /* save compressed HTML cache */
            xCacheHtml = 1;
            if(c == 'z')
                xCacheHtmlCompressed = 1;
            break;

        case 'S': /* act as a server */
            if(optind >= argc)
                usage("(-S): Not enough arguments");
            actAsAServerPort = atoi(argv[optind]);
            if(actAsAServerPort < 1)
                usage("(-S): TCP PORT must be a positive integer");
            optind++;
            break;

        case 'p': /* path of openwebspider.conf */
            if(optind >= argc)
                usage("(-p): Not enough arguments");
            if(strlen(argv[optind]) > MAXURLSIZE - 1)
                usage("(-p): Path too long");
            else
            {
                strncpy(sConfFilePath, argv[optind], MAXURLSIZE - 1);
                optind++;
            }
            break;

        case 'n': /* do not index pages */
            bDontIndexPages = 1;
            break;

        case 'T': /* testing mode (doesn't write data to the DB) */
            bTesting = 1;
            break;

        case 'e': /* doesn't add external hosts */
            bAddExternalHost = 1;
            break;

        case 'u': /* update: index only new pages */
            bUpdate = 1;
            break;

        case 'F': /* free indexing mode */
            bFreeIndexingMode = 1;
            break;

        case 'o': /* OWS own index */
            bBuildOwsOwnIndex = 1;
            break;

        case 'X': /* custom extensions (under construction) */
            if(optind >= argc)
                usage("(-X): Not enough arguments");
            if(strlen(argv[optind]) > MAXCUSTOMEXTENSIONSIZE - 1)
                usage("(-X): Custom extensions argument too long");
            else
            {
                /* split the comma-separated extension list */
                char *pExt;
                int nExt = 0; /* renamed from 'c', which shadowed the getopt return value */

                pExt = strtok(argv[optind], ",");
                while(pExt != NULL)
                {
                    if(nExt >= MAXCUSTOMEXTENSIONS) /* was '>', which could overrun the array by one */
                        break;
                    if(strlen(pExt) < MAXEXTENSIONSIZE)
                        strcpy(CustomExtensions[nExt++], pExt);
                    pExt = strtok(NULL, ",");
                }
                optind++;
            }
            break;

        case 'd': /* crawl delay */
            if(optind >= argc)
                usage("(-d): Not enough arguments");
            iCrawlDelay = atoi(argv[optind]);
            if(iCrawlDelay > MAXCRAWLDELAY * 1000 || iCrawlDelay < 0)
                usage("(-d): Wrong Crawl Delay");
            optind++;
            break;

        default:
            usage("Unknown option argument");
        }
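    /*
     * Note on the getopt() loop above: the optstring carries no ':'
     * modifiers, so getopt() never consumes option arguments itself;
     * each case reads argv[optind] and advances optind by hand.  One
     * consequence of this pattern is that an argument must be separated
     * from its option by a space ("-t 20" works, "-t20" does not).
     */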
    /* CHECKS */
    if((bBuildOwsOwnIndex == 1) &&
       ((bDontIndexPages == 1) || (bTesting == 1) || (bUpdate == 1)))
        usage("Wrong mix of arguments");

    /* parse the config file */
    if(ReadConfFile(sConfFilePath) == 0)
        return 0;

    if(scan_mode == 0)
        usage("Scan mode undefined");

    /* scan_mode 2: indexed search */
    if(scan_mode == 2)
    {
        MYSQL mysql;

        printf("Scan Mode: \tIndexed\r\n");
        printf("Query: \t%s\r\n", sUserQuery);
        printf("Surfing the DB...\r\n");

        if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2, &mysql, MYSQLSERVER_PORT2) == 0)
        {
            fprintf(stderr, "Failed to connect to database: Error: %s\n", mysql_error(&mysql));
            return 0;
        }

        return IndexedSearch(&mysql, sUserQuery);
    }
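    /*
     * The two branches below (scan_mode 1 and 3) share the same preamble:
     * prepend "http://" when the scheme is missing, URL-decode the result
     * with unencode() -- which, as used here, appears to take the source
     * range and a destination buffer -- and hand the cleaned URL to
     * ParseUrl() to fill the sHost structure.
     */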
    /* scan_mode 1: crawl starting from the given URL */
    if(scan_mode == 1)
    {
        starturlTmp = (char *)malloc(MAXURLSIZE);

        if(strnicmp(starturl, "http://", 7) != 0)
        {
            /* snprintf() bounds the copy; the original strncpy/sprintf pair
               could overflow starturl by the length of "http://" */
            snprintf(starturlTmp, MAXURLSIZE, "http://%s", starturl);
            strcpy(starturl, starturlTmp);
        }

        unencode(starturl, starturl + strlen(starturl) + 1, starturlTmp);
        strcpy(starturl, starturlTmp);
        FREE(starturlTmp);

        if(ParseUrl(starturl, &currentHst, NULL) == -1)
            usage("Wrong start URL");

        strncpy(currentHst.Description, starturl, MIN(strlen(starturl), MAXDESCRIPTIONSIZE - 1));

        /* initialize the crawler and print a banner */
        if(InitCrawler(currentHst) == -1)
        {
            printf("\nSome error occurred while trying to initialize the crawler!\n\n");
            return 0;
        }

        return CrawlerMainLoop(currentHst);
    }

    /* scan_mode 3: build the OWS own index (OOI) */
    if(scan_mode == 3)
    {
        /* DWORD, GetTickCount() and SetConsoleTitle() are Win32 names; on
           Linux they are presumably provided by the project's portability
           headers */
        DWORD tStart;

        /* connect to the MySQL servers */
        if(InitMysql() == -1)
            return 0;

        SetConsoleTitle("building OOI...");
        tStart = GetTickCount();

        if(starturl[0] != 0)
        {
            starturlTmp = (char *)malloc(MAXURLSIZE);

            if(strnicmp(starturl, "http://", 7) != 0)
            {
                snprintf(starturlTmp, MAXURLSIZE, "http://%s", starturl);
                strcpy(starturl, starturlTmp);
            }

            unencode(starturl, starturl + strlen(starturl) + 1, starturlTmp);
            strcpy(starturl, starturlTmp);
            FREE(starturlTmp);

            if(ParseUrl(starturl, &currentHst, NULL) == -1)
                usage("Wrong start URL");

            BuildOwsOwnIndex(&currentHst, 1); /* single host */
        }
        else
            BuildOwsOwnIndex(NULL, 2); /* all un-indexed pages */

        printf("\r\nBuilt OOI in %i ms\r\n\r\n", (int)(GetTickCount() - tStart));
    }

    return 1;
}

/* EOF */