
📄 openwebspider-0.7.c

📁 Web crawler program
💻 C
/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.7
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *  Compile with
 *  + Linux:   $ gcc openwebspider-0.7.c -o openwebspider `mysql_config --cflags --libs` -lpthread -ldl -rdynamic -Wall
 *             - mysql-devel needed
 *  + Windows: Microsoft Visual C++ 6.0
 *
 *  Web Site: http://www.openwebspider.org/
 *
 *  FAQ about Robots and Search engines here: http://www.robotstxt.org/wc/faq.html
 *
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#define AUTHOR          Shen139 AND Fantin
#define VERSION         "0.7"
#define DBVERSION       1

#define USE_REGEX

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <time.h>
#include <malloc.h>
#include <ctype.h>
#include <stdarg.h>

#include "regex.h"

#ifdef WIN32
  #include <process.h>
  #include <windows.h>
  #include "snprintf.c"
  #pragma comment(lib,"libmySQL.lib")
  #include "mysql/mysql.h"
#else /* linux */
  #define _MULTI_THREADED
  #include <pthread.h>
  #include <sched.h>
  #include <sys/time.h>
  #include <unistd.h>
  #include <mysql/mysql.h>
  #include <sys/types.h>
  #include <netinet/in.h>
  #include <netdb.h>
  #include <dlfcn.h>
  #include <string.h>
  #include <sys/socket.h>
  #include <arpa/inet.h>
#endif

#include "functions.h"
#include "mymutex.h"
#include "list.h"
#include "hstlist.h"
#include "htmlfnct.h"
#include "socket.h"
#include "sqlfnct.h"
#include "getopt.h"
#include "thread.h"
#include "misc.h"
#include "robots.h"
#include "rank.h"
#include "urlfunct.h"
#include "temptable.h"
#include "modules.h"
#include "strfnct.h"
#include "server.h"
#include "search.h"
#include "parse_conf.h"
#include "indexer.h"

/* print usage information and the given error message, then exit */
int usage(char *txt)
{
    printf("\n\nUsage: openwebspider [Arguments]\r\n");
    printf("Arguments:\r\n");
    printf("-I [Search string] (*) (Search the word or the words (between \"double-quotes\") from the database)\r\n");
    printf(" or\r\n");
    printf("-O [hostname (optional)] (Build OWS own index on all un-indexed pages or on a single hostname if specified)\r\n");
    printf(" or\r\n");
    printf("-i [start url] (*) (Start indexing pages starting from the passed url)\r\n");
    printf("-t [Number of threads] (Default: 20)\r\n");
    printf("-s (Single Host Mode)\r\n");
    printf("-m [Limits the maximum level of depth in the tree of the pages] [Default: 0 (No limit)]\r\n");
    printf("-l (Limits the maximum number of pages indexed per site) [Default: 0 (No limit)]\r\n");
    printf("-c (Limits the maximum number of seconds per site) [Default: 0 (No limit)]\r\n");
    printf("-b (Limits the maximum number of bytes downloaded per site) [Default: 0 (No limit)]\r\n");
    printf("-E (Limits the maximum number of error codes received when downloading a page) [Default: 0 (No limit)]\r\n");
    printf("-e (Doesn't add external hosts)\r\n");
    printf("-F (Free indexing mode)\r\n");
    printf("-x (Saves a cache of the html page (full html)) (slow)\r\n");
    printf("-z (Saves a compressed cache of the html page (full html)) (slow)\r\n");
    printf("-f [module] (Import loadable functions from the library)\r\n");
    printf("-X [eXtension(s)] (Set all the extensions that openwebspider must consider (eg. -X pdf,swf))\r\n");
    printf("-u (Index only new pages (Update))\r\n");
    printf("-T (Testing Mode) No data (pages and rels) will be written into the DB\r\n");
    printf("-r [0-1-2] (Saves relationships between pages (Default: 1))\r\n    0: doesn't save relationships\r\n    1: saves only relationships between hosts\r\n    2: saves all relationships (between hosts and pages)\r\n");
    printf("-n (No index pages) Don't index pages\r\n");
    printf("-d [0-%i ms (Crawl Delay)] (Default: 0)\r\n",MAXCRAWLDELAY*1000);
    printf("-S [TCP PORT] (Act as a server to get commands)\r\n");
    printf("-o [OpenWebSpider Own Index] (Build the OWS own index)\r\n");
    printf("--\r\n-p [path] (specify the full path of the configuration file (eg.: \"/etc/openwebspider/openwebspider.conf\"))\r\n");
    printf("\r\n(*) Arguments needed\r\n");

    fprintf(stderr,"\r\n\r\nERROR: %s\r\n\r\n",txt);

    exit(0);
}

/* signal handler: exit immediately on SIGTERM, otherwise ask the crawler to stop */
void sigdie(int a)
{
    printf("\r\n\r\nCaught signal n.%i\r\n\r\n",a);

    if(a==15)   /* SIGTERM */
    {
        printf("\r\nExiting...\r\n");
        exit(0);
    }

    iQuit=1;

    return;
}

int main(int argc, char *argv[])
{
    struct sHost currentHst;
    char starturl[MAXURLSIZE], *starturlTmp;
    int c;
    extern int optind;
    char sUserQuery[MAXUSERQUERYSIZE];
    char sConfFilePath[MAXURLSIZE];

    printf("OpenWebSpider(v%s)\r\n  Developed by Stefano Alimonti And Stefano Fantin\r\n\r\n",VERSION);

    if(argc<2)
        usage("Too few arguments");

    memset(starturl,0,MAXURLSIZE);
    memset(sConfFilePath,0,MAXURLSIZE);
    memset(&CustomExtensions,0,sizeof(CustomExtensions));

    CRAWLER_LIMITS.nMaxPagesPerSite    = 0;
    CRAWLER_LIMITS.nMaxDepthLevel      = 0;
    CRAWLER_LIMITS.nMaxSecondsPerSite  = 0;
    CRAWLER_LIMITS.nMaxBytesPerSite    = 0;
    CRAWLER_LIMITS.nMaxErrorPerSite    = 0;

    /* options that take a value read argv[optind] by hand and advance optind */
    while ((c = getopt(argc, argv, "IisrtmTelxRfXuzdFnSpocbEO")) != -1)
    switch (c)
    {
        case 'I':               // indexed search
            if(scan_mode!=0)
                usage("(-I): Scan Mode redefinition");
            scan_mode=2;
            if(optind>=argc)
                usage("(-I): Not enough arguments");
            if(strlen(argv[optind])>MAXUSERQUERYSIZE-1)
                usage("(-I): Query too long");
            else
            {
                strncpy(sUserQuery,argv[optind],MAXUSERQUERYSIZE-1);
                optind++;
            }
        break;
        case 'O':               // build OOI (expected argc = 2 or 3)
            if(argc > 3)
                usage("(-O): Too many arguments");
            if(scan_mode!=0)
                usage("(-O): Scan Mode redefinition");

            if(argc == 3)
            {
                if(strlen(argv[2])>MAXHOSTSIZE-1)
                    usage("(-O): Hostname too long");
                else
                    strncpy(starturl,argv[2],MAXHOSTSIZE-1);
            }
            scan_mode=3;
        break;
        case 'i':               // index pages
            if(scan_mode!=0)    // at startup scan_mode==0 => uninitialized
                usage("(-i): Scan Mode redefinition");
            scan_mode=1;
            if(optind>=argc)
                usage("(-i): Not enough arguments");
            if(strlen(argv[optind])>MAXURLSIZE-1)
                usage("(-i): URL too long");
            else
            {
                strncpy(starturl,argv[optind],MAXURLSIZE-1);
                optind++;
            }
        break;
        case 'f':               // load library (module)
            if(optind>=argc)
                usage("(-f): Not enough arguments");
            if(strlen(argv[optind])>MAXPAGESIZE-1)
                usage("(-f): File name too long");
            else
            {
                myLoadModules(argv[optind],modHandler);
                optind++;
            }
        break;
        case 's':               // single host mode
            starthostonly=1;
        break;
        case 'r':               // relationships
            if(optind>=argc)
                usage("(-r): Not enough arguments");
            if(strcmp(argv[optind],"0")==0 || strcmp(argv[optind],"1")==0 || strcmp(argv[optind],"2")==0)
                nRelationships=atoi(argv[optind]);
            else
                usage("(-r): Range value 0,1,2");
            optind++;
        break;
        case 't':               // number of threads
            if(optind>=argc)
                usage("(-t): Not enough arguments");
            nThread=atoi(argv[optind]);
            if(nThread>MAXTHREAD)
                usage("(-t): Too many threads");
            if(nThread<1)
                usage("(-t): At least one thread");
            optind++;
        break;
        case 'm':               // maximum level of depth
            if(optind>=argc)
                usage("(-m): Not enough arguments");
            CRAWLER_LIMITS.nMaxDepthLevel=atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxDepthLevel<0)
                usage("(-m): Wrong level of depth");
            optind++;
        break;
        case 'l':               // maximum pages per site
            if(optind>=argc)
                usage("(-l): Not enough arguments");
            CRAWLER_LIMITS.nMaxPagesPerSite=atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxPagesPerSite<0)
                usage("(-l): Wrong value for maximum number of pages per site");
            optind++;
        break;
        case 'c':               // maximum seconds per site
            if(optind>=argc)
                usage("(-c): Not enough arguments");
            CRAWLER_LIMITS.nMaxSecondsPerSite=atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxSecondsPerSite<0)
                usage("(-c): Wrong value for maximum number of seconds per site");
            optind++;
        break;
        case 'b':               // maximum bytes downloaded per site
            if(optind>=argc)
                usage("(-b): Not enough arguments");
            CRAWLER_LIMITS.nMaxBytesPerSite=atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxBytesPerSite<0)
                usage("(-b): Wrong value for maximum number of bytes per site");
            optind++;
        break;
        case 'E':               // maximum errors per site
            if(optind>=argc)
                usage("(-E): Not enough arguments");
            CRAWLER_LIMITS.nMaxErrorPerSite=atoi(argv[optind]);
            if(CRAWLER_LIMITS.nMaxErrorPerSite<0)
                usage("(-E): Wrong value for maximum number of error codes");
            optind++;
        break;
        case 'x':               // save HTML cache
        case 'z':               // save compressed HTML cache
            xCacheHtml=1;
            if(c=='z')
                xCacheHtmlCompressed=1;
        break;
        case 'S':               // act as a server
            if(optind>=argc)
                usage("(-S): Not enough arguments");
            actAsAServerPort=atoi(argv[optind]);
            if(actAsAServerPort<1)
                usage("(-S): TCP PORT must be an integer");
            optind++;
        break;
        case 'p':               // path of openwebspider.conf
            if(optind>=argc)
                usage("(-p): Not enough arguments");
            if(strlen(argv[optind])>MAXURLSIZE-1)
                usage("(-p): Path too long");
            else
            {
                strncpy(sConfFilePath,argv[optind],MAXURLSIZE-1);
                optind++;
            }
        break;
        case 'n':               // do not index pages
            bDontIndexPages=1;
        break;
        case 'T':               // test (doesn't write data to the DB)
            bTesting=1;
        break;
        case 'e':               // doesn't add external hosts
            bAddExternalHost=1;
        break;
        case 'u':               // update: index only new pages
            bUpdate=1;
        break;
        case 'F':               // free indexing mode
            bFreeIndexingMode=1;
        break;
        case 'o':               // OWS Own Index
            bBuildOwsOwnIndex=1;
        break;
        case 'X':               // custom extensions (under construction)
            if(optind>=argc)
                usage("(-X): Not enough arguments");
            if(strlen(argv[optind])>MAXCUSTOMEXTENSIONSIZE-1)
                usage("(-X): Custom extensions argument too long");
            else
            {
                /* split the comma-separated list of extensions */
                char *pExt;
                int c=0;

                pExt = strtok(argv[optind],",");
                while(pExt != NULL)
                {
                    if(c>MAXCUSTOMEXTENSIONS)
                        break;

                    if(strlen(pExt)<MAXEXTENSIONSIZE)
                    {
                        strcpy(CustomExtensions[c++],pExt);
                    }
                    pExt = strtok(NULL, ",");
                }
                optind++;
            }
        break;
        case 'd':               // crawl delay
            if(optind>=argc)
                usage("(-d): Not enough arguments");
            iCrawlDelay=atoi(argv[optind]);
            if(iCrawlDelay>MAXCRAWLDELAY*1000 || iCrawlDelay<0)
                usage("(-d): Wrong Crawl Delay");
            optind++;
        break;
        default:
            usage("Unknown option argument");
    }

    /*
    CHECKs
    */
    if( (bBuildOwsOwnIndex == 1)  &&
       ((bDontIndexPages == 1)    ||
        (bTesting == 1)           ||
        (bUpdate == 1)    )        )
            usage("Wrong mix of arguments");

    /*
    Parse Config File
    */
    if(ReadConfFile(sConfFilePath)==0)
        return 0;

    /***************************************************************/

    if(scan_mode==0)
        usage("Scan mode undefined");

    /*********************************/

    if(scan_mode==2)
    {
        MYSQL mysql;

        printf("Scan Mode:       \tIndexed\r\n");
        printf("Query:           \t%s\r\n",sUserQuery);
        printf("Surfing the DB...\r\n");

        if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2, &mysql, MYSQLSERVER_PORT2)==0)
        {
            fprintf(stderr, "Failed to connect to database: Error: %s\n",mysql_error(&mysql));
            return 0;
        }

        return( IndexedSearch(&mysql,sUserQuery) );
    }

    /***********************/

    if(scan_mode==1)
    {
        starturlTmp=(char*)malloc(MAXURLSIZE);

        /* prepend "http://" if the start URL has no scheme */
        if(strnicmp(starturl,"http://",7)!=0)
        {
            strncpy(starturlTmp,starturl,strlen(starturl)+7);
            sprintf(starturl, "http://%s",starturlTmp);
        }

        unencode(starturl,starturl+strlen(starturl)+1,starturlTmp);
        strcpy(starturl,starturlTmp);

        FREE(starturlTmp);

        if(ParseUrl(starturl,&currentHst,NULL)==-1)
            usage("Wrong start URL");

        strncpy(currentHst.Description,starturl,MIN(strlen(starturl),MAXDESCRIPTIONSIZE-1));

        /* initialize the crawler and print a banner */
        if(InitCrawler(currentHst) == -1)
        {
            printf("\nSome error occurred when trying to initialize the crawler!\n\n");
            return 0;
        }

        return CrawlerMainLoop(currentHst);
    }

    if(scan_mode == 3)
    {
        DWORD tStart;

        /* build OOI (OpenWebSpider Own Index) */

        /* connect to the mysql servers */
        if(InitMysql()==-1)
            return 0;

        SetConsoleTitle("building OOI...");

        tStart = GetTickCount();

        if( starturl[0]!=0 )
        {
            starturlTmp=(char*)malloc(MAXURLSIZE);

            /* prepend "http://" if the hostname has no scheme */
            if(strnicmp(starturl,"http://",7)!=0)
            {
                strncpy(starturlTmp,starturl,strlen(starturl)+7);
                sprintf(starturl, "http://%s",starturlTmp);
            }

            unencode(starturl,starturl+strlen(starturl)+1,starturlTmp);
            strcpy(starturl,starturlTmp);

            FREE(starturlTmp);

            if(ParseUrl(starturl,&currentHst,NULL)==-1)
                usage("Wrong start URL");

            BuildOwsOwnIndex(&currentHst, 1);
        }
        else
            BuildOwsOwnIndex(NULL, 2);

        printf("\r\nBuilt OOI in %i ms\r\n\r\n", (int)(GetTickCount()-tStart));
    }

    return 1;
}

/* EOF */
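A note on the argument parsing above: the option string passed to getopt() contains no ':' modifiers, so getopt() never fills optarg; every option that takes a value instead reads argv[optind] directly and then advances optind by hand. How safe manually advancing optind is depends on the getopt implementation in use (the source includes a local getopt.h rather than relying only on the system one). The stand-alone sketch below is not part of OpenWebSpider; it mirrors that pattern for just the -i and -t options under the assumption of a POSIX getopt(), with a simplified buffer and the documented default of 20 threads.

/*
 * Minimal sketch of the manual-optind pattern used in OpenWebSpider's main():
 * the optstring has no ':' suffixes, so each case that needs a value reads
 * argv[optind] itself and then increments optind. Option names mirror the
 * real -i and -t flags; the buffer size and error handling are simplified.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>     /* getopt(), optind on POSIX systems */

int main(int argc, char *argv[])
{
    int  c;
    int  nThreads = 20;          /* default documented in the usage text */
    char startUrl[256] = {0};    /* simplified stand-in for starturl[MAXURLSIZE] */

    while ((c = getopt(argc, argv, "it")) != -1)   /* no ':' => no automatic optarg */
    {
        switch (c)
        {
            case 'i':            /* -i <start url> */
                if (optind >= argc)
                {
                    fprintf(stderr, "(-i): Not enough arguments\n");
                    return 1;
                }
                strncpy(startUrl, argv[optind], sizeof(startUrl) - 1);
                optind++;        /* consume the value by hand */
            break;
            case 't':            /* -t <number of threads> */
                if (optind >= argc)
                {
                    fprintf(stderr, "(-t): Not enough arguments\n");
                    return 1;
                }
                nThreads = atoi(argv[optind]);
                optind++;
            break;
            default:
                fprintf(stderr, "Unknown option\n");
                return 1;
        }
    }

    printf("start url: %s, threads: %i\n", startUrl, nThreads);
    return 0;
}

Run as, for example, ./sketch -i www.example.com -t 5: because each case body consumes the value before the next getopt() call, getopt() itself only ever sees the flag elements.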
