⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 options.h

📁 网页抓取程序
💻 H
字号:
/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.7
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/*
 * options.h - compile-time limits, global switches and lookup tables.
 *
 * NOTE: this header DEFINES (not merely declares) global objects, several
 * of them with initializers.  It must therefore be included from exactly
 * one translation unit, otherwise the linker will see duplicate symbols.
 *
 * The types MYSQL and (when USE_REGEX is defined) regex_t, as well as
 * pthread_t / struct sockaddr_in on non-Windows builds, must already be
 * declared by the including file before this header is pulled in.
 */

#ifndef __OPTIONS
#define __OPTIONS

#ifdef WIN32
  #pragma comment(lib,"WS2_32.lib")
#else
  /*
   * Non-Windows spellings of the Win32 type names used by the sources.
   * They are plain textual macros, so `LPSOCKADDR a, b;` makes only `a`
   * a pointer - declare one variable per statement.
   */
  #define SOCKET              int
  #define SOCKADDR_IN         struct sockaddr_in
  #define LPSOCKADDR          struct sockaddr*
  #define SOCKET_ERROR        (-1)
  #define DWORD               long long int
  #define LPVOID              void*
  #define HANDLE              pthread_t
#endif

/* Host */
#define PORT                80
#define MAXHOSTSIZE         100
#define MAXPAGESIZE         255
/* Parenthesised so the sum survives use inside a larger expression. */
#define MAXURLSIZE          (MAXHOSTSIZE + MAXPAGESIZE + 30)

/* Html */
#define MAXPACKETSIZE       200000
#define MAXTAGSIZE          20
#define MAXDESCRIPTIONSIZE  255
#define MAXTAGLENGTH        10000
#define MAXHTTPSTATUSSIZE   50

/* robots.txt */
#define MAXDISALLOW         1000
#define MAXCRAWLDELAY       999

/* RANK */
#define MAXPRLEV            10

/* SQL */
#define MAXQUERYSIZE        (MAXPACKETSIZE + 50000)
#define MAXUSERQUERYSIZE    200
/* minimum delay between 2 pings */
#define MYSQL_MIN_PING_DELAY 60000

/* Socket */
#define FIRSTTIMEOUT        50000       /* MSeconds */
#define TIMEOUTs            10000       /* MSeconds */

/* Thread && Mutex */
#define MAXMUTEX            10
#define MAXTHREAD           1100
#define NO_BLOCK            (-1)
#define BLOCKTHRDHST        0
#define BLOCKDB1            1
#define BLOCKINDEX          2
#define BLOCKEXH            3
#define BLOCKEXCRAWL        4
#define AVGTHREADDELAY      100000      /* 100 seconds */

/* External modules */
#define MAXMSGERRORSIZE     1000

/* OWS Server */
#define OWSSERVERMAXLOGINS  10
#define MAXCOMMANDSIZE      1000
#define MAXKEYWORDSIZE      10
#define MAXARGUMENTSIZE     (MAXCOMMANDSIZE - MAXKEYWORDSIZE - 1)
/*
 * Expands to a SEND() call on a variable named `sock`; both SEND and `sock`
 * must be visible at the point of use.
 * NOTE(review): the message opens a second <div> instead of closing the
 * first one - looks like a typo for </div>; left untouched because it is
 * runtime output.
 */
#define __SERVR_COMMANDERR  SEND(sock, "\r\n<div align='center'>Command not understood<div align='center'>\r\n")

/* Misc */
#define MAXKEYSIZE                  20
#define MAXEXTERNALNODE             1000
#define MAXOUTPUTLINE               500
#define MAXREGULAREXPRESSIONSIZE    100
#define MAXCUSTOMEXTENSIONS         10
#define MAXCUSTOMEXTENSIONSIZE      50
#define MAXEXTENSIONSIZE            10

/* Parse Config File */
#define MAXCONFKEYSIZE      100
#define MAXCONFARGSIZE      100

/* Encoding */
#define UTF8_ENCODING       0
#define ASCII_ENCODING      1

/* OWS index */
#define OWSINDEXMINWORDSIZE  1
#define OWSINDEXMAXWORDSIZE  30
#define LEXICONWORDSIZE      2000
/* store the index to the DB every OWSINDEXMAXSWAPDELAY pages */
#define OWSINDEXMAXSWAPDELAY 60
#define INDEXERTOKENS        " ,.;:-_@#!\"\'\\/<>^[]{}()\r\n\t*%$&=+-|!?"

unsigned int lexicon_number_of_elements;
unsigned int lexicon_actual_size;

int nThread = 20;

/* Connection settings for the two MySQL databases and the OWS server. */
char DB1[MAXCONFARGSIZE];
char DB2[MAXCONFARGSIZE];
char MYSQLSERVER1[MAXCONFARGSIZE];
char MYSQLSERVER2[MAXCONFARGSIZE];
char USERDB1[MAXCONFARGSIZE];
char USERDB2[MAXCONFARGSIZE];
char PASSDB1[MAXCONFARGSIZE];
char PASSDB2[MAXCONFARGSIZE];
unsigned int MYSQLSERVER_PORT1;
unsigned int MYSQLSERVER_PORT2;
char OWS_SERVER_PASSWORD[MAXCONFARGSIZE];

/* One host/page record handled by the crawler. */
typedef struct sHost
{
    char Host[MAXHOSTSIZE];
    char Page[MAXPAGESIZE];
    char Description[MAXDESCRIPTIONSIZE];
    unsigned short int port;
    unsigned short int type;
    unsigned short int viewed;
    unsigned short int level;
    unsigned int host_id;
} SHOST;

/* Current Host */
struct sHost IndexingHost;

/* Global MySQL */
MYSQL gMysqlDB1;
MYSQL gMysqlDB2;
char  gTable[20];

/* global Mutex */
volatile unsigned long hMutex[MAXMUTEX];

/*
 * global Status of Threads
 *
 * thrdStatus[]==0  -> Thread is alive
 * thrdStatus[]==1  -> Thread is dead
 * ....
 */
DWORD thrdStatus[MAXTHREAD];

/* Socket plus peer address of one accepted connection. */
typedef struct sHandleConnection
{
    SOCKET sock;
    SOCKADDR_IN client;
} SHC;

unsigned int bytesDownloaded = 0;
unsigned int nErrorPages     = 0;
char  startTime[10];
DWORD startTimeMS;

/* SWITCHes */
unsigned int xCacheHtml           = 0;
unsigned int xCacheHtmlCompressed = 0;
unsigned int nPagesViewed         = 0;
unsigned int nRelationships       = 1;
unsigned int bDontIndexPages      = 0;
unsigned int bTesting             = 0;
unsigned int starthostonly        = 0;
unsigned int bFreeIndexingMode    = 0;
unsigned int bUpdate              = 0;
unsigned int actAsAServerPort     = 0;
unsigned int bBuildOwsOwnIndex    = 0;

struct __crawler_limits
{
    unsigned int nMaxPagesPerSite;
    unsigned int nMaxDepthLevel;
    unsigned int nMaxSecondsPerSite;
    unsigned int nMaxBytesPerSite;
    unsigned int nMaxErrorPerSite;
} CRAWLER_LIMITS;

struct __extra_limits
{
    unsigned int nMaxPagesPerSite;
    unsigned int nMaxDepthLevel;
    unsigned int nMaxSecondsPerSite;
    unsigned int nMaxBytesPerSite;
} EXTRA_LIMITS;

/*
 * scan_mode==0    => Real time search      //Deprecated
 * scan_mode==1    => Index
 * scan_mode==2    => Indexed search
 * scan_mode==0xFF => uninitialized
 */
unsigned int scan_mode = 0;

/* SIGNALs */
unsigned int iQuit               = 0;
unsigned int bKillThread         = 0;
unsigned int bKillThreadReserved = 0;
unsigned int iStop               = 0;
unsigned int iDoNextHost         = 0;

/***/
struct sHost *nextHost = NULL;
/***/

unsigned int bSwapping              = 0;
unsigned int bAddExternalHost       = 0;
unsigned int bUseRegularExpressionA = 0;
unsigned int bUseRegularExpressionB = 0;

/* STRUCTUREs */

/*
 * HTML tags whose attributes carry links.
 *
 * bTag = tag name
 * eTag = attribute of that tag
 * flag = 0 : <tag1 attr=123> xyz </tag1> eg.: <a href="/index.php">Home</a>
 *      = 1 : <tag2 attr2="test">         eg.: <base href="http://www.openwebspider.org/">
 *
 * The list ends with the entry whose flag is -1.
 *
 * NOTE(review): the values below give "a" flag 1 and "base" flag 0, which
 * is the opposite of the examples in the legend above - confirm against
 * the parser which of the two is right before relying on either.
 */
struct
{
    char *bTag;
    char *eTag;
    int   flag;
} taglist[] =
{
    { "base",   "href", 0 },
    { "a",      "href", 1 },
    { "ref",    "href", 0 },
    { "area",   "href", 0 },
    { "frame",  "src",  0 },
    { "iframe", "src",  0 },
/* ****EXAMPLE******
    { "img",  "src",        0 },
    { "body", "background", 0 },
   ****EXAMPLE****** */
    { "",       "",     -1 }
};

/* Extension lists; each ends with "\0", i.e. a string whose first byte is NUL. */
const char *PlainTextExtension[] =
{
    ".txt", ".c",
    ".cpp", ".bas",
    ".pas", ".h", ".xml",
    "\0"
};

const char *HtmlExtensions[] =
{
    ".htm",  ".html",
    ".php",  ".asp",
    ".cgi",  ".mspx",
    ".aspx", ".shtml",
    ".pl",   ".phtml",
    ".cfm",  ".ch2",
    ".jsp",  ".msnw",
    ".php3", ".xml", "\0"
};

char CustomExtensions[MAXCUSTOMEXTENSIONS][MAXCUSTOMEXTENSIONSIZE];

#ifdef USE_REGEX
regex_t regexPageFilter;
regex_t regexContentFilter;
#endif

int iLastPing[MAXMUTEX];

char lstRobotsExclusions[MAXDISALLOW][MAXPAGESIZE];
int iRobCrawlDelay = 0;     /* crawl delay taken from robots.txt */
int iCrawlDelay    = 0;     /* crawl delay taken from the program arguments */
int bRobotsOK;

/* module handler */
void *modHandler;

/*
 * Loadable external modules: the names of the module function and of its
 * init function, plus slots for their handles.  The list ends with the
 * entry whose names are "\0".
 */
struct
{
    char *functName;
    char *functInit;
    void *handler;
    void *initHandler;
    unsigned short int isInitialized;
} loadableModules[] =
{
    { "modFilter", "modInitFilter", NULL, NULL, 0 },
    { "\0",        "\0",            NULL, NULL, 0 },
};

/*
 * HTML character entities and their replacements.
 * http://www1.tip.nl/~t876506/utf8tbl.html
 *
 * htmlChar = entity name without the leading '&' and trailing ';'
 * rep      = replacement: a literal string for ASCII_ENCODING entries, or
 *            the UTF-8 byte sequence written as "0x" followed by hex digits
 *            for UTF8_ENCODING entries
 * type     = UTF8_ENCODING (0) or ASCII_ENCODING (1)
 *
 * The list ends with the entry whose htmlChar is NULL.
 */
struct
{
    char *htmlChar;
    char *rep;
    int   type;
} ahList[] =
{
    { "nbsp",   " ",        ASCII_ENCODING },
    { "amp",    "&",        ASCII_ENCODING },
    { "euro",   "0xE282AC", UTF8_ENCODING },
    { "cent",   "0xC2A2",   UTF8_ENCODING },
    { "copy",   "0xC2A9",   UTF8_ENCODING },
    { "trade",  "0xE284A2", UTF8_ENCODING },
    /* if you have problems with these lines please contact me */
    { "Aacute", "0xC381",   UTF8_ENCODING },
    { "aacute", "0xC3A1",   UTF8_ENCODING },
    { "Eacute", "0xC389",   UTF8_ENCODING },
    { "eacute", "0xC3A9",   UTF8_ENCODING },
    { "Iacute", "0xC38D",   UTF8_ENCODING },
    { "iacute", "0xC3AD",   UTF8_ENCODING },
    { "Oacute", "0xC393",   UTF8_ENCODING },
    { "oacute", "0xC3B3",   UTF8_ENCODING },
    { "Uacute", "0xC39A",   UTF8_ENCODING },
    { "uacute", "0xC3BA",   UTF8_ENCODING },
    { "Agrave", "0xC380",   UTF8_ENCODING },
    { "agrave", "0xC3A0",   UTF8_ENCODING },
    { "Egrave", "0xC388",   UTF8_ENCODING },
    { "egrave", "0xC3A8",   UTF8_ENCODING },
    { "Igrave", "0xC38C",   UTF8_ENCODING },
    { "igrave", "0xC3AC",   UTF8_ENCODING },
    { "Ograve", "0xC392",   UTF8_ENCODING },
    { "ograve", "0xC3B2",   UTF8_ENCODING },
    { "Ugrave", "0xC399",   UTF8_ENCODING },
    { "ugrave", "0xC3B9",   UTF8_ENCODING },
    { "Acirc",  "0xC382",   UTF8_ENCODING },
    { "acirc",  "0xC3A2",   UTF8_ENCODING },
    { "Ecirc",  "0xC38A",   UTF8_ENCODING },
    { "ecirc",  "0xC3AA",   UTF8_ENCODING },
    { "Icirc",  "0xC38E",   UTF8_ENCODING },
    { "icirc",  "0xC3AE",   UTF8_ENCODING },
    { "Ocirc",  "0xC394",   UTF8_ENCODING },
    { "ocirc",  "0xC3B4",   UTF8_ENCODING },
    { "Ucirc",  "0xC39B",   UTF8_ENCODING },
    { "ucirc",  "0xC3BB",   UTF8_ENCODING },
    { "Auml",   "0xC384",   UTF8_ENCODING },
    /* U+00E4 encodes as C3 A4; the former value C5 A0 is U+0160. */
    { "auml",   "0xC3A4",   UTF8_ENCODING },
    { "Euml",   "0xC38B",   UTF8_ENCODING },
    { "euml",   "0xC3AB",   UTF8_ENCODING },
    { "Iuml",   "0xC38F",   UTF8_ENCODING },
    { "iuml",   "0xC3AF",   UTF8_ENCODING },
    { "Ouml",   "0xC396",   UTF8_ENCODING },
    { "ouml",   "0xC3B6",   UTF8_ENCODING },
    { "Uuml",   "0xC39C",   UTF8_ENCODING },
    { "uuml",   "0xC3BC",   UTF8_ENCODING },
    { "Aring",  "0xC385",   UTF8_ENCODING },
    { "aring",  "0xC3A5",   UTF8_ENCODING },
    { "AElig",  "0xC386",   UTF8_ENCODING },
    { "aelig",  "0xC3A6",   UTF8_ENCODING },
    { "Ccedil", "0xC387",   UTF8_ENCODING },
    { "ccedil", "0xC3A7",   UTF8_ENCODING },
    { NULL,     NULL,       0 }
};

/* Argument bundle handed to the functions of a loadable module. */
typedef struct functArg
{
    struct sHost *hostInfo;
    char *html;
    unsigned int htmlLength;
    char *text;
    unsigned int textLength;
    int PagesViewed;
    long int bytesDownloaded;
    void *mysqlDB1;
    void *mysqlDB2;
} FUNCTION_ARGUMENT;

#endif

/*EOF*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -