⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 global.h

📁 larbin是一种开源的网络爬虫/网络蜘蛛
💻 H
字号:
// Larbin// Sebastien Ailleret// 18-11-99 -> 07-03-02/* This class contains all global variables */#ifndef GLOBAL_H#define GLOBAL_H#include <time.h>#include <sys/types.h>#include <sys/socket.h>#include <sys/poll.h>#include <adns.h>#include "fetch/file.h"#include "fetch/hashTable.h"#include "utils/hashDup.h"#include "utils/url.h"#include "utils/Vector.h"#include "utils/string.h"#include "utils/PersistentFifo.h"#include "utils/ConstantSizedFifo.h"#include "utils/SyncFifo.h"#include "utils/Fifo.h"#include "fetch/site.h"#include "fetch/checker.h"#define addIPUrl() global::IPUrl++#define delIPUrl() global::IPUrl--/** This represent a connection : we have a fixed number of them * fetchOpen links them with servers * fetchPipe reads those which are linked */struct Connexion {  char state;      // what about this socket : EMPTY, CONNECTING, WRITE, OPEN  int pos;         // What part of the request has been sent  FetchError err;  // How did the fetch terminates  int socket;      // number of the fds  int timeout;  // timeout for this connexion  LarbinString request;  // what is the http request  file *parser;    // parser for the connexion (a robots.txt or an html file)  char buffer[maxPageSize];  /** Constructor */  Connexion ();  /** Dectructor : it is never used since we reuse connections */  ~Connexion ();  /** Recycle a connexion   */  void recycle ();};struct global {  /** Constructor : see global.cc for details */  global (int argc, char * argv[]);  /** Destructor : never used */  ~global ();  /** current time : avoid to many calls to time(NULL) */  static time_t now;  /** List of pages allready seen (one bit per page) */  static hashTable *seen;#ifdef NO_DUP  /** Hashtable for suppressing duplicates */  static hashDup *hDuplicate;#endif // NO_DUP  /** URLs for the sequencer with high priority */  static SyncFifo<url> *URLsPriority;  static SyncFifo<url> *URLsPriorityWait;  static uint readPriorityWait;  /** This one has a lower priority : see fetch/sequencer.cc */  static PersistentFifo *URLsDisk;  static PersistentFifo *URLsDiskWait;  static uint readWait;  /** hashtables of the site we accessed (cache) */  static NamedSite *namedSiteList;  static IPSite *IPSiteList;  /** Sites which have at least one url to fetch */  static Fifo<IPSite> *okSites;  /** Sites which have at least one url to fetch   * but need a dns call   */  static Fifo<NamedSite> *dnsSites;  /** Informations for the fetch   * This array contain all the connections (empty or not)   */  static Connexion *connexions;  /** Internal state of adns */  static adns_state ads;  /* Number of pending dns calls */  static uint nbDnsCalls;  /** free connection for fetchOpen : connections with state==EMPTY */  static ConstantSizedFifo<Connexion> *freeConns;#ifdef THREAD_OUTPUT  /** free connection for fetchOpen : connections waiting for end user */  static ConstantSizedFifo<Connexion> *userConns;#endif  /** Sum of the sizes of a fifo in Sites */  static Interval *inter;  /** How deep should we go inside a site */  static int8_t depthInSite;  /** Follow external links ? */  static bool externalLinks;  /** how many seconds should we wait beetween 2 calls at the same server    * 0 if you are only on a personnal server, >=30 otherwise   */  static time_t waitDuration;  /** Name of the bot */  static char *userAgent;  /** Name of the man who lauch the bot */  static char *sender;  /** http headers to send with requests    * sends name of the robots, from field...   */  static char *headers;  static char *headersRobots;  // used when asking a robots.txt  /* internet address of the proxy (if any) */  static sockaddr_in *proxyAddr;  /** connect to this server through a proxy using connection conn    * return >0 in case of success (connecting or connected), 0 otherwise   */  static char getProxyFds (Connexion *conn);  /** Limit to domain */  static Vector<char> *domains;  /** forbidden extensions   * extensions which are allways to avoid : .ps, .pdf...   */  static Vector<char> forbExt;  /** number of parallel connexions   * your kernel must support a little more than nb_conn file descriptors   */  static uint nb_conn;  /** number of parallel dns calls */  static uint dnsConn;  /** number of urls in IPSites */  static int IPUrl;  /** port on which is launched the http statistic webserver */  static unsigned short int httpPort;  /** port on which input wait for queries */  static unsigned short int inputPort;  /** parse configuration file */  static void parseFile (char *file);  /** read the domain limit */  static void manageDomain (char **posParse);  /** read the forbidden extensions */  static void manageExt (char **posParse);  /////////// POLL ///////////////////////////////////  /** array used by poll */  static struct pollfd *pollfds;  /** pos of the max used field in pollfds */  static uint posPoll;  /** size of pollfds */  static uint sizePoll;  /** array used for dealing with answers */  static short *ansPoll;  /** number of the biggest file descriptor */  static int maxFds;  /** make sure the new socket is not too big for ansPoll */  static void verifMax (int fd);#ifdef MAXBANDWIDTH  /** number of bits still allowed during this second */  static long int remainBand;#endif // MAXBANDWIDTH};/** set this fds for next poll */#define setPoll(fds, event) \  global::pollfds[global::posPoll].fd = fds; \  global::pollfds[global::posPoll].events = event; \  global::posPoll++#endif // GLOBAL_H

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -