⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doc.h

📁 网络爬虫程序
💻 H
字号:
/***************************************************************************//*    This code is part of WWW grabber called pavuk                        *//*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          *//*    Distributed under GPL 2 or later                                     *//***************************************************************************/#ifndef _doc_h_#define _doc_h_#include <sys/types.h>#include <sys/time.h>#include <time.h>#include "url.h"#include "tools.h"#include "bufio.h"#include "myssl_openssl.h"#include "myssl_nss.h"typedef struct{  bool_t log;  char *msg;} doc_msg;typedef struct{  /*******************************/  /* Basic document infos        */  /*******************************/  int doc_nr;                    /*** number of document in queue ***/  url *doc_url;                  /*** URL of document ***/  char *mime;                    /*** MIME header of document ***/  char *type_str;                /*** MIME type of document ***/  bool_t is_parsable;            /*** documemt dedicated for parsing ? (html,css,scripts) ***/  ssize_t size;                  /*** document size readed ***/  ssize_t totsz;                 /*** total size of document if known ***/  ssize_t origsize;              /*** original size of document ***/  char *contents;                /*** document content ***/  ssize_t rest_pos;              /*** restart position ***/  ssize_t rest_end_pos;          /*** restart end position ***/  time_t dtime;                  /*** creation time of document ***/  time_t stime;                  /*** time of document request ***/  time_t origtime;               /*** modification time of local copy of document */  int errcode;                   /*** error code for current doc ***/  /*******************************/  /* temporary document infos    */  /*******************************/  char *lock_fn;                 /*** lock file name for document ***/  bufio *datasock;               /*** socket for data connection ***/  bufio *s_sock;                 /*** file where is going online saved data ***/  /*******************************/  /* downloading control flags   */  /*******************************/  bool_t load;                   /*** load document into ->contents ***/  bool_t doreget;                /*** continue broken transfer ***/  bool_t report_size;            /*** show progressmeter while loading of document ***/  bool_t check_limits;           /*** check limits for adding URLs from this document ***/  bool_t remove_lock;            /*** remove lock file ***/  bool_t save_online;            /*** document should be saved online to file ***/  bool_t is_robot;               /*** redirection of robots.txt documents is handled differently ! ***/  bool_t is_http_transfer;       /*** is going current transfer over HTTP protocol ? ***/  /*******************************/  /* HTTP connection info        */  /*******************************/  int request_type;              /*** HTTP request method used for this document ***/  unsigned short http_proxy_port;/*** used HTTP proxy port ***/  char *http_proxy;              /*** used HTTP proxy ***/  bool_t http_proxy_10;          /*** is this HTTP/1.0 proxy ***/  char *etag;                    /*** ETag or Last-Modified for conditional partial HTTP GET method ***/  char *connect_host;            /*** hostname for CONNECT request ***/  unsigned short connect_port;   /*** portnumber for CONNECT request ***/  /*******************************/  /* informations for HTTP/1.1   */  /* (chunked encoding)          */  /*******************************/  bool_t is_http11;              /*** we are talking with HTTP/1.1 server ***/  ssize_t chunk_size;            /*** for HTTP/1.1 chunked transfer encoding ***/  bool_t is_chunked;             /*** current document is encoded with chunked encoding ***/  bool_t read_chunksize;         /*** in next read we expect chunksize header ***/  bool_t read_trailer;           /*** in next read we expect trailer header ***/  bool_t is_persistent;          /*** is current HTTP connection persistent                                      (and should leave or not) ***/  /*******************************/  /* HTTPS SSL connection info   */  /* (used also by FTPS datacon) */  /* need for persistent SSLID   */  /*******************************/#ifdef USE_SSL  ssl_connection ssl_data_con;#endif  /*******************************/  /* HTTP auth informations      */  /*******************************/  short num_auth;                /*** number of attempts to authenticate ***/  short num_proxy_auth;          /*** number of attempts to authenticate with proxy ***/  void *auth_digest;             /*** HTTP digest access authentification info ***/  void *auth_proxy_digest;       /*** HTTP digest access proxy authentification info ***/  char *additional_headers;      /*** additional headers (currently required by NTLM) ***/  /*******************************/  /* FTP connection info         */  /*******************************/  bool_t ftp_fatal_err;          /*** was FTP error fatal ? ***/  short ftp_respc;               /*** last FTP response code ***/  bufio *ftp_control;            /*** socket for FTP control connection ***/  char *ftp_pasv_host;           /*** info for passive data connection ***/  unsigned short ftp_pasv_port;  bool_t ftp_data_con_finished;  /*** FTP data connection was just fully established ***/  /*******************************/  /* progress meter informations */  /*******************************/#ifdef HAVE_GETTIMEOFDAY  struct timeval start_time;     /*** for progress metter ***/  struct timeval hr_start_time;  /*** when documant processing started ***/  struct timeval redirect_time;  /*** when all redirects finished ***/  struct timeval dns_time;       /*** when dns lookup finished ***/  struct timeval connect_time;   /*** when connect(2) finished ***/  struct timeval first_byte_time;/*** when first byte was received ***/  struct timeval end_time;       /*** when download finished ***/#else  time_t start_time;             /*** for progress metter ***/#endif  ssize_t current_size;          /*** size for current speed ***/  ssize_t adj_sz;                /*** adjustment of doc size (HTTP header size ) ***/  /*******************************/  /* per thread informations     */  /*******************************/#ifdef HAVE_MT  dllist *msgbuf;                /*** list of buffered messages ***/  int __herrno;                  /*** per document h_errno value ***/  int threadnr;                  /*** number of current thread ***/#endif} doc;extern int doc_download_init(doc *, int);extern int doc_download(doc *, int, int);extern int doc_store(doc *, int);extern int doc_remove(url *);extern int doc_lock(doc *, int);extern time_t doc_etime(doc *, int);extern void doc_init(doc *, url *);extern void doc_cleanup(doc *);extern void doc_destroy(doc *);extern void doc_remove_lock(doc *);extern void doc_update_parent_links(doc *);#ifdef HAVE_MTextern void doc_finish_processing(doc *);#endif#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -