⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doc.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 3 页
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_FSTATVFS
#ifdef HAVE_SYS_STATVFS_H
#include <sys/statvfs.h>
#endif
#else
#ifdef HAVE_FSTATFS
#ifdef HAVE_SYS_STATFS_H
#include <sys/statfs.h>
#endif
#ifdef HAVE_SYS_VFS_H
#include <sys/vfs.h>
#endif
#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif
#endif
#endif
#include <sys/time.h>
#include <time.h>
#include <utime.h>

#include "url.h"
#include "doc.h"
#include "tools.h"
#include "mime.h"
#include "http.h"
#include "ftp.h"
#include "gopher.h"
#include "decode.h"
#include "abstract.h"
#include "mode.h"
#include "times.h"
#include "dinfo.h"
#include "errcode.h"
#include "log.h"
#include "gui_api.h"
#include "html.h"

#ifdef I_FACE
static void doc_set_info(doc *);
#endif
static void show_progress(doc *, ssize_t, int);
static double compute_speed_rate(time_t, ssize_t);

/*
 * Prepare a document structure for a new download.
 *
 * Resets the per-transfer fields of docu, decides whether the transfer
 * is HTTP-style (HTTP, HTTPS, or FTP/gopher tunnelled through an HTTP
 * proxy), picks a proxy for such transfers when docu->http_proxy is not
 * already set, configures output to cfg.dumpfd when that descriptor is
 * in use, and records the high resolution start time.
 *
 * docu - document to initialise
 * load - stored into docu->load; overridden with TRUE when dumping to
 *        cfg.dumpfd with cfg.dump_after set
 *
 * Returns 0 on success, or -1 with docu->errcode set to ERR_STORE_DOC
 * when bufio_dupfd() on the dump descriptor fails.
 */
int doc_download_init(doc * docu, int load)
{
  docu->remove_lock = TRUE;
  docu->lock_fn = NULL;
  docu->is_parsable = cfg.enable_js && (docu->doc_url->status & URL_ISSCRIPT);
  docu->contents = NULL;
  docu->mime = NULL;
  docu->type_str = NULL;
  docu->save_online = FALSE;
  docu->size = 0;
  docu->current_size = 0;
  docu->totsz = -1;
  docu->origsize = 0;
  docu->rest_pos = 0;
  docu->stime = time(NULL);
  docu->s_sock = NULL;
  docu->is_http11 = FALSE;
  docu->is_chunked = FALSE;
  docu->is_persistent = FALSE;
  docu->read_chunksize = FALSE;
  docu->read_trailer = FALSE;
  docu->doreget = FALSE;
  docu->origtime = docu->dtime;
  docu->adj_sz = 0;
  docu->load = load;
  docu->http_proxy_10 = FALSE;
  docu->ftp_data_con_finished = FALSE;
  docu->num_auth = cfg.auth_reuse_nonce ? 1 : 0;
  docu->num_proxy_auth = cfg.auth_reuse_proxy_nonce ? 1 : 0;
  docu->is_http_transfer =
    docu->doc_url->type == URLT_HTTP ||
    docu->doc_url->type == URLT_HTTPS ||
    (docu->doc_url->type == URLT_FTP &&
    priv_cfg.ftp_proxy && cfg.ftp_via_http && !cfg.ftp_dirtyp) ||
    (docu->doc_url->type == URLT_GOPHER &&
    priv_cfg.gopher_proxy && cfg.gopher_via_http);

  /*** just default value, later will be assigned properly ***/
  docu->request_type = HTTP_REQ_UNKNOWN;

  /* choose a proxy only when the caller has not supplied one already */
  if(docu->is_http_transfer && !docu->http_proxy)
  {
    char *proxy = NULL;
    unsigned short port = 0;

    switch (docu->doc_url->type)
    {
    case URLT_HTTP:
      {
        http_proxy *pr = NULL;

        /* the proxy entry is read and copied between LOCK_PROXY and
           UNLOCK_PROXY */
        LOCK_PROXY;
        pr = http_proxy_get();
        if(pr)
        {
          http_proxy_check(pr, docu);
          proxy = tl_strdup(pr->addr);
          port = pr->port;
          docu->http_proxy_10 = (pr->is_10 != 0);
        }
        UNLOCK_PROXY;
      }
      break;
#ifdef USE_SSL
    case URLT_HTTPS:
      if(priv_cfg.ssl_proxy)
      {
        proxy = tl_strdup(priv_cfg.ssl_proxy);
        port = cfg.ssl_proxy_port;
      }
      break;
#endif
    case URLT_FTP:
      if(priv_cfg.ftp_proxy)
      {
        proxy = tl_strdup(priv_cfg.ftp_proxy);
        port = cfg.ftp_proxy_port;
      }
      break;
    case URLT_GOPHER:
      if(priv_cfg.gopher_proxy)
      {
        proxy = tl_strdup(priv_cfg.gopher_proxy);
        port = cfg.gopher_proxy_port;
      }
      break;
    default:
      proxy = NULL;
      port = 0;
      break;
    }
    docu->http_proxy = proxy;
    docu->http_proxy_port = port;
  }

  if(cfg.dumpfd >= 0)
  {
    docu->remove_lock = FALSE;
    if(cfg.dump_after)
    {
      /* dump later: keep the document in memory, write nothing now */
      docu->load = TRUE;
      docu->save_online = FALSE;
      docu->s_sock = NULL;
    }
    else
    {
      /* dump while transferring: write through a duplicate of dumpfd */
      docu->save_online = TRUE;
      docu->s_sock = bufio_dupfd(cfg.dumpfd);
      if(!docu->s_sock)
      {
        xperror("bufio_dupfd()");
        docu->errcode = ERR_STORE_DOC;
        return -1;
      }
    }
  }

  gettimeofday(&docu->hr_start_time, NULL);
  timerclear(&docu->redirect_time);
  timerclear(&docu->dns_time);
  timerclear(&docu->connect_time);
  timerclear(&docu->first_byte_time);
  timerclear(&docu->end_time);

  return 0;
}

/*
 * Check the configured limits after another chunk of data was received.
 *
 * docu     - document being transferred
 * len      - size of the chunk just received; added to cfg.trans_size
 *            unless the URL is a local file or a redirect
 * totallen - number of bytes received for this document so far
 *
 * The checks run in a fixed order (minimal rate, maximal run time, file
 * size quota, transfer quota, free filesystem space) and every check
 * that trips overwrites docu->errcode and the return code set by an
 * earlier one.
 *
 * Returns 0 when no limit was hit, 1 when the last limit hit was the
 * per-file size quota (ERR_QUOTA_FILE), and -1 when it was any other
 * limit.
 */
static int doc_check_quotas(doc * docu, ssize_t len, ssize_t totallen)
{
  int retcode = 0;

/* marks the connection as not reusable: fatal error flag for FTP(S),
   persistence switched off for HTTP/1.1; expands to two statements, so
   it must only be used as a statement of its own inside a block */
#define KILL_PERSISTANT_CONNECTION \
  if(docu->doc_url->type == URLT_FTP || docu->doc_url->type == URLT_FTPS) \
    docu->ftp_fatal_err = TRUE; \
  if(docu->is_http11) \
    docu->is_persistent = FALSE;

  /* minimal transfer rate; cfg.minrate is scaled by 1024 for comparison */
  if(cfg.minrate > 0.0 && (docu->doc_url->type != URLT_FILE &&
    !(docu->doc_url->status & URL_REDIRECT)))
  {
    time_t _tm = doc_etime(docu, FALSE);
    double _rt = compute_speed_rate(_tm, totallen);

    if(_rt < (cfg.minrate * 1024.0))
    {
      KILL_PERSISTANT_CONNECTION;
      docu->errcode = ERR_LOW_TRANSFER_RATE;
      retcode = -1;
    }
  }

  /* maximal running time; cfg.max_time is scaled by 60 to get seconds */
  if(cfg.max_time > 0.0)
  {
    if((cfg.start_time + (int) (60.0 * cfg.max_time)) < time(NULL))
    {
      KILL_PERSISTANT_CONNECTION;
      docu->errcode = ERR_QUOTA_TIME;
      retcode = -1;
    }
  }

  if(docu->doc_url->type != URLT_FILE &&
    !(docu->doc_url->status & URL_REDIRECT))
    cfg.trans_size += len;

  /* per-file size quota; the only limit reported with return code 1 */
  if(cfg.file_quota && ((cfg.file_quota * 1024) <= totallen) &&
    (docu->doc_url->type != URLT_FILE) &&
    !(docu->doc_url->status & URL_REDIRECT))
  {
    KILL_PERSISTANT_CONNECTION;
    docu->errcode = ERR_QUOTA_FILE;
    retcode = 1;
  }

  /* quota for the total amount of transferred data */
  if(cfg.trans_quota && ((cfg.trans_quota * 1024) <= cfg.trans_size))
  {
    KILL_PERSISTANT_CONNECTION;
    docu->errcode = ERR_QUOTA_TRANS;
    retcode = -1;
  }

#if defined HAVE_FSTATFS || defined HAVE_FSTATVFS
  /* free space on the filesystem holding the output file; skipped when
     the output goes to cfg.dumpfd */
  if((cfg.dumpfd < 0) && cfg.fs_quota &&
    (docu->doc_url->type != URLT_FILE) &&
    !(docu->doc_url->status & URL_REDIRECT) && docu->s_sock)
  {
#ifdef HAVE_FSTATVFS
    struct statvfs fss;

    if(fstatvfs(bufio_getfd(docu->s_sock), &fss))
      xperror("fstatvfs");
#else
    struct statfs fss;

    if(fstatfs(bufio_getfd(docu->s_sock), &fss))
      xperror("fstatfs");
#endif
    else
    {
#ifdef HAVE_FSTATVFS
      /* POSIX counts f_bavail in units of f_frsize, not f_bsize;
         f_bsize is used only in case f_frsize was left zero */
      double blocksize = (double) (fss.f_frsize ? fss.f_frsize : fss.f_bsize);
#else
      double blocksize = (double) fss.f_bsize;
#endif
      /* free space in kB, computed in floating point so the product
         cannot overflow an integer type on large filesystems */
      double freespace = (blocksize * (double) fss.f_bavail) / 1024.0;

      if(freespace < cfg.fs_quota)
      {
        KILL_PERSISTANT_CONNECTION;
        docu->errcode = ERR_QUOTA_FS;
        retcode = -1;
      }
    }
  }
#endif

  return retcode;
}

/*
 * Read the document body from docu->datasock.
 *
 * Data is read with abs_read_data() in chunks of cfg.bufsize * 1024
 * bytes (1024 bytes when cfg.bufsize is not positive). Each chunk is
 *  - written to the descriptor of docu->s_sock when docu->save_online
 *    is set,
 *  - appended to docu->contents when the document has to be kept in
 *    memory (docu->load, docu->is_parsable, FTP directory, or gopher
 *    URL whose selector starts with '1' or 'h'),
 *  - counted against the limits in doc_check_quotas().
 * The loop also sleeps to honour cfg.maxrate and stops early once
 * docu->totsz bytes were received.
 *
 * On return docu->size holds the number of bytes in docu->contents /
 * read by this call and docu->contents, when present, is terminated
 * with '\0'.
 *
 * Returns 0 on success and -1 on a store error, a read error, a
 * truncated document or a limit other than the per-file size quota.
 */
static int doc_transfer_data(doc * docu)
{
  char *buf;
  int bufsize;
  ssize_t len, totallen = 0;
  int retcode = 0;

  if(docu->report_size)
    gui_set_status(gettext("Transfering data"));

  show_progress(docu, docu->adj_sz, FALSE);

  bufsize = (cfg.bufsize > 0 ? cfg.bufsize : 1) * 1024;
  buf = _malloc(bufsize);

#ifdef SO_RCVBUF
#ifndef __QNX__
  /* ask for a socket receive buffer as large as our read buffer;
     a failure is only reported */
  if(bufio_is_sock(docu->datasock))
  {
    if(setsockopt(bufio_getfd(docu->datasock),
        SOL_SOCKET, SO_RCVBUF, (char *) &bufsize, sizeof(bufsize)))
    {
      xperror(gettext("setsockopt: SO_RCVBUF failed"));
    }
  }
#endif
#endif

  if(docu->save_online)
  {
    DEBUG_USER("Storing to file: %s\n", url_to_filename(docu->doc_url, TRUE));
  }

  /* when dumping responses on the fly, docu->mime is written in front
     of the body */
  if(docu->mime && cfg.dump_resp && cfg.dumpfd >= 0 && !cfg.dump_after)
    bufio_write(docu->s_sock, docu->mime, strlen(docu->mime));

  while((len = abs_read_data(docu, docu->datasock, buf, bufsize)) > 0)
  {
    if(docu->save_online)
    {
      /* a short write is handled as a store error */
      if(write(bufio_getfd(docu->s_sock), buf, len) != len)
      {
        docu->errcode = ERR_STORE_DOC;
        xperror(gettext("storing document"));
        retcode = -1;
        if(docu->doc_url->type == URLT_FTP
          || docu->doc_url->type == URLT_FTPS)
          docu->ftp_fatal_err = TRUE;
        if(docu->is_http11)
          docu->is_persistent = FALSE;
        break;
      }
    }
    totallen += len;
    docu->current_size += len;

    /* bandwidth limit: when the measured rate is above cfg.maxrate,
       sleep for the time the data should have taken at that rate minus
       the elapsed time (presumably both in milliseconds - TODO confirm
       the unit returned by doc_etime()) */
    if(cfg.maxrate > 0.0 &&
      (docu->doc_url->type != URLT_FILE &&
        !(docu->doc_url->status & URL_REDIRECT)))
    {
      time_t _tm = doc_etime(docu, FALSE);
      double _rt = compute_speed_rate(_tm, totallen);

      if(_rt > (cfg.maxrate * 1024.0))
      {
        tl_msleep((time_t) (1000.0 * ((double) totallen) / (cfg.maxrate *
              1024.0)) - _tm);
      }
    }
    docu->size = totallen;
    show_progress(docu, docu->adj_sz, FALSE);

    /* keep an in-memory copy; one extra byte is reserved for the
       terminating '\0' stored at the end of this function */
    if(docu->load || docu->is_parsable ||
      ((docu->doc_url->type == URLT_FTP ||
          docu->doc_url->type == URLT_FTPS) &&
        docu->doc_url->p.ftp.dir) ||
      (docu->doc_url->type == URLT_GOPHER &&
        (docu->doc_url->p.gopher.selector[0] == '1' ||
          docu->doc_url->p.gopher.selector[0] == 'h')))
    {
      docu->contents = _realloc(docu->contents, totallen + 1);
      memmove(docu->contents + totallen - len, buf, len);
    }

    retcode = doc_check_quotas(docu, len, totallen);
    if(retcode)
    {
      /* 1 means the per-file size quota: stop reading, but do not
         report a failure from here */
      if(retcode == 1)
        retcode = 0;
      break;
    }

    if(docu->totsz > 0 && docu->totsz <= docu->current_size)
      break;
  }

  show_progress(docu, docu->adj_sz, TRUE);

  if(cfg.dumpfd >= 0 && !cfg.dump_after)
  {
    bufio_close(docu->s_sock);
    docu->s_sock = NULL;
    docu->save_online = FALSE;
  }

  if(cfg.progres && docu->report_size
#ifdef I_FACE
    && !cfg.xi_face
#endif
  )
  {
    xprintf(0, "\n");
  }

  /* read error, or the announced total size does not match what was
     received plus the resume offset */
  if(len < 0 || ((docu->totsz > 0)
  && (docu->totsz != (docu->size + docu->rest_pos))))
  {
    xperror(gettext("Document transfer data"));
    if((docu->doc_url->type == URLT_HTTP || docu->doc_url->type ==
    URLT_HTTPS) && (!(docu->doc_url->status & URL_REDIRECT)))
    {
      docu->errcode = ERR_HTTP_TRUNC;
    }
    else if((docu->doc_url->type == URLT_FTP || docu->doc_url->type ==
    URLT_FTPS) && (!(docu->doc_url->status & URL_REDIRECT)))
    {
      docu->errcode = ERR_FTP_TRUNC;
    }
    else if(!docu->errcode)
      docu->errcode = ERR_READ;

    docu->remove_lock = FALSE;
    retcode = -1;
  }

  if(docu->report_size)
    gui_set_status(gettext("Data transfer done"));

  /* ERR_FTP_TRUNC set elsewhere is handled as a failed transfer too */
  if((docu->doc_url->type == URLT_FTP ||
      docu->doc_url->type == URLT_FTPS) && docu->errcode == ERR_FTP_TRUNC)
  {
    docu->remove_lock = FALSE;
    retcode = -1;
  }

  /*** if transfer was not from beginning, reread ***/
  /*** document content to memory from local file ***/
  /*** to be sure we will process whole document  ***/
  if(!retcode &&
    docu->rest_pos && (docu->load || docu->is_parsable) && (cfg.dumpfd < 0))
  {
    _free(docu->contents);
    totallen = 0;
    lseek(bufio_getfd(docu->s_sock), 0, SEEK_SET);
    bufio_reset(docu->s_sock);
    while((len = bufio_read(docu->s_sock, buf, bufsize)) > 0)
    {
      totallen += len;
      docu->contents = _realloc(docu->contents, totallen + 1);
      memmove(docu->contents + totallen - len, buf, len);
    }
  }

  if(docu->contents)
    *(docu->contents + totallen) = '\0';

  _free(buf);
  docu->size = totallen;

  return retcode;
}

/*
 * Inspect the local file that belongs to the URL of docu.
 *
 * Outside of MODE_SYNC and MODE_MIRROR:
 *  - a readable local non-directory for a non-file URL marks the URL
 *    with URL_REDIRECT;
 *  - a local directory containing a non-directory priv_cfg.index_name
 *    redirects the URL to its path with "/" appended
 *    (ERR_HTTP_REDIR, or ERR_HTTP_CYCLIC when url_redirect_to() reports
 *    a failure);
 *  - for file URLs and URLs marked URL_REDIRECT, when docu->load is not
 *    set, the local file is checked and either reported as a plain
 *    "File redirect" or flagged as parsable.
 * In MODE_SYNC and MODE_MIRROR only the size and modification time of
 * an existing local file are stored in docu->origsize / docu->origtime.
 *
 * docu - document to check
 * rv   - written whenever -1 is returned; presumably the value the
 *        caller should return itself (verify against caller)
 *
 * Returns 0 when processing should go on, -1 when it should stop with
 * the result stored in *rv.
 */
static int doc_check_doc_file(doc * docu, int *rv)
{
  char *fn;
  struct stat estat;

  fn = url_to_filename(docu->doc_url, TRUE);

  if(cfg.mode != MODE_SYNC && cfg.mode != MODE_MIRROR)
  {
    if(docu->doc_url->type != URLT_FILE && (access(fn, R_OK) != -1))
    {
      if(!stat(fn, &estat))
      {
        if(!S_ISDIR(estat.st_mode))
        {
          docu->doc_url->status |= URL_REDIRECT;
        }
        else
        {
          char *pom;
          char *savepath = url_get_path(docu->doc_url);

          /* local name is a directory - look for the index file in it */
          pom = tl_str_concat(NULL, fn, "/", priv_cfg.index_name, NULL);
          if(!stat(pom, &estat))
          {
            _free(pom);
            if(!S_ISDIR(estat.st_mode))
            {
              url *newurl = url_dup_url(docu->doc_url);

              if(newurl->type != URLT_FILE)
                pom = tl_str_concat(NULL, savepath, "/", NULL);
              if(newurl->type == URLT_FTP || newurl->type == URLT_FTPS)
                newurl->p.ftp.dir = TRUE;
              url_set_path(newurl, pom);
              _free(pom);
              if(url_redirect_to(docu->doc_url, newurl, FALSE))
                docu->errcode = ERR_HTTP_CYCLIC;
              else
                docu->errcode = ERR_HTTP_REDIR;
              *rv = -1;
              return -1;
            }
          }
          /* NOTE(review): pom may already have been released above;
             this looks safe only if _free() resets its argument to
             NULL - confirm in tools.h */
          _free(pom);
          fn = url_to_filename(docu->doc_url, TRUE);
        }
      }
    }

    if((docu->doc_url->type == URLT_FILE ||
        (docu->doc_url->status & URL_REDIRECT)) && !docu->load)
    {
      if(!stat(fn, &estat))
      {
        if(S_ISDIR(estat.st_mode))
        {
          docu->errcode = ERR_DIR_URL;
          *rv = -1;
          return -1;
        }
      }
      else
      {
        docu->errcode = ERR_FILE_OPEN;
        *rv = -1;
        return -1;
      }

      /* not parsable: an FTP(S) non-directory file not named like the
         index file while cfg.ftp_html is off, or anything that
         file_is_html() rejects */
      if((!cfg.ftp_html &&
          strcmp(tl_get_basename(fn), priv_cfg.index_name) &&
          (docu->doc_url->type == URLT_FTP ||
            docu->doc_url->type == URLT_FTPS) &&
          !docu->doc_url->p.ftp.dir) || !file_is_html(fn))
      {
        docu->is_parsable = FALSE;
        docu->save_online = TRUE;
        docu->size = estat.st_size;
#ifdef I_FACE
        if(cfg.xi_face)
          doc_set_info(docu);
#endif
        xprintf(1, gettext("File redirect\n"));
        *rv = 0;
        return -1;
      }
      else
      {
        if(!strcasecmp("css", tl_get_extension(fn)))
          docu->doc_url->status |= URL_STYLE;
        docu->is_parsable = TRUE;
      }
    }
  }
  else
  {
    if(!stat(fn, &estat))
    {
      docu->origsize = estat.st_size;
      /*
         pro: somehow it must have been forgotten to set the
         time as well...
       */
      docu->origtime = estat.st_mtime;
    }
  }

  return 0;
}

static int doc_open_existing_in_file(doc * docu, int b_lock, int *rv)
{
  char *inname;
  struct stat estat;

  if((cfg.dumpfd < 0) && (inname = url_to_in_filename(docu->doc_url)))
  {
    if(!stat(inname, &estat) && !S_ISDIR(estat.st_mode))
    {
      if(doc_lock(docu, b_lock))
      {
        docu->errcode = ERR_STORE_DOC;
        _free(inname);
        *rv = -1;
        return -1;
      }

      docu->rest_pos = estat.st_size - cfg.rollback;
      if(docu->rest_pos)
      {
        xprintf(1, gettext("Trying to resume from position %d\n"),
          docu->rest_pos);
        docu->origtime = estat.st_mtime;
        docu->stime = estat.st_mtime;
        docu->doreget = TRUE;
        docu->remove_lock = FALSE;
      }
    }
    _free(inname);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -