⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 recurse.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 2 页
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <netdb.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <limits.h>
#include <time.h>
#include <signal.h>

#include "url.h"
#include "doc.h"
#include "tools.h"
#include "html.h"
#include "http.h"
#include "ftp.h"
#include "myssl.h"
#include "abstract.h"
#include "recurse.h"
#include "mime.h"
#include "robots.h"
#include "mode.h"
#include "times.h"
#include "stats.h"
#include "errcode.h"
#include "cookie.h"
#include "log.h"
#include "gui_api.h"
#include "form.h"
#include "ainterface.h"
#include "gcinfo.h"

static void dump_ftp_list(dllist *);
static void dump_urls_list(dllist *);

/*
 * Clean up the current document, release the URL string and leave the
 * enclosing function with the document's error code.
 *
 * This expands to several statements ending in a return and refers to the
 * call site's local names `docu' and `pstr', so it may only be used as a
 * statement inside a braced block of a function that defines both.
 */
#define SETNEXTURL  doc_cleanup(docu); \
      _free(pstr); \
      return docu->errcode;

#ifdef HAVE_MT
/*
 * Handler that requests a stop of the current processing: it sets errno
 * to EINTR and raises the cfg.stop and cfg.rbreak flags.  When built with
 * I_FACE and nothing is being processed, it terminates the program instead.
 * Presumably installed for SIGINT in threaded builds -- the installation
 * is not visible here.
 *
 * NOTE(review): exit() is not async-signal-safe; if this really runs in
 * signal context, confirm whether _exit() should be used instead.
 */
static void _sigintthr(int nr)
{
#ifdef I_FACE
  if(!cfg.processing)
  {
    exit(0);
  }
#endif
  errno = EINTR;
  cfg.stop = TRUE;
  cfg.rbreak = TRUE;
}

/*
 * Handler that terminates the calling thread.
 */
static void _sigquitthr(int nr)
{
  pthread_exit(NULL);
}
#endif

/*
 * Put a URL back at the end of the global URL stack so that it is picked
 * up again later, and account for it in cfg.total_cnt.  The stack and the
 * counter are updated while holding the URL stack lock; in threaded builds
 * the URL stack semaphore is raised as well.
 */
static void reschedule_url(url * urlp)
{
  DEBUG_MISC(gettext("Rescheduling locked URL as no. %d\n"), cfg.total_cnt);

  LOCK_CFG_URLSTACK;
  cfg.urlstack = dllist_append(cfg.urlstack, (dllist_t) urlp);
#ifdef HAVE_MT
  mt_semaphore_up(&cfg.urlstack_sem);
#endif
  cfg.total_cnt++;
  UNLOCK_CFG_URLSTACK;
}

/*
 * Return a newly malloc()ed copy of STR wrapped in single quotes so that a
 * POSIX shell reads it back as exactly one word.  Inside single quotes the
 * shell treats every character literally and offers no escape for the quote
 * itself, so each embedded single quote is written as '\'' (close the
 * quoted part, emit a backslash-escaped quote, reopen the quoted part).
 *
 * Returns NULL when STR is NULL or when memory cannot be allocated.
 * The caller releases the result with free().
 */
static char *shell_quote_arg(const char *str)
{
  size_t len = 2;               /* opening and closing quote */
  const char *s;
  char *res;
  char *d;

  if(!str)
    return NULL;

  for(s = str; *s; s++)
    len += (*s == '\'') ? 4 : 1;

  res = malloc(len + 1);
  if(!res)
    return NULL;

  d = res;
  *d++ = '\'';
  for(s = str; *s; s++)
  {
    if(*s == '\'')
    {
      memcpy(d, "'\\''", 4);
      d += 4;
    }
    else
    {
      *d++ = *s;
    }
  }
  *d++ = '\'';
  *d = '\0';

  return res;
}

/*
 * Run the user configured post-processing command for a document.
 *
 * The command line has the form
 *     <post_cmd> '<local file name>' <1|0> '<URL>'
 * where the middle argument is 1 when the document is parsable and 0
 * otherwise.
 *
 * The file name and the URL come from processed documents and are not
 * trusted, so both are passed through shell_quote_arg(); a single quote in
 * either of them can no longer terminate the quoting and inject shell
 * syntax.  For values without a single quote the resulting command line is
 * the same as before.  When either argument cannot be produced, the
 * command is not run.
 */
static void run_post_command(doc * docp)
{
  char *urlstr;
  char *qfile;
  char *qurl;
  char *cmd;

  DEBUG_MISC(gettext("Running post-processing command\n"));

  urlstr = url_to_urlstr(docp->doc_url, TRUE);
  qurl = shell_quote_arg(urlstr);
  _free(urlstr);

  /* the name returned by url_to_filename() was never released here */
  qfile = shell_quote_arg(url_to_filename(docp->doc_url, FALSE));

  if(qfile && qurl)
  {
    cmd = tl_str_concat(NULL, priv_cfg.post_cmd, " ", qfile,
      docp->is_parsable ? " 1 " : " 0 ", qurl, NULL);

    tl_system(cmd);
    _free(cmd);
  }

  free(qfile);
  free(qurl);
}

/*
 * Queue a form submission as a new starting URL.
 *
 * docp  - document containing the form
 * nform - index of the form passed to form_get_text()
 * ui    - user supplied form description whose fields and local name are
 *         used for the request
 *
 * The field list of the request is made of duplicates of all fields in
 * `ui', plus those default successful fields of the HTML form for which
 * dllist_find2() with form_field_compare_name finds no match among the
 * fields already collected.  Action, encoding and method are taken from
 * the parsed form.  Nothing is queued when the form text cannot be
 * located or parsed.
 */
static void add_matching_form(doc * docp, int nform, url_info * ui)
{
  char *ftext;
  int flen;
  form_info *fi;
  dllist *ptr, *fields, *sfields;
  url_info *nui;

  if(!(ftext = form_get_text(nform, docp->contents, docp->size, &flen)))
  {
    return;
  }

  fi = form_parse(ftext, flen);
  if(!fi)
    return;

  /* copy all fields supplied on cmdln */
  fields = NULL;
  for(ptr = ui->fields; ptr; ptr = ptr->next)
  {
    fields = dllist_prepend(fields, (dllist_t)
      form_field_duplicate((form_field *) ptr->data));
  }

  /* copy all suitable fields from HTML form */
  sfields = NULL;
  form_get_default_successful(NULL, fi->infos, &sfields);
  for(; sfields; sfields = dllist_remove_entry(sfields, sfields))
  {
    form_field *ff = (form_field *) sfields->data;

    if(dllist_find2(fields, (dllist_t) ff, form_field_compare_name))
    {
      /* a matching field is already present - drop the form's default */
      _free(ff->name);
      _free(ff->value);
      _free(ff);
    }
    else
    {
      /* the field itself is moved into the new list */
      fields = dllist_prepend(fields, (dllist_t) ff);
    }
  }

  nui = url_info_new(fi->action);
  nui->type = URLI_FORM;
  nui->fields = fields;
  nui->encoding = fi->encoding;
  nui->method = fi->method;
  nui->localname = tl_strdup(ui->localname);

  form_free(fi);

  append_starting_url(nui, docp->doc_url);

  url_info_free(nui);
}

static void add_matching_forms(doc * docp, dllist * formlist)
{
  dllist *fptr, *uptr;
  int nform;

  for(fptr = formlist, nform = 0; fptr; fptr = fptr->next, nform++)
  {
    url *urlp;

    urlp =
url_parse((char *) fptr->data);    assert(urlp->type != URLT_FROMPARENT);    if((urlp->type != URLT_HTTP) && (urlp->type != URLT_HTTPS))    {      free_deep_url(urlp);      _free(urlp);      continue;    }    free_deep_url(urlp);    _free(urlp);    for(uptr = priv_cfg.formdata; uptr; uptr = uptr->next)    {      url_info *ui = (url_info *) uptr->data;      if(!strcmp(ui->urlstr, (char *) fptr->data))      {        add_matching_form(docp, nform, ui);      }    }  }}int process_document(doc * docu, int check_lim){  url *urlr;  int nreget = 0, nredir = 0, pokus = 0;  time_t atm;  char cpom[64];  char *pstr = NULL;  int store_stat;  struct stat estat;  urlr = docu->doc_url;  docu->check_limits = check_lim;  _Xt_Serve;  if(docu->check_limits)    docu->check_limits = (urlr->parent_url != NULL);  while(!cfg.stop && !cfg.rbreak)  {    _free(docu->ftp_pasv_host);    docu->errcode = ERR_NOERROR;    docu->mime = NULL;    docu->type_str = NULL;    docu->doc_url = urlr;    docu->dtime = 0L;    docu->contents = NULL;    docu->is_chunked = FALSE;    docu->read_chunksize = FALSE;    docu->read_trailer = FALSE;    docu->ftp_fatal_err = FALSE;    pstr = url_to_urlstr(urlr, FALSE);    if(pokus)      xprintf(1, gettext("retry no. 
%d\n"), pokus);#ifdef HAVE_MT    xprintf(1, gettext("URL[%2d]: %5d(%d) of %5d  %s\n"), docu->threadnr + 1,      docu->doc_nr, cfg.fail_cnt, cfg.total_cnt, pstr);#else    xprintf(1, gettext("URL: %5d(%d) of %5d  %s\n"), docu->doc_nr,      cfg.fail_cnt, cfg.total_cnt, pstr);#endif#ifdef I_FACE    if(cfg.xi_face)    {      gui_set_doccounter();      gui_set_url(pstr);      gui_set_status(gettext("Starting download"));    }#endif    /*** to be able to revisit moved documents ***/    /*** especially for authorization purposes ***/    if((urlr->status & URL_PROCESSED) && urlr->moved_to && nredir)    {      urlr->status &= ~URL_PROCESSED;    }    if(docu->check_limits)    {      cond_info_t condp;      condp.level = 2;      condp.urlnr = docu->doc_nr;      condp.size = 0;      condp.time = 0L;      condp.mimet = NULL;      condp.full_tag = NULL;      condp.params = NULL;      condp.html_doc = NULL;      condp.html_doc_offset = 0;      condp.tag = NULL;      condp.attrib = NULL;      if(urlr->status & URL_PROCESSED)      {        xprintf(1, gettext("Already processed\n"));        docu->errcode = ERR_PROCESSED;        SETNEXTURL;      }      if(urlr->status & URL_USER_DISABLED)      {        xprintf(1, gettext("Disallowed by user\n"));        docu->errcode = ERR_UDISABLED;        SETNEXTURL;      }      if(!prottable[urlr->type].supported || (urlr->parent_url      && (urlr->type == URLT_FTP || urlr->type == URLT_FTPS)      && urlr->p.ftp.dir && !cfg.condition.ftpdir)      || (urlr->parent_url && !url_append_condition(urlr, &condp)))      {        xprintf(1, gettext("Disallowed by rules\n"));        urlr->status |= URL_REJECTED;        docu->errcode = ERR_RDISABLED;        SETNEXTURL;      }      gui_set_status(gettext("Checking \"robots.txt\""));      if(!robots_check(urlr))      {        xprintf(1, gettext("Disallowed by \"robots.txt\"\n"));        urlr->status |= URL_REJECTED;        docu->errcode = ERR_RDISABLED;        SETNEXTURL;      }    }    if(cfg.mode == 
MODE_FTPDIR &&      (urlr->type != URLT_FTP && urlr->type != URLT_FTPS))    {      xprintf(1,        gettext("This URL type is not supported with ftpdir mode\n"));      urlr->status |= URL_REJECTED;      docu->errcode = ERR_RDISABLED;      SETNEXTURL;    }    _Xt_Serve;    if(cfg.mode == MODE_SYNC)    {      char *pp = url_to_filename(urlr, TRUE);      if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))      {        atm = time(NULL) - 86400 * cfg.ddays;        /*           pro: We do not want the message           "No transfer - file not expired"           if the server's clock is ahead of our clock.           If no parameter cfg.ddays is given, then           we do not compare the file modification times.         */        if(cfg.ddays == 0 || estat.st_mtime < atm)          docu->dtime = estat.st_mtime;        else        {          xprintf(1, gettext("No transfer - file not expired\n"));          urlr->status |= URL_REJECTED;          docu->errcode = ERR_RDISABLED;          SETNEXTURL;        }        urlr->status |= URL_ISLOCAL;        docu->origsize = estat.st_size;      }    }    if(cfg.show_time)    {      atm = time(NULL);      LOCK_TIME;      strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm));      UNLOCK_TIME;      xprintf(1, gettext("Starting time :  %s\n"), cpom);    }#ifdef I_FACE    if(cfg.stop || cfg.rbreak)    {      _free(pstr);      break;    }#endif    _Xt_Serve;    if((urlr->type == URLT_FTP || urlr->type == URLT_FTP)      && urlr->extension &&      ((ftp_url_extension *) urlr->extension)->type == FTP_TYPE_L &&      ((ftp_url_extension *) urlr->extension)->slink)    {      if(cfg.retrieve_slink)      {        /** need to kill extension, because we must **/        /** guess the file type beside the symlink  **/        ftp_url_ext_free(urlr->extension);        urlr->extension = NULL;      }      else      {        ftp_make_symlink(urlr);        urlr->status |= URL_PROCESSED;        docu->errcode = ERR_NOERROR;        SETNEXTURL;      }    }    
gui_set_status(gettext("Starting download"));    if(doc_download(docu, FALSE, FALSE))    {      if(cfg.show_time)      {        atm = time(NULL);        LOCK_TIME;        strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm));        UNLOCK_TIME;        xprintf(1, gettext("Ending time :    %s\n"), cpom);      }      _Xt_Serve;      doc_remove_lock(docu);      _free(docu->contents);      report_error(docu, gettext("download"));      DEBUG_USER("Error status code - (%d)\n");      if((nreget < cfg.nreget &&          (docu->errcode == ERR_HTTP_TRUNC ||            docu->errcode == ERR_FTP_TRUNC ||            docu->errcode == ERR_LOW_TRANSFER_RATE ||            docu->errcode == ERR_HTTP_FAILREGET ||            docu->errcode == ERR_HTTP_TIMEOUT ||            docu->errcode == ERR_HTTP_GW_TIMEOUT)) ||        (nredir < cfg.nredir &&          docu->errcode == ERR_HTTP_REDIR) ||        (docu->errcode == ERR_HTTP_AUTH) ||        (docu->errcode == ERR_HTTP_PROXY_AUTH))      {        if(docu->errcode == ERR_HTTP_REDIR)        {          urlr->status |= URL_PROCESSED;          if((urlr->moved_to->status & URL_PROCESSED) &&            (!urlr->moved_to->moved_to))          {            SETNEXTURL;          }          else          {#ifdef I_FACE            if(cfg.xi_face)              gui_tree_set_icon_for_doc(docu);#endif            urlr = urlr->moved_to;          }        }        if(docu->errcode == ERR_HTTP_TRUNC)        {          urlr->status |= URL_TRUNCATED;          _free(docu->etag);          docu->etag = get_mime_param_val_str("ETag:", docu->mime);          if(!docu->etag)            docu->etag =              get_mime_param_val_str("Content-Location:", docu->mime);          if(!docu->etag)            docu->etag = get_mime_param_val_str("Last-Modified", docu->mime);        }        if(docu->errcode == ERR_HTTP_AUTH)        {          docu->doc_url->status |= URL_PROCESSED;          docu->doc_url->status |= URL_ERR_REC;          SETNEXTURL;        }        
if(docu->errcode == ERR_HTTP_PROXY_AUTH)        {          docu->doc_url->status |= URL_PROCESSED;          docu->doc_url->status |= URL_ERR_REC;          SETNEXTURL;        }        _free(docu->mime);        _free(docu->type_str);        nreget += (docu->errcode == ERR_HTTP_TRUNC ||          docu->errcode == ERR_FTP_TRUNC) && cfg.mode != MODE_SREGET;        nredir += (docu->errcode == ERR_HTTP_REDIR);        _free(pstr);        continue;      }      if(docu->errcode == ERR_FTP_UNKNOWN ||        docu->errcode == ERR_FTP_CONNECT ||        docu->errcode == ERR_FTP_DATACON ||        docu->errcode == ERR_FTPS_CONNECT ||        docu->errcode == ERR_FTPS_DATASSLCONNECT ||        docu->errcode == ERR_HTTP_UNKNOWN ||        docu->errcode == ERR_HTTP_CONNECT ||        docu->errcode == ERR_HTTP_SNDREQ ||        docu->errcode == ERR_HTTP_SNDREQDATA ||        docu->errcode == ERR_HTTP_RCVRESP ||        docu->errcode == ERR_HTTP_SERV ||        docu->errcode == ERR_HTTP_TIMEOUT ||        docu->errcode == ERR_HTTP_PROXY_CONN ||        docu->errcode == ERR_HTTPS_CONNECT ||        docu->errcode == ERR_READ ||        docu->errcode == ERR_ZERO_SIZE ||        docu->errcode == ERR_GOPHER_CONNECT ||        docu->errcode == ERR_PROXY_CONNECT || docu->errcode == ERR_HTTP_SERV)      {        urlr->status |= URL_ERR_REC;        pokus++;        /*** retry only when allowed ***/        if(pokus >= cfg.nretry)        {          urlr->status |= URL_PROCESSED;          SETNEXTURL;        }        _free(pstr);        _free(docu->mime);        _free(docu->type_str);        continue;      }      else if(docu->errcode == ERR_LOCKED)      {        if(!cfg.urlstack)        {          xprintf(1,            gettext("last document locked -> sleeping for 5 seconds\n"));          tl_sleep(5);        }        reschedule_url(urlr);        SETNEXTURL;      }      else if(docu->errcode == ERR_BIGGER ||        docu->errcode == ERR_SMALLER ||        docu->errcode == ERR_NOMIMET ||        docu->errcode == 
ERR_OUTTIME || docu->errcode == ERR_SCRIPT_DISABLED)      {        urlr->status |= URL_PROCESSED;        urlr->status |= URL_ERR_REC;        SETNEXTURL;      }      else      {        /*** remove improper documents if required ***/        if((cfg.remove_old &&            (cfg.mode == MODE_SYNC ||              cfg.mode == MODE_MIRROR)) &&          (((docu->errcode == ERR_FTP_GET ||                docu->errcode == ERR_FTP_BDIR ||                docu->errcode == ERR_FTP_NODIR) &&              docu->ftp_respc == 550) ||            docu->errcode == ERR_HTTP_NFOUND ||            docu->errcode == ERR_HTTP_GONE))        {          doc_remove(docu->doc_url);        }        urlr->status |= URL_ERR_UNREC;        urlr->status |= URL_PROCESSED;        SETNEXTURL;      }    }    _Xt_Serve;    if(urlr->status & URL_TRUNCATED)      urlr->status &= ~URL_TRUNCATED;    if(urlr->status & URL_ERR_REC)      urlr->status &= ~URL_ERR_REC;    if(cfg.show_time)    {      atm = time(NULL);      LOCK_TIME;      strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm));      UNLOCK_TIME;      xprintf(1, gettext("Ending time :    %s\n"), cpom);    }    report_error(docu, gettext("download"));    _Xt_Serve;    if(docu->contents)    {      if(docu->is_parsable)      {        dllist *formlist = NULL;        dllist *urls;        gui_set_status(gettext("Relocating and scanning HTML document"));        urls =          html_process_document(docu, priv_cfg.formdata ? 
&formlist : NULL);        _Xt_Serve;        if(urls && cfg.dump_urlfd >= 0)        {          dump_urls_list(urls);        }        if(priv_cfg.formdata && formlist)        {          add_matching_forms(docu, formlist);          while(formlist)          {            if(formlist->data) free((void *) formlist->data);            formlist = dllist_remove_entry(formlist, formlist);          }        }        if(cfg.mode != MODE_SREGET &&          cfg.mode != MODE_FTPDIR && !(docu->doc_url->status & URL_NORECURSE))        {          gui_tree_add_start();          cat_links_to_url_list(urls);          gui_tree_add_end();        }        else if(cfg.mode == MODE_FTPDIR)        {          dump_ftp_list(urls);        }        else        {          for(; urls; urls = dllist_remove_entry(urls, urls))          {            free_deep_url((url *) urls->data);            if(urls->data) free((url *)urls->data);          }        }        _Xt_Serve;      }      store_stat = 0;      if(cfg.dumpfd >= 0 && cfg.dump_after)      {        bufio *fd;        gui_set_status(gettext("Dumping processed document"));        LOCK_DUMPFD;        fd = bufio_dupfd(cfg.dumpfd);        if(docu->mime && cfg.dump_resp)          bufio_write(fd, docu->mime, strlen(docu->mime));

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -