⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doc.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 3 页
字号:
}#ifdef I_FACE/********************************************************//* nastavenie info dokumentu pre informaciu pouzivatela *//* FIXME: Translate me!                                 *//********************************************************/static void doc_set_info(doc * docp){#ifdef WITH_TREE  url_prop *prp = _malloc(sizeof(url_prop));  prp->size = docp->size;  prp->mdtm = docp->dtime;  prp->type = NULL;  switch (docp->doc_url->type)  {  case URLT_HTTP:#ifdef USE_SSL  case URLT_HTTPS:#endif    if(docp->type_str)      prp->type = tl_strdup(docp->type_str);    break;  case URLT_FILE:    prp->type = tl_strdup(gettext_nop("Local file"));    break;  case URLT_GOPHER:    switch (docp->doc_url->p.gopher.selector[0])    {    case '0':      prp->type = tl_strdup(gettext_nop("Gopher/Text File"));      break;    case '1':      prp->type = tl_strdup(gettext_nop("Gopher/Directory"));      break;    case '2':      prp->type = tl_strdup(gettext_nop("Gopher/CSO phone book"));      break;    case '3':      prp->type = tl_strdup(gettext_nop("Gopher/Error"));      break;    case '4':      prp->type = tl_strdup(gettext_nop("Gopher/BINHEX"));      break;    case '5':      prp->type = tl_strdup(gettext_nop("Gopher/DOS bin"));      break;    case '6':      prp->type = tl_strdup(gettext_nop("Gopher/UUencoded"));      break;    case '7':      prp->type = tl_strdup(gettext_nop("Gopher/Search index"));      break;    case '8':      prp->type = tl_strdup(gettext_nop("Gopher/Telnet session"));      break;    case '9':      prp->type = tl_strdup(gettext_nop("Gopher/bin"));      break;    case '+':      prp->type = tl_strdup(gettext_nop("Gopher/Duplicated server"));      break;    case 'T':      prp->type = tl_strdup(gettext_nop("Gopher/TN3270"));      break;    case 'g':      prp->type = tl_strdup(gettext_nop("Gopher/GIF"));      break;    case 'I':      prp->type = tl_strdup(gettext_nop("Gopher/Image"));      break;    }    break;  case URLT_FTP:    if(docp->doc_url->p.ftp.dir)      prp->type = tl_strdup(gettext_nop("FTP/Directory"));    else      prp->type = tl_strdup(gettext_nop("FTP/File"));    break;  case URLT_FTPS:    if(docp->doc_url->p.ftp.dir)      prp->type = tl_strdup(gettext_nop("FTPS/Directory"));    else      prp->type = tl_strdup(gettext_nop("FTPS/File"));    break;  default:    prp->type = tl_strdup(gettext_nop("Unsupported type"));    break;  }  if(!prp->type)    prp->type = tl_strdup(gettext_nop("Local file"));  docp->doc_url->prop = prp;#endif}#endifvoid doc_init(doc * docp, url * urlp){  docp->doc_nr = 0;  docp->doc_url = urlp;  docp->mime = NULL;  docp->type_str = NULL;  docp->is_parsable = cfg.enable_js && (docp->doc_url->status & URL_ISSCRIPT);  docp->size = 0;  docp->totsz = -1;  docp->contents = NULL;  docp->save_online = FALSE;  docp->dtime = 0L;  docp->stime = 0L;  docp->rest_pos = 0;  docp->rest_end_pos = -1;  docp->etag = NULL;  docp->errcode = ERR_NOERROR;  docp->origsize = 0;  docp->ftp_fatal_err = FALSE;  docp->ftp_respc = 0;  docp->ftp_pasv_host = NULL;  docp->ftp_pasv_port = 0;  docp->ftp_data_con_finished = FALSE;  docp->datasock = NULL;  docp->ftp_control = NULL;  docp->s_sock = NULL;#ifdef USE_SSL  memset(&docp->ssl_data_con, '\0', sizeof(ssl_connection));#endif  docp->num_auth = 0;  docp->num_proxy_auth = 0;  docp->auth_digest = NULL;  docp->auth_proxy_digest = NULL;  docp->lock_fn = NULL;  docp->report_size = TRUE;  docp->check_limits = TRUE;  docp->remove_lock = FALSE;  docp->is_http11 = FALSE;  docp->chunk_size = 0;  docp->is_chunked = FALSE;  docp->read_chunksize = FALSE;  docp->read_trailer = FALSE;  docp->is_persistent = FALSE;#ifdef HAVE_MT  docp->__herrno = 0;  docp->msgbuf = NULL;  docp->threadnr = 0;#endif  docp->is_robot = FALSE;  docp->additional_headers = NULL;  docp->is_http_transfer = FALSE;  docp->http_proxy = NULL;  docp->http_proxy_port = DEFAULT_HTTP_PROXY_PORT;  docp->http_proxy_10 = FALSE;  docp->request_type = HTTP_REQ_UNKNOWN;  docp->connect_host = NULL;  docp->connect_port = 0;  timerclear(&docp->hr_start_time);  timerclear(&docp->redirect_time);  timerclear(&docp->dns_time);  timerclear(&docp->connect_time);  timerclear(&docp->first_byte_time);  timerclear(&docp->end_time);}static char *get_rate_str(char *str, double rate){  if(rate <= 1024.0)    sprintf(str, "%5.0f  B/s", rate);  else if(rate <= 1048576.0)    sprintf(str, "%5.1f kB/s", rate / 1024.0);  else if(rate <= 1073741824.0)    sprintf(str, "%5.1f MB/s", rate / 1048576.0);  else    sprintf(str, "%5.1f GB/s", rate / 1073741824.0);  return str;}static char *get_time_str(char *str, time_t tm){  sprintf(str, "%ld:%02ld:%02ld",    tm / 3600000, (tm % 3600000) / 60000, (tm % 60000) / 1000);  return str;}static char *get_size_str(char *str, int total, int actual){  if(total)  {    if(total < 1000000)      sprintf(str, "%6d / %d B [%5.1f%%]",        actual, total, (100.0 * (double) actual / (double) total));    else      sprintf(str, "%7d / %d kB [%5.1f%%]",        actual / 1024, total / 1024,        (100.0 * (double) actual / (double) total));  }  else  {    if(actual < 1000000)      sprintf(str, "%6d B", actual);    else      sprintf(str, "%6d kB", actual / 1024);  }  return str;}time_t doc_etime(doc * docp, int init){#ifdef HAVE_GETTIMEOFDAY  if(init)  {    gettimeofday(&docp->start_time, NULL);    return 0;  }  else  {    struct timeval t;    gettimeofday(&t, NULL);    return (1000 * (t.tv_sec - docp->start_time.tv_sec) +      (t.tv_usec - docp->start_time.tv_usec) / 1000);  }#else  if(init)  {    docp->start_time = time(NULL);    return 0;  }  else  {    return 1000 * (time(NULL) - docp->start_time);  }#endif}static double compute_speed_rate(time_t etime, ssize_t size){  return (double) size *1000.0 / (etime == 0.0 ? 1.0 : etime);}static void show_progress(doc * docp, ssize_t adjsz, int dolog){  time_t etime = doc_etime(docp, FALSE);  double rate = compute_speed_rate(etime, docp->size + adjsz);  char s_rate[30] = "", s_etime[30] = "", s_rtime[30] = "", s_size[30] = "";  ftp_url_extension *fe;  if(docp->doc_url->type == URLT_FTP || docp->doc_url->type == URLT_FTPS)    fe = (ftp_url_extension *) docp->doc_url->extension;  else    fe = NULL;  if(docp->totsz >= 0 || (fe && fe->size > 0))  {    int size = docp->totsz >= 0 ? docp->totsz : fe->size;    time_t rtime =      (time_t) ((double) (size -        docp->rest_pos) / (double) (docp->size ? docp->size : 10) *      (double) ((etime != 0.0) ? etime : 1.0)) - etime;    get_time_str(s_rtime, rtime);    get_size_str(s_size, size, docp->size + docp->rest_pos);  }  else    get_size_str(s_size, 0, docp->size + docp->rest_pos);  get_rate_str(s_rate, rate);  get_time_str(s_etime, etime);  if(cfg.progres && docp->report_size && !cfg.quiet && !cfg.bgmode#ifdef I_FACE    && !cfg.xi_face#endif    )  {    if(*s_rtime)      xprintf(0, gettext("S: %s [R: %s] [ET: %s] [RT: %s]"),        s_size, s_rate, s_etime, s_rtime);    else      xprintf(0, gettext("S: %s [R: %s] [ET: %s]"), s_size, s_rate, s_etime);    xprintf(0, " \r");  }#ifdef I_FACE  if(docp->report_size && cfg.xi_face)  {    gui_set_progress(s_size, s_rate, s_etime, s_rtime);  }#endif}int doc_lock(doc * docp, int b_lock){  char *lock_name;  int cyclenr = 0;  bufio *s_sock = NULL;  if(!(lock_name = url_to_in_filename(docp->doc_url)))    return -1;  do  {    if(makealldirs(lock_name))    {      if(errno != ENOENT)      {        xperror(lock_name);        docp->errcode = ERR_STORE_DOC;        break;      }      else        continue;    }    else    {      if(cyclenr == 1)      {        xprintf(0, gettext("Waiting to releases document lock on: %s\n"),          lock_name);      }      if(!(s_sock = bufio_copen(lock_name,            O_BINARY | O_RDWR | O_CREAT, 0644)))      {        if(errno != ENOENT)        {          xperror(lock_name);          docp->errcode = ERR_STORE_DOC;          break;        }      }      if(s_sock)      {        if(_flock(bufio_getfd(s_sock), lock_name,            O_BINARY | O_RDWR | O_CREAT, FALSE))        {          if(!b_lock)            docp->errcode = ERR_LOCKED;          bufio_close(s_sock);          s_sock = NULL;        }        else        {          tl_msleep(50);          if(access(lock_name, F_OK))          {            if(!b_lock)              docp->errcode = ERR_LOCKED;            else            {              bufio_close(s_sock);              s_sock = NULL;            }          }        }      }      if(b_lock && !s_sock)      {        tl_sleep(1);        cyclenr++;      }    }  }  while(b_lock && !s_sock);  if(!s_sock)  {    _free(lock_name);    docp->s_sock = NULL;  }  else  {    docp->s_sock = s_sock;    docp->lock_fn = lock_name;  }  return (s_sock == NULL);}/****************************************************//* Unlock document and remove lock file if required *//****************************************************/void doc_remove_lock(doc * docp){  struct utimbuf utmbf;  if(docp->s_sock)  {    DEBUG_LOCKS("Unlocking document %s\n", docp->lock_fn);    /*_funlock(bufio_getfd(docp->s_sock));*/    bufio_close(docp->s_sock);    docp->s_sock = NULL;    /* required because close() causes modification time */    /* change on hard linked file on w2k (maybe generaly */    /* on winnt or just on ntfs ???          */    if(cfg.preserve_time && docp->dtime)    {      struct utimbuf utmbf;      struct stat estat;      stat(docp->lock_fn, &estat);      utmbf.actime = estat.st_atime;      utmbf.modtime = docp->dtime;      utime(docp->lock_fn, &utmbf);    }    if(docp->remove_lock)    {      unlink(docp->lock_fn);    }    else    {      utmbf.actime = time(NULL);      if(docp->dtime && cfg.preserve_time)        utmbf.modtime = docp->dtime;      else        utmbf.modtime = docp->stime;      utime(docp->lock_fn, &utmbf);    }    _free(docp->lock_fn);  }}static void doc_make_clean_dir(doc * docp){  char *p, *ustr;  ustr = tl_strdup(url_to_filename(docp->doc_url, FALSE));  if(!docp->mime && cfg.enable_info)    dinfo_remove(ustr);  p = strrchr(ustr, '/');  if(p)    *p = '\0';  while(strlen(ustr) > strlen(priv_cfg.cache_dir))  {    if(rmdir(ustr))    {      if(errno != ENOTEMPTY && errno != ENOENT && errno != EEXIST)        xperror(ustr);      break;    }    p = strrchr(ustr, '/');    if(p)      *p = '\0';  }  _free(ustr);}void doc_cleanup(doc * docu){  gui_finish_document(docu);  short_log(docu, docu->doc_url);  LOCK_FAILCNT;  if(!((docu->doc_url->status & URL_DOWNLOADED) ||      (docu->doc_url->status & URL_REDIRECT)))    cfg.fail_cnt++;  cfg.process_cnt++;  UNLOCK_FAILCNT;  if(cfg.xi_face)    gui_tree_set_icon_for_doc(docu);  if(docu->errcode)  {    char *infn, *fn;    fn = url_to_filename(docu->doc_url, FALSE);    infn = url_to_in_filename(docu->doc_url);    if(access(fn, F_OK) && access(infn, F_OK))    {      doc_make_clean_dir(docu);      url_forget_filename(docu->doc_url);    }    _free(infn);  }  _free(docu->mime);  _free(docu->type_str);  _free(docu->contents);  _free(docu->etag);  _free(docu->ftp_pasv_host);  _free(docu->additional_headers);  _free(docu->http_proxy);  if(!cfg.auth_reuse_nonce)  {    if(docu->auth_digest)      http_digest_deep_free(docu->auth_digest);    docu->auth_digest = NULL;  }  if(!cfg.auth_reuse_proxy_nonce)  {    if(docu->auth_proxy_digest)      http_digest_deep_free(docu->auth_proxy_digest);    docu->auth_proxy_digest = NULL;  }  gui_clear_status();}void doc_destroy(doc * docu){  short_log(docu, docu->doc_url);  if(docu->s_sock)    bufio_close(docu->s_sock);  if(docu->datasock)    bufio_close(docu->datasock);  _free(docu->mime);  _free(docu->type_str);  _free(docu->contents);  _free(docu->etag);  _free(docu->ftp_pasv_host);  _free(docu->additional_headers);  _free(docu->http_proxy);  if(!cfg.auth_reuse_nonce)  {    if(docu->auth_digest)      http_digest_deep_free(docu->auth_digest);    docu->auth_digest = NULL;  }  if(!cfg.auth_reuse_proxy_nonce)  {    if(docu->auth_proxy_digest)      http_digest_deep_free(docu->auth_proxy_digest);    docu->auth_proxy_digest = NULL;  }}#ifdef HAVE_MTvoid doc_finish_processing(doc * docp){  dllist *ptr = docp->msgbuf;  char *logstr = NULL;  pthread_setspecific(cfg.currdoc_key, (void *) NULL);  LOCK_OUTPUT;  while(ptr)  {    doc_msg *dm = (doc_msg *) ptr->data;    if(dm->log && cfg.logfile)    {      logstr = tl_str_concat(logstr, dm->msg, NULL);    }    xprintf(0, "%s", dm->msg);    _free(dm->msg);    _free(dm);    ptr = dllist_remove_entry(ptr, ptr);  }  UNLOCK_OUTPUT;  if(logstr)    log_str(logstr);}#endif /* HAVE_MT */void doc_update_parent_links(doc * docu){  if((cfg.mode != MODE_NOSTORE) &&    (cfg.dumpfd < 0) &&    (docu->doc_url->type != URLT_FILE) &&    !(docu->doc_url->status & URL_REDIRECT) &&    !(docu->doc_url->status & URL_ISLOCAL) && docu->doc_url->parent_url)  {    if(cfg.rewrite_links &&      !cfg.all_to_local && !cfg.sel_to_local && !cfg.all_to_remote)    {      gui_set_status(gettext("Rewriting links inside parent documents"));      rewrite_parents_links(docu->doc_url, NULL);    }  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -