⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doc.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 3 页
字号:
  }  return 0;}static int doc_open_new_in_file(doc * docu, int b_lock){  if((cfg.dumpfd < 0) && !docu->s_sock)  {    if(cfg.post_update && docu->type_str)    {      /** dirty hack, but is required to support **/      /** file naming by its MIME type     **/      url_forget_filename(docu->doc_url);      url_to_filename_with_type(docu->doc_url, docu->type_str, TRUE);    }    if(doc_lock(docu, b_lock))    {      docu->errcode = ERR_STORE_DOC;      return -1;    }  }  return 0;}/********************************************************//* nacitanie dokumentu + specificke upravy              *//* FIXME: Translate me!                                 *//********************************************************/static int doc_download_helper(doc * docu, int load, int b_lock){  char *p = NULL, *p1 = 0;  ssize_t len = 0;  int retcode = 0;  int rv;  struct stat estat;  bufio *saved_datasock = NULL;  if(doc_download_init(docu, load))    return -1;  gettimeofday(&docu->hr_start_time, NULL);  if(doc_check_doc_file(docu, &rv))    return rv;  if(doc_open_existing_in_file(docu, b_lock, &rv))    return rv;  if(docu->report_size)    gui_set_status(gettext("Opening connection"));  if(!(docu->datasock = abs_get_data_socket(docu)) &&    /*       pro: add test for errcodes. The errcode tests are needed;       otherwise the "if" some lines later will never get       anything to do...     */    docu->errcode != ERR_FTP_ACTUAL && docu->errcode != ERR_HTTP_ACTUAL)  {    if(docu->mime &&      docu->doc_url->type != URLT_FILE &&      !(docu->doc_url->status & URL_REDIRECT))      dinfo_save(docu);    docu->remove_lock = FALSE;    abs_close_socket(docu, FALSE);    return -1;  }  doc_etime(docu, TRUE);  if(docu->errcode == ERR_HTTP_ACTUAL || docu->errcode == ERR_FTP_ACTUAL)  {    abs_close_socket(docu, FALSE);    saved_datasock = docu->datasock;    if(docu->load || docu->is_parsable)    {      xprintf(1, gettext("Loading local copy\n"));      p = url_to_filename(docu->doc_url, TRUE);      if(!(docu->datasock = bufio_open(p, O_BINARY | O_RDONLY)))      {        docu->datasock = saved_datasock;        docu->errcode = ERR_FILE_OPEN;        return -1;      }      docu->doc_url->status |= URL_REDIRECT;      docu->doc_url->status |= URL_ISLOCAL;      docu->save_online = FALSE;    }    else    {      docu->doc_url->status |= URL_REDIRECT;      return 1;    }  }  else if(docu->errcode == ERR_HTTP_NOREGET)  {    if(!cfg.freget)    {      docu->is_persistent = FALSE;      abs_close_socket(docu, FALSE);      docu->remove_lock = FALSE;      docu->ftp_fatal_err = TRUE;      return -1;    }    else    {      docu->rest_pos = 0;    }  }  else if(docu->errcode == ERR_FTP_NOREGET)  {    if(!cfg.freget)    {      docu->is_persistent = FALSE;      abs_close_socket(docu, FALSE);      docu->remove_lock = FALSE;      docu->ftp_fatal_err = TRUE;      return -1;    }    else    {      docu->rest_pos = 0;    }  }  else if(docu->errcode)  {    if(docu->mime &&      docu->doc_url->type != URLT_FILE &&      !(docu->doc_url->status & URL_REDIRECT))      dinfo_save(docu);    docu->is_persistent = FALSE;    docu->ftp_fatal_err = TRUE;    abs_close_socket(docu, FALSE);    return -1;  }  if(doc_open_new_in_file(docu, b_lock))  {    docu->is_persistent = FALSE;    docu->ftp_fatal_err = TRUE;    abs_close_socket(docu, FALSE);    return -1;  }  if(docu->doc_url->type != URLT_FILE &&    !(docu->doc_url->status & URL_REDIRECT))    dinfo_save(docu);  if((((docu->doc_url->type == URLT_FTP ||          docu->doc_url->type == URLT_FTPS) &&        !docu->doc_url->p.ftp.dir) ||      (docu->doc_url->type == URLT_GOPHER &&        (docu->doc_url->p.gopher.selector[0] != '1' ||          docu->doc_url->p.gopher.selector[0] != 'h')) ||      (docu->doc_url->type == URLT_HTTP ||        docu->doc_url->type == URLT_HTTPS)) &&    !(docu->doc_url->status & URL_REDIRECT))  {    if(cfg.dumpfd >= 0 && cfg.dump_after)      docu->save_online = FALSE;    else      docu->save_online = TRUE;  }  if(cfg.ftp_html &&    (docu->doc_url->type == URLT_FTP ||      docu->doc_url->type == URLT_FTPS) &&    ext_is_html(docu->doc_url->p.ftp.path))  {    docu->is_parsable = TRUE;  }  if(docu->doc_url->status & URL_INNSCACHE)  {    fstat(bufio_getfd(docu->datasock), &estat);    docu->totsz = estat.st_size;    docu->is_parsable = (docu->doc_url->status & URL_ISHTML) != 0;  }  if(docu->errcode == ERR_HTTP_FAILREGET)  {    docu->rest_pos = 0;    docu->save_online = FALSE;  }  if(docu->save_online)  {    if(cfg.dumpfd < 0)    {      ftruncate(bufio_getfd(docu->s_sock), docu->rest_pos);      lseek(bufio_getfd(docu->s_sock), docu->rest_pos, SEEK_SET);      bufio_reset(docu->s_sock);    }  }  /* We measure time-to-first-byte here again, to add all the processing   * timing noise (1-2ms) to the large value -- FB, which is typically   * on the order of 100ms, rather then to the time-to-last-byte,   * which is often around 0ms   */  gettimeofday(&docu->first_byte_time, NULL);  retcode = doc_transfer_data(docu);  abs_close_socket(docu, TRUE);  if(!retcode)    docu->remove_lock = TRUE;  if(docu->errcode == ERR_HTTP_ACTUAL)  {    docu->doc_url->status &= ~URL_REDIRECT;    docu->datasock = saved_datasock;  }  if(!retcode && docu->doc_url->status & URL_INNSCACHE)  {    docu->is_parsable = (docu->doc_url->status & URL_ISHTML) != 0;  }  else if(!retcode &&    docu->doc_url->type == URLT_GOPHER &&    !(docu->doc_url->status & URL_REDIRECT) &&    !(priv_cfg.gopher_proxy && cfg.gopher_via_http))  {    docu->is_parsable = FALSE;    /**** convert Gopher directory to HTML ****/    if(docu->doc_url->p.gopher.selector[0] == '1')    {      if(!(docu->doc_url->status & URL_REDIRECT))        gopher_dir_to_html(docu);      docu->is_parsable = TRUE;    }    else if(docu->doc_url->p.gopher.selector[0] == 'h')    {      docu->is_parsable = TRUE;    }  }  else if(!retcode &&    (docu->doc_url->type == URLT_FTP ||      docu->doc_url->type == URLT_FTPS) &&    !(priv_cfg.ftp_proxy && cfg.ftp_via_http && !cfg.ftp_dirtyp) &&    !(docu->doc_url->status & URL_REDIRECT))  {    docu->is_parsable = ext_is_html(docu->doc_url->p.ftp.path) != 0;    /*** convert FTP directory listing to HTML ***/    if(docu->doc_url->p.ftp.dir)    {      if(!(docu->doc_url->status & URL_REDIRECT))      {        ftp_dir_to_html(docu);      }      docu->is_parsable = TRUE;    }  }  else if(docu->is_http_transfer && !retcode &&    !(docu->doc_url->status & URL_REDIRECT))  {    http_response *resp;    /*** get HTTP response status info ***/    resp = http_get_response_info(docu->mime);    if(resp)    {      /*** set proper HTTP error code ***/      if(resp->ret_code >= 400)      {        docu->errcode = 2000 + resp->ret_code;        http_response_free(resp);        return -1;      }      /*** redirect to other URL ***/      if(resp->ret_code == 303 ||        resp->ret_code == 302 ||        resp->ret_code == 307 || resp->ret_code == 301)      {        http_handle_redirect(docu, resp->ret_code);        http_response_free(resp);        if(docu->is_persistent)        {          if(docu->doc_url->moved_to &&            ((url_get_port(docu->doc_url) !=                url_get_port(docu->doc_url->moved_to))              || strcmp(url_get_site(docu->doc_url),                url_get_site(docu->doc_url->moved_to))))          {            abs_close_socket(docu, TRUE);          }          docu->is_persistent = FALSE;        }        return -1;      }      http_response_free(resp);    }    /*** check if document was downloaded whole  ***/    /*** when we know real document size and no  ***/    /*** other error was detected before         ***/    if(cfg.check_size && docu->totsz > 0 && docu->errcode == ERR_NOERROR)    {      /*** if docu->contents && docu->rest_pos  ***/      /*** document was reareaded from file and ***/      /*** docu->size is total len              ***/      if(docu->totsz != docu->size + (docu->contents ? 0 : docu->rest_pos))      {        docu->errcode = ERR_HTTP_TRUNC;        docu->remove_lock = FALSE;        retcode = -1;        xprintf(1, gettext("File may be truncated\n"));      }    }    /*** handle encoded document and decode   ***/    /*** it if possible and user requested it ***/    p = get_mime_param_val_str("Content-Encoding:", docu->mime);    if(cfg.use_enc && !retcode && p &&      (!strncasecmp(docu->type_str, "text/plain", 10) ||        !strncasecmp(docu->type_str, "text/css", 8) ||        !strncasecmp(docu->type_str, "text/html", 9)))    {      if((!strcasecmp(p, "x-gzip")) ||        (!strcasecmp(p, "gzip")) ||        (!strcasecmp(p, "x-compress")) || (!strcasecmp(p, "compress")))      {        if(!gzip_decode(docu->contents, docu->size,            &p1, &len, (docu->contents ? NULL : docu->lock_fn)))        {          docu->size = len;          _free(docu->contents);          docu->contents = p1;          xprintf(1, gettext("Decoding document - OK\n"));        }        else          xperror(gettext("Decoding document - failed\n"));      }      else if(!strcasecmp(p, "deflate"))      {        if(!inflate_decode(docu->contents, docu->size,            &p1, &len, (docu->contents ? NULL : docu->lock_fn)))        {          docu->size = len;          _free(docu->contents);          docu->contents = p1;          xprintf(1, gettext("Decoding document - OK\n"));        }        else        {          xperror(gettext("Deflating document - failed\n"));        }      }      else        xprintf(1, gettext("Unsupported document encoding\n"));    }    else if(p && !retcode)    {      xprintf(1,        gettext        ("Received Encoded file but decoding not allowed (untouched)\n"));    }    _free(p);  }  else  {    if(docu->doc_url->type == URLT_FILE ||      (docu->doc_url->status & URL_REDIRECT))    {      p1 = url_to_filename(docu->doc_url, TRUE);      if(file_is_html(p1))      {        docu->is_parsable = TRUE;      }    }    else    {      docu->is_parsable = FALSE;    }  }  if(docu->totsz > 0 &&    docu->size == 0 &&    (docu->doc_url->type == URLT_HTTP || docu->doc_url->type == URLT_HTTPS))  {    if(!docu->errcode)      docu->errcode = ERR_ZERO_SIZE;    docu->remove_lock = FALSE;    retcode = -1;  }#ifdef I_FACE  if(cfg.xi_face)    doc_set_info(docu);#endif  if(!retcode && docu->lock_fn && docu->save_online &&    (cfg.dumpfd < 0) && !docu->contents &&    (cfg.mode != MODE_NOSTORE) &&    (cfg.mode != MODE_FTPDIR) &&    !(docu->doc_url->status & URL_REDIRECT) &&    (docu->doc_url->type != URLT_FILE))  {    p1 = url_to_filename(docu->doc_url, TRUE);    if(!access(p1, F_OK))    {      if(unlink(p1))        xperror(p1);    }    if(link(docu->lock_fn, p1))    {#ifdef __CYGWIN__      if(errno != EPERM && errno != EACCES)#elif __BEOS__      /* ?? no working link() on BeOS ?? */      if(FALSE)#else      if(errno != EPERM)#endif        xperror(p1);      else      {        if(copy_fd_to_file(bufio_getfd(docu->s_sock), p1))          xperror(p1);      }    }    if(cfg.preserve_time && docu->dtime)    {      struct utimbuf utmbf;      stat(p1, &estat);      utmbf.actime = estat.st_atime;      utmbf.modtime = docu->dtime;      utime(p1, &utmbf);    }    if(cfg.preserve_perm &&      (docu->doc_url->type == URLT_FTP ||        docu->doc_url->type == URLT_FTPS) &&      docu->doc_url->extension &&      (((ftp_url_extension *) docu->doc_url->extension)->perm > 0))    {      chmod(p1, ((ftp_url_extension *) docu->doc_url->extension)->perm);    }  }  return retcode;}int doc_download(doc * docu, int load, int b_lock){  const int rc = doc_download_helper(docu, load, b_lock);  gettimeofday(&docu->end_time, NULL);  time_log(docu);  return rc;}/********************************************************//* ulozi dokument ak je to potrebne vytvori adresare    *//* FIXME: Translate me!                                 *//********************************************************/int doc_store(doc * docu, int overwrite){  char *pom;  int f;  struct utimbuf utmbf;  struct stat estat;  if(cfg.mode == MODE_NOSTORE || cfg.mode == MODE_FTPDIR)    return 0;  /*** don't store directory indexes ***/  if(!cfg.store_index && url_is_dir_index(docu->doc_url))    return 0;  pom = url_to_filename(docu->doc_url, TRUE);  if(makealldirs(pom))    xperror(pom);  if(!access(pom, R_OK) && !overwrite)  {    return 0;  }  /*     pro: before we open the file we unlink it. This way we assure that     other directory that have a hard link to our (old) file will still     have a hard link to the old file.   */  if(cfg.remove_before_store)  {    unlink(pom);  }  if((f =      open(pom, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,        S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR)) == -1)  {    if(!access(pom, R_OK))      unlink(pom);    xperror(pom);    return -1;  }  if(write(f, docu->contents, docu->size) != docu->size)  {    if(!access(pom, R_OK))      unlink(pom);    xperror(pom);    close(f);    return -1;  }  close(f);  if(docu->dtime && cfg.preserve_time)  {    utmbf.modtime = docu->dtime;  }  else  {    utmbf.modtime = docu->stime;  }  stat(pom, &estat);  utmbf.actime = estat.st_atime;  utime(pom, &utmbf);  if((docu->doc_url->type == URLT_FTP ||      docu->doc_url->type == URLT_FTPS) &&    docu->doc_url->extension &&    cfg.preserve_perm &&    (((ftp_url_extension *) docu->doc_url->extension)->perm > 0))  {    chmod(pom, ((ftp_url_extension *) docu->doc_url->extension)->perm);  }  return 0;}/*** remove improper documents if required ***/int doc_remove(url * urlr){  char *fn;#ifdef DEBUG  if(cfg.debug)  {    fn = url_to_urlstr(urlr, FALSE);    xprintf(1, gettext("Removing improper document : %s\n"), fn);    _free(fn);  }#endif  fn = url_to_filename(urlr, TRUE);  if(urlr->type == URLT_FTP || urlr->type == URLT_FTPS)  {    char *p;    p = strrchr(fn, '/');    if(p)      p++;    else      p = fn;    /*** if URL FTPdir index ***/    if(!strcmp(p, priv_cfg.index_name))      *p = '\0';    if(cfg.enable_info)      dinfo_remove(fn);    return unlink_recursive(fn);  }  else  {    if(cfg.enable_info)      dinfo_remove(fn);    if(!access(fn, F_OK) && unlink(fn))    {      xperror(fn);      return -1;    }  }  return 0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -