⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 html.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 2 页
字号:
  html_parser_add_tag_func(hp,    (html_parser_func_t) html_parser_parse_tag_slash_a, &einfo);  html_parser_add_tag_func(hp, html_parser_parse_tag_meta_refresh, NULL);  if(cfg.condition.allow_robots)    html_parser_add_tag_func(hp,      (html_parser_func_t) html_parser_parse_tag_meta_robots, &oinfo);  html_parser_add_attrib_func(hp, html_parser_url_to_absolute_url, NULL);#ifdef HAVE_REGEX  if(rewrite && cfg.remove_adv && priv_cfg.advert_res)    html_parser_add_attrib_func(hp, html_parser_remove_advertisement, NULL);#endif  html_parser_add_attrib_func(hp, html_parser_process_base, NULL);  html_parser_add_attrib_func(hp,    (html_parser_func_t) html_parser_process_form, formlist);  if(follow)    html_parser_add_attrib_func(hp,      (html_parser_func_t) html_parser_get_url, &einfo);  if(rewrite && !cfg.post_update)    html_parser_add_attrib_func(hp,      (html_parser_func_t) html_parser_url_to_local, &rinfo);  html_parser_add_style_func(hp, html_parser_style_to_absolute_urls, NULL);  if(follow)    html_parser_add_style_func(hp,      (html_parser_func_t) html_parser_get_style_urls, &einfo);  if(rewrite && !cfg.post_update)    html_parser_add_style_func(hp,      (html_parser_func_t) html_parser_style_to_local_urls, &rinfo);  if(cfg.enable_js)  {    html_parser_add_script_func(hp, html_parser_parse_jspatterns, NULL);    html_parser_add_script_func(hp, html_parser_parse_body_jspatterns, NULL);#ifdef HAVE_REGEX    if(priv_cfg.js_transform)    {      html_parser_add_tag_func(hp, html_parser_parse_tag_jstransform, NULL);      html_parser_add_script_func(hp, html_parser_parse_body_jstransform,        NULL);    }#endif  }  html_parser_parse(hp);  if(rewrite)  {    _free(html_doc->contents);    html_parser_take_document(hp, &html_doc->contents, &html_doc->size);  }  html_parser_kill(hp);  /*** support for robots limits in META only ***/  /*** nofollow supported, rest doesn't have  ***/  /*** any real meaning in pavuk              ***/  if(!oinfo.follow)  {    DEBUG_HTML("NOFOLLOW attribute in meta data found\n");    while(einfo.urls)    {      free_deep_url((url *) einfo.urls->data);      free((url *)einfo.urls->data);      einfo.urls = dllist_remove_entry(einfo.urls, einfo.urls);    }  }  return einfo.urls;}/*****************************************//* adjust URLs inside document to point  *//* to present local documents            *//*****************************************/void html_process_parent_document(doc * html_doc, url * url_old,  char *url_new){  char *base, *baset;  html_parser_t *hp;  html_extract_info_t einfo;  html_rewrite_info_t rinfo;  html_change_info_t chinfo;  int purestyle;  int purescript;  char *relfn = NULL;  purestyle = (html_doc->doc_url->status & URL_STYLE);  purescript = (html_doc->doc_url->status & URL_ISSCRIPT);  if(cfg.all_to_local || cfg.sel_to_local || cfg.all_to_remote)    return;  einfo.prev_a = NULL;  einfo.urls = NULL;  einfo.no_limits = FALSE;  einfo.only_inline = FALSE;  einfo.enable_js = cfg.enable_js;  rinfo.einfo = &einfo;  rinfo.all_to_local = cfg.all_to_local;  rinfo.selected_to_local = cfg.sel_to_local;  rinfo.all_to_remote = cfg.all_to_remote;  chinfo.url_old = url_old;  if(url_new)    chinfo.url_new = url_new;  else if(cfg.post_update)  {    relfn = get_relative_path(url_to_filename(html_doc->doc_url, FALSE),      url_to_filename(url_old, FALSE));    chinfo.url_new = relfn;  }  else    chinfo.url_new = NULL;  hp = html_parser_init(html_link_tags, html_link_tags_num(),    TRUE, purestyle, purescript);  html_get_base_url(html_doc, &base, &baset);  html_parser_set_base(hp, base, baset);  html_parser_set_document(hp, html_doc->doc_url,    html_doc->contents, html_doc->size);  html_parser_add_tag_func(hp, html_parser_parse_tag, NULL);  html_parser_add_tag_func(hp, html_parser_parse_tag_meta_refresh, NULL);  if(chinfo.url_new)    html_parser_add_attrib_func(hp,      (html_parser_func_t) html_parser_change_url, &chinfo);  if(!cfg.post_update)    html_parser_add_attrib_func(hp,      (html_parser_func_t) html_parser_url_to_local, &rinfo);  if(chinfo.url_new)    html_parser_add_style_func(hp,      (html_parser_func_t) html_parser_style_change_url, &chinfo);  if(!cfg.post_update)    html_parser_add_style_func(hp,      (html_parser_func_t) html_parser_style_to_local_urls, &rinfo);  if(cfg.enable_js)  {    html_parser_add_script_func(hp, html_parser_parse_jspatterns, NULL);    html_parser_add_script_func(hp, html_parser_parse_body_jspatterns, NULL);  }  html_parser_parse(hp);  _free(html_doc->contents);  html_parser_take_document(hp, &html_doc->contents, &html_doc->size);  html_parser_kill(hp);  _free(relfn);}/*************************************************//* load parent document adjust it and store back *//* with locking and modification time preserving *//*************************************************/void rewrite_one_parent_links(url * doc_url, url * parent_url, char *dst_name){  char pom[PATH_MAX];  char *fnamep;  char *rfn = NULL;  char *savetmp, *p;  int fd;  doc pdoc;  struct stat estat;  struct utimbuf ut;  int perm;  url dum;  DEBUG_PROCS("rewrite_one_parent_links()");  if(!parent_url || !(parent_url->status & URL_DOWNLOADED))    return;  /*** parent document was not stored ***/  if(!cfg.store_index && url_is_dir_index(parent_url))    return;  fnamep = url_to_filename(parent_url, FALSE);  if(stat(fnamep, &estat) == 0)  {    if(S_ISDIR(estat.st_mode))    {      xprintf(1, gettext("Can't work on directory\n"));      return;    }  }  else  {    xperror("stat");    return;  }  perm = estat.st_mode;  ut.actime = estat.st_atime;  ut.modtime = estat.st_mtime;  memset(&dum, 0, sizeof(url));  dum.type = URLT_FILE;  dum.p.file.filename = fnamep;  dum.local_name = fnamep;  dum.status = parent_url->status & URL_STYLE;  dum.status &= ~URL_REDIRECT;  doc_init(&pdoc, &dum);  pdoc.report_size = FALSE;  if(doc_download(&pdoc, TRUE, TRUE))  {    doc_remove_lock(&pdoc);    if(pdoc.errcode)      report_error(&pdoc, gettext("rewrite parent"));    return;  }  if(pdoc.errcode)    report_error(&pdoc, gettext("rewrite parent"));  _free(pdoc.mime);  /* dst_name != NULL means child document was moved */  if(dst_name &&    !access(dst_name, R_OK) &&    !stat(dst_name, &estat) && !S_ISDIR(estat.st_mode))  {    rfn = get_relative_path(fnamep, dst_name);  }  html_process_parent_document(&pdoc, doc_url, rfn);  _free(rfn);  strncpy(pom, fnamep, sizeof(pom) - 20);  pom[sizeof(pom) - 21] = '\0';  p = strrchr(pom, '/');  if(p)    sprintf(p + 1, "_*%d", (int) getpid());  else    snprintf(pom, sizeof(pom), "%s/_*%d", pom, (int) getpid());  savetmp = tl_strdup(pom);  rename(fnamep, savetmp);  if((fd =      open(fnamep, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,        S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH)) < 0)  {    xperror(fnamep);    rename(savetmp, fnamep);    doc_remove_lock(&pdoc);    free(savetmp);    free(pdoc.contents);    return;  }  if(write(fd, pdoc.contents, pdoc.size) != pdoc.size)  {    xperror(fnamep);    close(fd);    rename(savetmp, fnamep);    doc_remove_lock(&pdoc);    free(savetmp);    free(pdoc.contents);    return;  }  close(fd);  doc_remove_lock(&pdoc);  utime(fnamep, &ut);  chmod(fnamep, perm);  unlink(savetmp);  free(savetmp);  free(pdoc.contents);  DEBUG_PROCE("rewrite_one_parent_links()");}/*************************************************//* take all parent documents and adjust inside   *//* all URLs, recurse up when document was moved  *//*************************************************/void rewrite_parents_links(url * doc_url, char *dst_name){  char *fn = NULL;  dllist *ptr;  if((doc_url->status & URL_MOVED) && !dst_name)    return;  LOCK_URL(doc_url);  for(ptr = doc_url->parent_url; ptr; ptr = ptr->next)  {    url *parent_url = (url *) ptr->data;    if(cfg.rbreak)      break;    if(parent_url->status & URL_MOVED)    {      fn = dst_name ? dst_name : url_to_filename(doc_url, FALSE);      rewrite_parents_links(parent_url, fn);    }    else    {      rewrite_one_parent_links(doc_url, parent_url, dst_name);    }  }  UNLOCK_URL(doc_url);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -