📄 html.c
字号:
html_parser_add_tag_func(hp, (html_parser_func_t) html_parser_parse_tag_slash_a, &einfo); html_parser_add_tag_func(hp, html_parser_parse_tag_meta_refresh, NULL); if(cfg.condition.allow_robots) html_parser_add_tag_func(hp, (html_parser_func_t) html_parser_parse_tag_meta_robots, &oinfo); html_parser_add_attrib_func(hp, html_parser_url_to_absolute_url, NULL);#ifdef HAVE_REGEX if(rewrite && cfg.remove_adv && priv_cfg.advert_res) html_parser_add_attrib_func(hp, html_parser_remove_advertisement, NULL);#endif html_parser_add_attrib_func(hp, html_parser_process_base, NULL); html_parser_add_attrib_func(hp, (html_parser_func_t) html_parser_process_form, formlist); if(follow) html_parser_add_attrib_func(hp, (html_parser_func_t) html_parser_get_url, &einfo); if(rewrite && !cfg.post_update) html_parser_add_attrib_func(hp, (html_parser_func_t) html_parser_url_to_local, &rinfo); html_parser_add_style_func(hp, html_parser_style_to_absolute_urls, NULL); if(follow) html_parser_add_style_func(hp, (html_parser_func_t) html_parser_get_style_urls, &einfo); if(rewrite && !cfg.post_update) html_parser_add_style_func(hp, (html_parser_func_t) html_parser_style_to_local_urls, &rinfo); if(cfg.enable_js) { html_parser_add_script_func(hp, html_parser_parse_jspatterns, NULL); html_parser_add_script_func(hp, html_parser_parse_body_jspatterns, NULL);#ifdef HAVE_REGEX if(priv_cfg.js_transform) { html_parser_add_tag_func(hp, html_parser_parse_tag_jstransform, NULL); html_parser_add_script_func(hp, html_parser_parse_body_jstransform, NULL); }#endif } html_parser_parse(hp); if(rewrite) { _free(html_doc->contents); html_parser_take_document(hp, &html_doc->contents, &html_doc->size); } html_parser_kill(hp); /*** support for robots limits in META only ***/ /*** nofollow supported, rest doesn't have ***/ /*** any real meaning in pavuk ***/ if(!oinfo.follow) { DEBUG_HTML("NOFOLLOW attribute in meta data found\n"); while(einfo.urls) { free_deep_url((url *) einfo.urls->data); free((url *)einfo.urls->data); einfo.urls = dllist_remove_entry(einfo.urls, einfo.urls); } } return einfo.urls;}/*****************************************//* adjust URLs inside document to point *//* to present local documents *//*****************************************/void html_process_parent_document(doc * html_doc, url * url_old, char *url_new){ char *base, *baset; html_parser_t *hp; html_extract_info_t einfo; html_rewrite_info_t rinfo; html_change_info_t chinfo; int purestyle; int purescript; char *relfn = NULL; purestyle = (html_doc->doc_url->status & URL_STYLE); purescript = (html_doc->doc_url->status & URL_ISSCRIPT); if(cfg.all_to_local || cfg.sel_to_local || cfg.all_to_remote) return; einfo.prev_a = NULL; einfo.urls = NULL; einfo.no_limits = FALSE; einfo.only_inline = FALSE; einfo.enable_js = cfg.enable_js; rinfo.einfo = &einfo; rinfo.all_to_local = cfg.all_to_local; rinfo.selected_to_local = cfg.sel_to_local; rinfo.all_to_remote = cfg.all_to_remote; chinfo.url_old = url_old; if(url_new) chinfo.url_new = url_new; else if(cfg.post_update) { relfn = get_relative_path(url_to_filename(html_doc->doc_url, FALSE), url_to_filename(url_old, FALSE)); chinfo.url_new = relfn; } else chinfo.url_new = NULL; hp = html_parser_init(html_link_tags, html_link_tags_num(), TRUE, purestyle, purescript); html_get_base_url(html_doc, &base, &baset); html_parser_set_base(hp, base, baset); html_parser_set_document(hp, html_doc->doc_url, html_doc->contents, html_doc->size); html_parser_add_tag_func(hp, html_parser_parse_tag, NULL); html_parser_add_tag_func(hp, html_parser_parse_tag_meta_refresh, NULL); if(chinfo.url_new) html_parser_add_attrib_func(hp, (html_parser_func_t) html_parser_change_url, &chinfo); if(!cfg.post_update) html_parser_add_attrib_func(hp, (html_parser_func_t) html_parser_url_to_local, &rinfo); if(chinfo.url_new) html_parser_add_style_func(hp, (html_parser_func_t) html_parser_style_change_url, &chinfo); if(!cfg.post_update) html_parser_add_style_func(hp, (html_parser_func_t) html_parser_style_to_local_urls, &rinfo); if(cfg.enable_js) { html_parser_add_script_func(hp, html_parser_parse_jspatterns, NULL); html_parser_add_script_func(hp, html_parser_parse_body_jspatterns, NULL); } html_parser_parse(hp); _free(html_doc->contents); html_parser_take_document(hp, &html_doc->contents, &html_doc->size); html_parser_kill(hp); _free(relfn);}/*************************************************//* load parent document adjust it and store back *//* with locking and modification time preserving *//*************************************************/void rewrite_one_parent_links(url * doc_url, url * parent_url, char *dst_name){ char pom[PATH_MAX]; char *fnamep; char *rfn = NULL; char *savetmp, *p; int fd; doc pdoc; struct stat estat; struct utimbuf ut; int perm; url dum; DEBUG_PROCS("rewrite_one_parent_links()"); if(!parent_url || !(parent_url->status & URL_DOWNLOADED)) return; /*** parent document was not stored ***/ if(!cfg.store_index && url_is_dir_index(parent_url)) return; fnamep = url_to_filename(parent_url, FALSE); if(stat(fnamep, &estat) == 0) { if(S_ISDIR(estat.st_mode)) { xprintf(1, gettext("Can't work on directory\n")); return; } } else { xperror("stat"); return; } perm = estat.st_mode; ut.actime = estat.st_atime; ut.modtime = estat.st_mtime; memset(&dum, 0, sizeof(url)); dum.type = URLT_FILE; dum.p.file.filename = fnamep; dum.local_name = fnamep; dum.status = parent_url->status & URL_STYLE; dum.status &= ~URL_REDIRECT; doc_init(&pdoc, &dum); pdoc.report_size = FALSE; if(doc_download(&pdoc, TRUE, TRUE)) { doc_remove_lock(&pdoc); if(pdoc.errcode) report_error(&pdoc, gettext("rewrite parent")); return; } if(pdoc.errcode) report_error(&pdoc, gettext("rewrite parent")); _free(pdoc.mime); /* dst_name != NULL means child document was moved */ if(dst_name && !access(dst_name, R_OK) && !stat(dst_name, &estat) && !S_ISDIR(estat.st_mode)) { rfn = get_relative_path(fnamep, dst_name); } html_process_parent_document(&pdoc, doc_url, rfn); _free(rfn); strncpy(pom, fnamep, sizeof(pom) - 20); pom[sizeof(pom) - 21] = '\0'; p = strrchr(pom, '/'); if(p) sprintf(p + 1, "_*%d", (int) getpid()); else snprintf(pom, sizeof(pom), "%s/_*%d", pom, (int) getpid()); savetmp = tl_strdup(pom); rename(fnamep, savetmp); if((fd = open(fnamep, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY, S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH)) < 0) { xperror(fnamep); rename(savetmp, fnamep); doc_remove_lock(&pdoc); free(savetmp); free(pdoc.contents); return; } if(write(fd, pdoc.contents, pdoc.size) != pdoc.size) { xperror(fnamep); close(fd); rename(savetmp, fnamep); doc_remove_lock(&pdoc); free(savetmp); free(pdoc.contents); return; } close(fd); doc_remove_lock(&pdoc); utime(fnamep, &ut); chmod(fnamep, perm); unlink(savetmp); free(savetmp); free(pdoc.contents); DEBUG_PROCE("rewrite_one_parent_links()");}/*************************************************//* take all parent documents and adjust inside *//* all URLs, recurse up when document was moved *//*************************************************/void rewrite_parents_links(url * doc_url, char *dst_name){ char *fn = NULL; dllist *ptr; if((doc_url->status & URL_MOVED) && !dst_name) return; LOCK_URL(doc_url); for(ptr = doc_url->parent_url; ptr; ptr = ptr->next) { url *parent_url = (url *) ptr->data; if(cfg.rbreak) break; if(parent_url->status & URL_MOVED) { fn = dst_name ? dst_name : url_to_filename(doc_url, FALSE); rewrite_parents_links(parent_url, fn); } else { rewrite_one_parent_links(doc_url, parent_url, dst_name); } } UNLOCK_URL(doc_url);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -