htmlparser.c
来自「网络爬虫程序」· C语言 代码 · 共 1,536 行 · 第 1/3 页
C
1,536 行
} if(tagstart || stylestart) { hpinfo->stack[hpinfo->stack_offset] = '\0'; html_parser_flush_stack_to_output(hpinfo); } if(hpinfo->rewrite) hpinfo->out_content[hpinfo->out_offset] = '\0';}/********************************************//* functions for processing whole HTML tags *//********************************************/void html_parser_parse_tag(html_parser_t * hpinfo, char *stack, void *data){ int j; dllist *ptr; if(!html_parser_check_tag(hpinfo, hpinfo->stack + 1)) return; if(hpinfo->current_tag->type == HTML_TAG_META) return; for(j = 0; hpinfo->current_tag->attribs[j].attrib; j++) { hpinfo->current_attrib = &hpinfo->current_tag->attribs[j]; if(hpinfo->current_attrib->stat & LINK_DISABLED) continue; hpinfo->tag_attrib = html_get_attrib_from_tag(hpinfo->stack, hpinfo->current_attrib->attrib); /*** -dont_touch_url_pattern support ***/ if(hpinfo->tag_attrib && cfg.dont_touch_url_pattern) { if(is_in_pattern_list(hpinfo->tag_attrib, cfg.dont_touch_url_pattern)) { _free(hpinfo->tag_attrib); } }#ifdef HAVE_REGEX /*** -dont_touch_url_rpattern support ***/ for(ptr = cfg.dont_touch_url_rpattern; ptr && hpinfo->tag_attrib; ptr = ptr->next) { if(re_pmatch((re_entry *) ptr->data, hpinfo->tag_attrib)) _free(hpinfo->tag_attrib); } /*** -dont_touch_tag_rpattern support ***/ for(ptr = cfg.dont_touch_tag_rpattern; ptr && hpinfo->tag_attrib; ptr = ptr->next) { if(re_pmatch((re_entry *)ptr->data, hpinfo->stack)) _free(hpinfo->tag_attrib); }#endif if(hpinfo->tag_attrib) { /* to support javascript:... URLs */ /* inside any attribute */ if(!strncasecmp(hpinfo->tag_attrib, "javascript:", 11)) { char *saved_attrib = hpinfo->tag_attrib; hpinfo->tag_attrib = tl_strdup(saved_attrib + 11); html_parser_call_funcs(hpinfo, hpinfo->script_funcs); if(hpinfo->rewrite) { int len; len = strlen(hpinfo->tag_attrib); saved_attrib = _realloc(saved_attrib, 12 + len); memcpy(saved_attrib + 11, hpinfo->tag_attrib, len + 1); _free(hpinfo->tag_attrib); hpinfo->tag_attrib = saved_attrib; } else _free(saved_attrib); } else if(hpinfo->current_attrib->stat & LINK_STYLE) html_parser_call_funcs(hpinfo, hpinfo->style_funcs); else if(hpinfo->current_attrib->stat & LINK_JS) html_parser_call_funcs(hpinfo, hpinfo->script_funcs); else html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs); } if(hpinfo->rewrite && hpinfo->tag_attrib) { int l = strlen(hpinfo->tag_attrib); html_parser_SEND(hpinfo); html_parser_SEXPAND(hpinfo, l); html_replace_url_in_stack(hpinfo->stack, hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE); } _free(hpinfo->tag_attrib); }}void html_parser_parse_tag_slash_a(html_parser_t * hpinfo, char *stack, html_extract_info_t * einfo){ if(einfo->prev_a && !strcasecmp(hpinfo->stack, "</A>")) { einfo->prev_a = NULL; }}void html_parser_parse_tag_meta_refresh(html_parser_t * hpinfo, char *stack, void *data){ char *saved_meta = (char *) 0; char *meta_type; if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META) return; hpinfo->current_attrib = &hpinfo->current_tag->attribs[0]; meta_type = html_get_attrib_from_tag(hpinfo->stack, "HTTP-EQUIV"); if(!meta_type || strcasecmp(meta_type, "Refresh")) { _free(meta_type); return; } _free(meta_type); saved_meta = html_get_attrib_from_tag(hpinfo->stack, "CONTENT"); if(!saved_meta) return; hpinfo->tag_attrib = html_get_attrib_from_tag(saved_meta, "URL"); if(hpinfo->tag_attrib) { html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs); if(hpinfo->rewrite) { /* little hack to prevent writing */ /* outside of allocated memory chunk */ saved_meta = _realloc(saved_meta, strlen(saved_meta) + strlen(hpinfo->tag_attrib) + 4); html_replace_url_in_stack(saved_meta, "URL", hpinfo->tag_attrib, TRUE); _free(hpinfo->tag_attrib); hpinfo->tag_attrib = saved_meta; if(hpinfo->tag_attrib) { int l = strlen(hpinfo->tag_attrib); html_parser_SEND(hpinfo); html_parser_SEXPAND(hpinfo, l); html_replace_url_in_stack(hpinfo->stack, hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE); hpinfo->tag_attrib = 0; } } else { _free(hpinfo->tag_attrib); } } _free(saved_meta);}void html_parser_parse_tag_meta_robots(html_parser_t * hpinfo, char *stack, html_robots_info_t * oinfo){ char *meta_type; char *content; char **flags; int i; if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META) return; meta_type = html_get_attrib_from_tag(hpinfo->stack, "NAME"); if(!meta_type || strcasecmp(meta_type, "Robots")) { _free(meta_type); return; } _free(meta_type); content = html_get_attrib_from_tag(hpinfo->stack, "CONTENT"); if(!content) return; flags = tl_str_split(content, ","); _free(content); for(i = 0; flags && flags[i]; i++) { if(!strcasecmp(flags[i], "all")) { oinfo->index = TRUE; oinfo->follow = TRUE; oinfo->images = TRUE; } else if(!strcasecmp(flags[i], "none")) { oinfo->index = FALSE; oinfo->follow = FALSE; oinfo->images = FALSE; } else if(!strcasecmp(flags[i], "index")) oinfo->index = TRUE; else if(!strcasecmp(flags[i], "follow")) oinfo->follow = TRUE; else if(!strcasecmp(flags[i], "noimageindex")) oinfo->images = FALSE; else if(!strcasecmp(flags[i], "noindex")) oinfo->index = FALSE; else if(!strcasecmp(flags[i], "nofollow")) oinfo->follow = FALSE; _free(flags[i]); } _free(flags);}void html_parser_parse_tag_jstransform(html_parser_t * hpinfo, char *stack, void *data){#ifdef HAVE_REGEX dllist *ptr; html_tag_t t = { HTML_TAG_HACK, "HACK", {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD}, {HTML_ATTRIB_NULL, NULL, 0}} }; for(ptr = priv_cfg.js_transform; ptr; ptr = ptr->next) { js_transform_t *jt = (js_transform_t *) ptr->data; if(js_transform_match_tag(jt, hpinfo->stack)) { int nsub, *subs; char *attr = html_get_attrib_from_tag(hpinfo->stack, jt->attrib); if(!attr) continue; if(!re_pmatch_subs(jt->re, attr, &nsub, &subs)) { _free(attr); continue; } hpinfo->tag_attrib = js_transform_apply(jt, attr, nsub, subs); /*****************************************/ /* quite dirty hack to make happy attrib */ /* parsing funcs which require valid */ /* current_tag & current_attrib */ /*****************************************/ hpinfo->current_tag = &t; hpinfo->current_attrib = &(t.attribs[0]); if(hpinfo->tag_attrib) html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs); if(hpinfo->rewrite && jt->type == 1 && nsub) { int l = strlen(hpinfo->tag_attrib); attr = _realloc(attr, strlen(attr) + l + 1); memmove(attr + l + subs[2], attr + subs[3], strlen(attr + subs[3]) + 1); memcpy(attr + subs[2], hpinfo->tag_attrib, l); l = strlen(attr); html_parser_SEND(hpinfo); html_parser_SEXPAND(hpinfo, l); html_replace_url_in_stack(hpinfo->stack, jt->attrib, attr, FALSE); } _free(subs); _free(attr); /* :-) unhack */ hpinfo->current_tag = NULL; hpinfo->current_attrib = NULL; _free(hpinfo->tag_attrib); } }#endif}/********************************************************//* functions for processing URL attributes of HTML tags *//********************************************************/void html_parser_url_to_absolute_url(html_parser_t * hpinfo, char *stack, void *data){ char *ustr; /* printf("http_parser sees %s %s=\"%s\"<\n", hpinfo->current_tag->tag, hpinfo->current_attrib->attrib, hpinfo->tag_attrib); */ ustr = url_to_absolute_url(hpinfo->base, hpinfo->baset, hpinfo->doc_url, hpinfo->tag_attrib); if(ustr && *ustr) { DEBUG_HTML("Rewriting URL (to abs) - %s -> %s\n", hpinfo->tag_attrib, ustr); _free(hpinfo->tag_attrib); hpinfo->tag_attrib = ustr; }}void html_parser_process_base(html_parser_t * hpinfo, char *stack, void *data){ if(hpinfo->current_tag->type == HTML_TAG_BASE && hpinfo->current_attrib->type == HTML_ATTRIB_HREF) { int lp, ls; html_parser_process_new_base_url(hpinfo, hpinfo->tag_attrib); /* comment BASE tag because pavuk */ /* overwrites URLs according to this tag */ lp = strlen(COMMENT_PREFIX); ls = strlen(COMMENT_SUFFIX); html_parser_SEND(hpinfo); html_parser_SEXPAND(hpinfo, (lp + ls)); memmove(hpinfo->stack + lp, hpinfo->stack, strlen(hpinfo->stack) + 1); memcpy(hpinfo->stack, COMMENT_PREFIX, lp); strcat(hpinfo->stack, COMMENT_SUFFIX); }}void html_parser_process_form(html_parser_t * hpinfo, char *stack, dllist ** formlist){ if(hpinfo->current_attrib->stat & LINK_FORM && hpinfo->current_attrib->type == HTML_ATTRIB_ACTION) { hpinfo->doc_url->status |= URL_HAVE_FORMS; if(formlist && hpinfo->tag_attrib) { *formlist = dllist_append(*formlist, (dllist_t) tl_strdup(hpinfo->tag_attrib)); } }}void html_parser_get_url(html_parser_t * hpinfo, char *stack, html_extract_info_t * einfo){ if(*hpinfo->tag_attrib /* Never follow "" */ && (hpinfo->current_attrib->stat & LINK_DOWNLD) && (!einfo->only_inline || (einfo->only_inline && hpinfo->current_attrib->stat & LINK_INLINE)) && (!(hpinfo->current_attrib->stat & LINK_SCRIPT) || (einfo->enable_js && hpinfo->current_attrib->stat & LINK_SCRIPT))) { url *purl = (url *) 0; cond_info_t condp; condp.level = 0; condp.urlnr = 0; condp.size = 0; condp.time = 0L; condp.mimet = NULL; condp.full_tag = stack; condp.params = NULL; condp.html_doc = hpinfo->in_content; condp.html_doc_offset = hpinfo->in_offset; condp.tag = hpinfo->current_tag ? hpinfo->current_tag->tag : NULL; condp.attrib = hpinfo->current_attrib ? hpinfo->current_attrib->attrib : NULL; purl = url_parse(hpinfo->tag_attrib); assert(purl->type != URLT_FROMPARENT); url_path_abs(purl); if(hpinfo->current_attrib->stat & LINK_INLINE) purl->status |= URL_INLINE_OBJ; if(hpinfo->current_attrib->stat & LINK_SCRIPT) purl->status |= URL_ISSCRIPT; purl->level = hpinfo->doc_url->level + 1; purl->parent_url = dllist_append(purl->parent_url, (dllist_t) hpinfo->doc_url); /*****************************************************/ /* if we are in SYNC/MIRROR mode try to get original */ /* URL rather than processing it as file */ /* (mandatory thing to get working SYNC/MIRROR mode) */ /*****************************************************/ if((cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR) && cfg.request && (purl->type == URLT_FILE)) { url *pomurl = filename_to_url(purl->p.file.filename); if(pomurl) { free_deep_url(purl); _free(purl); purl = pomurl; } } /**********************************/ /* remove last anchor URL because */ /* it is server side image map */ /**********************************/ if(einfo->prev_a && hpinfo->current_tag->type == HTML_TAG_IMG && hpinfo->current_attrib->type == HTML_ATTRIB_SRC && html_tag_co_elem(hpinfo->stack, "ISMAP")) { DEBUG_HTML("Removing server image map\n"); free_deep_url((url *) einfo->prev_a->data); free((url *) einfo->prev_a->data); einfo->urls = dllist_remove_entry(einfo->urls, einfo->prev_a); einfo->prev_a = NULL; } if(hpinfo->current_tag->type == HTML_TAG_A && hpinfo->current_attrib->type == HTML_ATTRIB_HREF) { einfo->prev_a = NULL; } /* Do not accept links, which only link inside the already loaded document like <a href="#top">. This is a local relative reference, so remove it. */ if((hpinfo->current_attrib->type == HTML_ATTRIB_USEMAP || hpinfo->current_attrib->type == HTML_ATTRIB_HREF) && hpinfo->tag_attrib[0] == '#') { LOCK_REJCNT; cfg.reject_cnt++; UNLOCK_REJCNT; DEBUG_HTML("Rejecting local anchor URL - %s\n", hpinfo->tag_attrib); } else if(einfo->no_limits || url_append_condition(purl, &condp)) { DEBUG_HTML("Accepting URL - %s\n", hpinfo->tag_attrib); /***************************************/ /* process special add-on tag PAVUKEXT */ /* where are stored some additional */ /* informations about FTP URLs */ /***************************************/ if(purl->type == URLT_FTP || purl->type == URLT_FTPS) { char *pext; pext = html_get_attrib_from_tag(hpinfo->stack, "PAVUKEXT"); if(pext) { ftp_url_extension *uext; uext = ftp_parse_ftpinf_ext(pext); purl->extension = uext; if(uext->type == FTP_TYPE_D) purl->p.ftp.dir = TRUE; } _free(pext); } einfo->urls = dllist_append(einfo->urls, (dllist_t) purl); if(hpinfo->current_tag->type == HTML_TAG_A && hpinfo->current_attrib->type == HTML_ATTRIB_HREF) { einfo->prev_a = dllist_last(einfo->urls); } } else { LOCK_REJCNT; cfg.reject_cnt++; UNLOCK_REJCNT; DEBUG_HTML("Rejecting URL - %s\n", hpinfo->tag_attrib); free_deep_url(purl); _free(purl);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?