htmlparser.c
来自「网络爬虫程序」· C语言 代码 · 共 1,536 行 · 第 1/3 页
C
1,536 行
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <string.h>

#include "htmlparser.h"
#include "tools.h"
#include "css.h"
#include "re.h"
#include "ftp.h"
#include "jstrans.h"

/* Tag lookup table shared by every parser instance; it is built by the */
/* first html_parser_init() call and released by html_parser_do_cleanup() */
static dlhash *html_parser_tag_hash = NULL;

#define COMMENT_PREFIX "<!-- "
#define COMMENT_SUFFIX " -->"
#define ADVERT_PREFIX "adv<!-- Removed by pavuk "
#define ADVERT_SUFFIX " -->"

/*
 * Hash comparison callback: returns non-zero when both keys (treated as
 * C strings) are equal ignoring case, zero otherwise.
 */
static int html_parser_tag_comp_func(dllist_t key1, dllist_t key2)
{
  return (!strcasecmp((void *) key1, (void *) key2));
}

/*
 * Hash callback: sums the lower-cased bytes of the key string modulo
 * size, so keys differing only in case land in the same bucket.
 */
static unsigned int html_parser_tag_hash_func(unsigned int size, dllist_t key)
{
  unsigned char *p = (unsigned char *) key;
  unsigned int retv = 0;

  while(*p)
  {
    retv = (retv + tl_ascii_tolower(*p)) % size;
    p++;
  }

  return retv;
}

/*
 * Key extraction callback: the key of a stored html_tag_t is its tag
 * member.
 */
static dllist_t html_parser_tag_key_func(dllist_t data)
{
  return (dllist_t) ((html_tag_t *) data)->tag;
}

/*
 * Allocates a parser object with every field cleared.
 *
 * tags, ntags        - tag table; used only when the shared tag hash has
 *                      not been built yet (entries are stored by address)
 * with_tag_rewriting - stored as the rewrite flag; when non-zero the
 *                      parser also produces an output document
 * purestyle          - non-zero: parsing starts in style mode
 * purescript         - non-zero: parsing starts in script mode
 *
 * Returns the new parser; release it with html_parser_kill().
 */
html_parser_t *html_parser_init(html_tag_t * tags, int ntags, int with_tag_rewriting, int purestyle, int purescript)
{
  html_parser_t *rv;
  int i;

  rv = _malloc(sizeof(html_parser_t));

  rv->rewrite = with_tag_rewriting;
  rv->purestyle = purestyle;
  rv->purescript = purescript;
  /* doc_url is read by html_parser_process_new_base_url(), so it must */
  /* not be left uninitialized until html_parser_set_document() runs */
  rv->doc_url = NULL;
  rv->in_content = NULL;
  rv->out_content = NULL;
  rv->in_size = 0;
  rv->aout_size = 0;
  rv->out_offset = 0;
  rv->in_offset = 0;
  rv->stack = NULL;
  rv->stack_size = 0;
  rv->stack_offset = 0;
  rv->base = NULL;
  rv->baset = NULL;
  rv->tag_attrib = NULL;
  rv->tag_funcs = NULL;
  rv->attrib_funcs = NULL;
  rv->style_funcs = NULL;
  rv->script_funcs = NULL;
  rv->current_tag = NULL;
  rv->current_attrib = NULL;

  LOCK_TAG_HASH;
  if(html_parser_tag_hash)
    rv->tag_hash = html_parser_tag_hash;
  else
  {
    rv->tag_hash = dlhash_new(20, html_parser_tag_key_func, html_parser_tag_hash_func, html_parser_tag_comp_func);
    for(i = 0; i < ntags; i++)
      dlhash_insert(rv->tag_hash, (dllist_t) &tags[i]);
    html_parser_tag_hash = rv->tag_hash;
  }
  UNLOCK_TAG_HASH;

  return rv;
}

/*
 * Releases the shared tag hash.
 *
 * The file-scope pointer is reset afterwards so that a later
 * html_parser_init() builds a fresh table instead of reusing freed memory.
 * NOTE(review): parsers created earlier still hold the old pointer in
 * their tag_hash member - presumably this is called only at shutdown.
 */
void html_parser_do_cleanup(void)
{
  if(html_parser_tag_hash)
  {
    dlhash_free(html_parser_tag_hash);
    html_parser_tag_hash = NULL;
  }
}

/*
 * Destroys a parser: frees every registered callback record, the stack
 * and output buffers, the base strings and finally the parser itself.
 * The input document and the document URL are not freed here.
 */
void html_parser_kill(html_parser_t * hpinfo)
{
/* frees the payload of each list entry and unlinks the entry until */
/* the chain is empty */
#define KILL_FUNC_CHAIN(chain) \
  while (chain) \
  { \
    if(chain->data) free((void *)chain->data);\
    chain = dllist_remove_entry(chain, chain);\
  }

  KILL_FUNC_CHAIN(hpinfo->tag_funcs);
  KILL_FUNC_CHAIN(hpinfo->attrib_funcs);
  KILL_FUNC_CHAIN(hpinfo->style_funcs);
  KILL_FUNC_CHAIN(hpinfo->script_funcs);

  _free(hpinfo->stack);
  _free(hpinfo->out_content);
  _free(hpinfo->base);
  _free(hpinfo->baset);
  _free(hpinfo);
}

/*
 * Allocates a callback record for func/data, appends it to chain and
 * returns the resulting list. Shared by the html_parser_add_*_func()
 * entry points below.
 */
static dllist *html_parser_append_func(dllist * chain, html_parser_func_t func, void *data)
{
  html_parser_func_info_t *nfunc;

  nfunc = _malloc(sizeof(html_parser_func_info_t));
  nfunc->func = func;
  nfunc->data = data;

  return dllist_append(chain, (dllist_t) nfunc);
}

/*
 * Registers a callback on the tag_funcs chain, which html_parser_parse()
 * runs for each complete tag it collects. data is handed back to func
 * on every call.
 */
void html_parser_add_tag_func(html_parser_t * hpinfo, html_parser_func_t func, void *data)
{
  hpinfo->tag_funcs = html_parser_append_func(hpinfo->tag_funcs, func, data);
}

/*
 * Registers a callback on the attrib_funcs chain. data is stored with
 * the callback record.
 */
void html_parser_add_attrib_func(html_parser_t * hpinfo, html_parser_func_t func, void *data)
{
  hpinfo->attrib_funcs = html_parser_append_func(hpinfo->attrib_funcs, func, data);
}

/*
 * Registers a callback on the style_funcs chain, which
 * html_parser_parse() runs for each collected style section. data is
 * handed back to func on every call.
 */
void html_parser_add_style_func(html_parser_t * hpinfo, html_parser_func_t func, void *data)
{
  hpinfo->style_funcs = html_parser_append_func(hpinfo->style_funcs, func, data);
}

/*
 * Registers a callback on the script_funcs chain, which
 * html_parser_parse() runs for each collected script section. data is
 * handed back to func on every call.
 */
void html_parser_add_script_func(html_parser_t * hpinfo, html_parser_func_t func, void *data)
{
  hpinfo->script_funcs = html_parser_append_func(hpinfo->script_funcs, func, data);
}

/*
 * Attaches the document to parse. Only the pointers and the size are
 * stored; nothing is copied.
 */
void html_parser_set_document(html_parser_t * hpinfo, url * doc_url, char *content, ssize_t size)
{
  hpinfo->doc_url = doc_url;
  hpinfo->in_content = content;
  hpinfo->in_size = size;
}

/*
 * Hands the output buffer and its used length over to the caller and
 * detaches it from the parser, so html_parser_kill() will not free it.
 */
void html_parser_take_document(html_parser_t * hpinfo, char **out_content, ssize_t * out_size)
{
  *out_content = hpinfo->out_content;
  *out_size = hpinfo->out_offset;

  hpinfo->out_content = NULL;
  hpinfo->out_offset = 0;
  hpinfo->aout_size = 0;
}

/*
 * Replaces the stored base and/or baset strings. A NULL argument leaves
 * the corresponding value untouched. The parser stores the passed
 * pointers directly and frees them later, so the caller must not free
 * them.
 */
void html_parser_set_base(html_parser_t * hpinfo, char *base, char *baset)
{
  if(base)
  {
    _free(hpinfo->base);
    hpinfo->base = base;
  }

  if(baset)
  {
    _free(hpinfo->baset);
    hpinfo->baset = baset;
  }
}

/*
 * Computes a new base URL from the given base attribute value and stores
 * it in hpinfo->base.
 *
 * For an unsupported protocol the attribute is copied verbatim. Otherwise
 * it is made absolute, anything from the last '?' on is cut off, and
 * unless the result is already a directory name it is truncated after
 * its last '/'. When no absolute URL can be built the current base is
 * left unchanged.
 */
static void html_parser_process_new_base_url(html_parser_t * hpinfo, char *baseattr)
{
  url *purl;
  char *newbase;

  purl = url_parse(baseattr);
  assert(purl->type != URLT_FROMPARENT);

  if(!prottable[purl->type].supported)
  {
    xprintf(1, gettext("Unsupported BASE URL - %s (probably bad handled)\n"), baseattr);
    newbase = tl_strdup(baseattr);
  }
  else
  {
    char *idx;

    newbase = url_to_absolute_url(hpinfo->base, hpinfo->baset, hpinfo->doc_url, baseattr);

    if(!newbase)
    {
      /* collect base="" and ignore it; the parsed URL must still be */
      /* released on this path, otherwise it is leaked */
      free_deep_url(purl);
      _free(purl);
      return;
    }

    if((idx = strrchr(newbase, '?')))
      *idx = '\0';

    if(!tl_is_dirname(newbase))
    {
      idx = strrchr(newbase, '/');
      if(idx)
        *(idx + 1) = '\0';
    }
  }

  DEBUG_HTML("NEW BASE URL - %s\n", newbase);

  free_deep_url(purl);
  _free(purl);

  _free(hpinfo->base);
  hpinfo->base = newbase;
}

/*
 * Invokes every callback of the given chain in list order, passing the
 * parser, the current stack buffer and the data registered with the
 * callback.
 */
static void html_parser_call_funcs(html_parser_t * hpinfo, dllist * funcs)
{
  dllist *ptr;

  for(ptr = funcs; ptr; ptr = ptr->next)
  {
    html_parser_func_info_t *fi = (html_parser_func_info_t *) ptr->data;

    fi->func(hpinfo, hpinfo->stack, fi->data);
  }
}

/*
 * Appends the NUL terminated stack buffer to the output document and
 * empties the stack. Does nothing at all (the stack offset is not reset
 * either) when rewriting is disabled.
 */
static void html_parser_flush_stack_to_output(html_parser_t * hpinfo)
{
  int l;

  if(!hpinfo->rewrite)
    return;

  l = strlen(hpinfo->stack);

  /* html_parser_MEXPAND is defined outside this file - presumably it */
  /* grows out_content so that l more bytes fit */
  html_parser_MEXPAND(hpinfo, l)
  memcpy(hpinfo->out_content + hpinfo->out_offset, hpinfo->stack, l);
  hpinfo->out_offset += l;
  hpinfo->stack_offset = 0;
}

/*
 * Looks up the tag whose name starts at tagstart in the tag hash and
 * stores the result in hpinfo->current_tag (NULL when not found).
 *
 * The name is the leading run of ASCII letters; the lookup happens only
 * when that run is followed by blank, tab, CR, LF, '>' or the string
 * terminator (strchr() also matches the terminating NUL).
 *
 * Returns non-zero when a known tag was found.
 */
static int html_parser_check_tag(html_parser_t * hpinfo, char *tagstart)
{
  int tl;

  hpinfo->current_tag = NULL;

  for(tl = 0; tl_ascii_isalpha(tagstart[tl]); tl++);

  if(strchr(" \t\r\n>", tagstart[tl]))
  {
    char *tagname;

    tagname = tl_strndup(tagstart, tl);
    hpinfo->current_tag = (html_tag_t *) dlhash_find_by_key(hpinfo->tag_hash, (dllist_t) tagname);
    _free(tagname);
  }

  return (hpinfo->current_tag != NULL);
}

/*
 * Prepares the parser for a run: rewinds the input offset, allocates the
 * output buffer (input size plus html_parser_FENDER bytes) when rewriting
 * is enabled, and allocates an empty stack buffer.
 * NOTE(review): buffers left from a previous run are overwritten without
 * being freed - confirm that callers never parse twice with one parser.
 */
static void html_parser_parse_init(html_parser_t * hpinfo)
{
  hpinfo->in_offset = 0;

  if(hpinfo->rewrite)
  {
    hpinfo->aout_size = hpinfo->in_size + html_parser_FENDER;
    hpinfo->out_content = _malloc(hpinfo->aout_size);
    hpinfo->out_offset = 0;
  }

  hpinfo->stack_size = 2 * html_parser_FENDER;
  hpinfo->stack = _malloc(hpinfo->stack_size);
  hpinfo->stack_offset = 0;
}

/*
 * Walks the input document byte by byte. Tags, style sections and script
 * sections are collected in the stack buffer; when one is complete the
 * matching callback chain is run on it and the stack is flushed to the
 * output. All other bytes are copied straight to the output when
 * rewriting is enabled.
 */
void html_parser_parse(html_parser_t * hpinfo)
{
  int tagstart = FALSE;
  int scriptstart = FALSE;
  int commentstart = FALSE;
  int stylestart = FALSE;
  int singlequoteintag = FALSE;
  int doublequoteintag = FALSE;
  char *p;

  html_parser_parse_init(hpinfo);

  if(hpinfo->purestyle)
    stylestart = TRUE;

  if(hpinfo->purescript)
    scriptstart = TRUE;

  for(p = hpinfo->in_content; (p - hpinfo->in_content) < hpinfo->in_size; p++, hpinfo->in_offset++)
  {
    if(stylestart)
    {
      /* the '<' of the closing tag is still stored as part of the */
      /* style text before the callbacks run */
      if(!strncasecmp(p, "</STYLE", 7))
      {
        stylestart = FALSE;
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
        html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
        html_parser_flush_stack_to_output(hpinfo);
      }
      else
      {
        html_parser_SEXPAND(hpinfo, 1)
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack_offset++;
      }
      continue;
    }

    if(scriptstart)
    {
      /* unlike the style case this looks one byte ahead, so the script */
      /* text ends with the byte just before the closing tag */
      if(!strncasecmp(p + 1, "</SCRIPT", 8))
      {
        scriptstart = FALSE;
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
        html_parser_flush_stack_to_output(hpinfo);
      }
      else
      {
        html_parser_SEXPAND(hpinfo, 1)
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack_offset++;
      }
      continue;
    }

    if(commentstart)
    {
      /* comment bytes bypass the stack and go directly to the output */
      if(!strncmp(p, "-->", 3))
        commentstart = FALSE;
      if(hpinfo->rewrite)
      {
        hpinfo->out_content[hpinfo->out_offset] = *p;
        hpinfo->out_offset++;
      }
      continue;
    }

    if((*p == '\"') && tagstart && !singlequoteintag)
    {
      if(doublequoteintag)
      {
        doublequoteintag = FALSE;
      }
      else
      {
        doublequoteintag = TRUE;
      }
    }
    else if((*p == '\'') && tagstart && !doublequoteintag)
    {
      if(singlequoteintag)
      {
        singlequoteintag = FALSE;
      }
      else
      {
        singlequoteintag = TRUE;
      }
    }
    else if(*p == '<')
    {
      /* NOTE(review): this continue skips the append at the bottom of */
      /* the loop, so a '<' inside a quoted value is not stored */
      if(singlequoteintag || doublequoteintag)
      {
        continue;
      }
      /* a new '<' before the previous tag was closed: emit what was */
      /* collected so far without running the tag callbacks */
      if(tagstart)
      {
        hpinfo->stack[hpinfo->stack_offset] = '\0';
        html_parser_flush_stack_to_output(hpinfo);
      }
      tagstart = FALSE;
      if(!strncasecmp(p, "<STYLE", 6))
      {
        stylestart = TRUE;
        hpinfo->stack_offset = 0;
      }
      else if(!strncmp(p, "<!--", 4))
      {
        commentstart = TRUE;
      }
      else
      {
        hpinfo->stack_offset = 0;
        tagstart = TRUE;
        singlequoteintag = FALSE;
        doublequoteintag = FALSE;
      }
    }
    else if(*p == '>' && tagstart)
    {
      /* NOTE(review): as above, a '>' inside a quoted value is skipped */
      /* entirely rather than stored */
      if(singlequoteintag || doublequoteintag)
      {
        continue;
      }
      hpinfo->stack[hpinfo->stack_offset] = *p;
      hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
      html_parser_call_funcs(hpinfo, hpinfo->tag_funcs);
      html_parser_flush_stack_to_output(hpinfo);
      /* script mode is entered only when html_tag_co_elem() reports no */
      /* SRC element for the collected script tag */
      if(hpinfo->current_tag && hpinfo->current_tag->type == HTML_TAG_SCRIPT && !html_tag_co_elem(hpinfo->stack, "SRC"))
      {
        scriptstart = TRUE;
      }
      tagstart = FALSE;
      singlequoteintag = FALSE;
      doublequoteintag = FALSE;
      continue;
    }

    if(tagstart || stylestart || scriptstart)
    {
      hpinfo->stack[hpinfo->stack_offset] = *p;
      hpinfo->stack_offset++;
      html_parser_SEXPAND(hpinfo, 1);
    }
    else
    {
      if(hpinfo->rewrite)
      {
        hpinfo->out_content[hpinfo->out_offset] = *p;
        hpinfo->out_offset++;
      }
    }
  }

  /* a pure style document does not need to end with </STYLE>, */
  /* so the CSS must also be parsed at the end of the document */
  /* NOTE(review): p points one past the last input byte here, so *p */
  /* presumably relies on the input being NUL terminated - confirm */
  if(stylestart && hpinfo->purestyle)
  {
    stylestart = FALSE;
    hpinfo->stack[hpinfo->stack_offset] = *p;
    hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
    html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
    html_parser_flush_stack_to_output(hpinfo);
  }

  /* a pure script document does not need to end with </SCRIPT>, so */
  /* script patterns must also be parsed at the end of the document */
  if(scriptstart && hpinfo->purescript)
  {
    scriptstart = FALSE;
    hpinfo->stack[hpinfo->stack_offset] = *p;
    hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
    html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
    html_parser_flush_stack_to_output(hpinfo);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?