htmlparser.c

来自「网络爬虫程序」· C语言 代码 · 共 1,536 行 · 第 1/3 页

C
1,536
字号
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <string.h>

#include "htmlparser.h"
#include "tools.h"
#include "css.h"
#include "re.h"
#include "ftp.h"
#include "jstrans.h"

/* Tag lookup table shared by every parser instance; built lazily by */
/* html_parser_init() and released by html_parser_do_cleanup().      */
static dlhash *html_parser_tag_hash = NULL;

#define COMMENT_PREFIX "<!-- "
#define COMMENT_SUFFIX " -->"
#define ADVERT_PREFIX "adv<!-- Removed by pavuk "
#define ADVERT_SUFFIX " -->"

/*
 * Hash-table key comparator: returns non-zero when both keys (treated as
 * C strings) are equal according to strcasecmp(), zero otherwise.
 */
static int html_parser_tag_comp_func(dllist_t key1, dllist_t key2)
{
  return (!strcasecmp((void *) key1, (void *) key2));
}

/*
 * Hash function for tag names: adds up every byte of the key after passing
 * it through tl_ascii_tolower(), reducing modulo `size` at each step.
 * Keys that differ only in the case handled by tl_ascii_tolower() therefore
 * land in the same bucket, matching the comparator above.
 */
static unsigned int html_parser_tag_hash_func(unsigned int size, dllist_t key)
{
  unsigned char *p = (unsigned char *) key;
  unsigned int retv = 0;

  while(*p)
  {
    retv = (retv + tl_ascii_tolower(*p)) % size;
    p++;
  }

  return retv;
}

/*
 * Key extractor for the tag hash: the key of an html_tag_t entry is its
 * `tag` member.
 */
static dllist_t html_parser_tag_key_func(dllist_t data)
{
  return (dllist_t) ((html_tag_t *) data)->tag;
}

/*
 * Allocate and initialize a parser object.
 *
 *   tags, ntags        - tag description table; only consulted when the
 *                        shared tag hash does not exist yet (first call, or
 *                        first call after html_parser_do_cleanup())
 *   with_tag_rewriting - stored in rv->rewrite; when non-zero the parser
 *                        produces an output document while parsing
 *   purestyle          - stored in rv->purestyle (input is treated as style
 *                        content from the first byte by html_parser_parse())
 *   purescript         - stored in rv->purescript (same, for script content)
 *
 * Returns the new parser; release it with html_parser_kill().
 * The hash stores pointers into `tags` (no copy is made here), so the table
 * must stay valid for as long as the shared hash is in use.
 */
html_parser_t *html_parser_init(html_tag_t * tags, int ntags,
  int with_tag_rewriting, int purestyle, int purescript)
{
  html_parser_t *rv;
  int i;

  rv = _malloc(sizeof(html_parser_t));

  rv->rewrite = with_tag_rewriting;
  rv->purestyle = purestyle;
  rv->purescript = purescript;
  rv->in_content = NULL;
  rv->out_content = NULL;
  rv->in_size = 0;
  rv->aout_size = 0;
  rv->out_offset = 0;
  rv->in_offset = 0;
  rv->stack = NULL;
  rv->stack_size = 0;
  rv->stack_offset = 0;
  rv->base = NULL;
  rv->baset = NULL;
  rv->tag_attrib = NULL;
  rv->tag_funcs = NULL;
  rv->attrib_funcs = NULL;
  rv->style_funcs = NULL;
  rv->script_funcs = NULL;
  rv->current_tag = NULL;
  rv->current_attrib = NULL;

  /* the tag hash is built once and then shared by all parsers */
  LOCK_TAG_HASH;
  if(html_parser_tag_hash)
    rv->tag_hash = html_parser_tag_hash;
  else
  {
    rv->tag_hash = dlhash_new(20,
      html_parser_tag_key_func,
      html_parser_tag_hash_func, html_parser_tag_comp_func);

    for(i = 0; i < ntags; i++)
      dlhash_insert(rv->tag_hash, (dllist_t) &tags[i]);

    html_parser_tag_hash = rv->tag_hash;
  }
  UNLOCK_TAG_HASH;

  return rv;
}

/*
 * Release the shared tag hash.
 *
 * The global pointer is reset after freeing so that a later
 * html_parser_init() rebuilds the table instead of handing out the
 * already-freed one, and so that calling this function twice is harmless.
 */
void html_parser_do_cleanup(void)
{
  if(html_parser_tag_hash)
  {
    dlhash_free(html_parser_tag_hash);
    html_parser_tag_hash = NULL;
  }
}

/*
 * Destroy a parser created by html_parser_init().
 *
 * Frees all registered callback records, the working stack, any output
 * buffer still owned by the parser, the base strings and the parser itself.
 * The input buffer (in_content) and the shared tag hash are not freed here.
 */
void html_parser_kill(html_parser_t * hpinfo)
{
/* free every callback record of a chain and unlink its list entries */
#define KILL_FUNC_CHAIN(chain) \
  while (chain) \
  { \
    if(chain->data) free((void *)chain->data);\
    chain = dllist_remove_entry(chain, chain);\
  }

  KILL_FUNC_CHAIN(hpinfo->tag_funcs);
  KILL_FUNC_CHAIN(hpinfo->attrib_funcs);
  KILL_FUNC_CHAIN(hpinfo->style_funcs);
  KILL_FUNC_CHAIN(hpinfo->script_funcs);

  _free(hpinfo->stack);
  _free(hpinfo->out_content);
  _free(hpinfo->base);
  _free(hpinfo->baset);
  _free(hpinfo);
}

/*
 * Register `func` (with its private `data`) at the end of the list of
 * callbacks that html_parser_parse() runs for every completed tag.
 */
void html_parser_add_tag_func(html_parser_t * hpinfo, html_parser_func_t func,
  void *data)
{
  html_parser_func_info_t *nfunc;

  nfunc = _malloc(sizeof(html_parser_func_info_t));
  nfunc->func = func;
  nfunc->data = data;

  hpinfo->tag_funcs = dllist_append(hpinfo->tag_funcs, (dllist_t) nfunc);
}

/*
 * Register `func` (with its private `data`) at the end of the attribute
 * callback list of the parser.
 */
void html_parser_add_attrib_func(html_parser_t * hpinfo,
  html_parser_func_t func, void *data)
{
  html_parser_func_info_t *nfunc;

  nfunc = _malloc(sizeof(html_parser_func_info_t));
  nfunc->func = func;
  nfunc->data = data;

  hpinfo->attrib_funcs = dllist_append(hpinfo->attrib_funcs, (dllist_t) nfunc);
}

/*
 * Register `func` (with its private `data`) at the end of the list of
 * callbacks that html_parser_parse() runs for collected style content.
 */
void html_parser_add_style_func(html_parser_t * hpinfo,
  html_parser_func_t func, void *data)
{
  html_parser_func_info_t *nfunc;

  nfunc = _malloc(sizeof(html_parser_func_info_t));
  nfunc->func = func;
  nfunc->data = data;

  hpinfo->style_funcs = dllist_append(hpinfo->style_funcs, (dllist_t) nfunc);
}

/*
 * Register `func` (with its private `data`) at the end of the list of
 * callbacks that html_parser_parse() runs for collected script content.
 */
void html_parser_add_script_func(html_parser_t * hpinfo,
  html_parser_func_t func, void *data)
{
  html_parser_func_info_t *nfunc;

  nfunc = _malloc(sizeof(html_parser_func_info_t));
  nfunc->func = func;
  nfunc->data = data;

  hpinfo->script_funcs = dllist_append(hpinfo->script_funcs, (dllist_t) nfunc);
}

/*
 * Attach the document to be parsed.  Only the pointers and the size are
 * stored; neither `doc_url` nor `content` is copied.
 */
void html_parser_set_document(html_parser_t * hpinfo, url * doc_url,
  char *content, ssize_t size)
{
  hpinfo->doc_url = doc_url;
  hpinfo->in_content = content;
  hpinfo->in_size = size;
}

/*
 * Hand the produced output buffer and its used length over to the caller.
 * The parser forgets the buffer afterwards, so html_parser_kill() will no
 * longer free it.
 */
void html_parser_take_document(html_parser_t * hpinfo, char **out_content,
  ssize_t * out_size)
{
  *out_content = hpinfo->out_content;
  *out_size = hpinfo->out_offset;

  hpinfo->out_content = NULL;
  hpinfo->out_offset = 0;
  hpinfo->aout_size = 0;
}

/*
 * Replace the base strings of the parser.  A NULL argument leaves the
 * corresponding value untouched.  The pointers are stored as given (not
 * duplicated) and the previous values are freed, so the parser takes over
 * the passed strings; they are released by html_parser_kill().
 */
void html_parser_set_base(html_parser_t * hpinfo, char *base, char *baset)
{
  if(base)
  {
    _free(hpinfo->base);
    hpinfo->base = base;
  }

  if(baset)
  {
    _free(hpinfo->baset);
    hpinfo->baset = baset;
  }
}

/*
 * Compute and install a new base URL from the value of a BASE attribute.
 *
 * For unsupported URL types the attribute value is used verbatim.
 * Otherwise it is made absolute, any "?query" part is cut off, and when
 * tl_is_dirname() does not accept the result everything after the last '/'
 * is cut off as well.  If no absolute URL can be built the current base is
 * kept.
 */
static void html_parser_process_new_base_url(html_parser_t * hpinfo,
  char *baseattr)
{
  url *purl;
  char *newbase;

  purl = url_parse(baseattr);
  assert(purl->type != URLT_FROMPARENT);

  if(!prottable[purl->type].supported)
  {
    xprintf(1, gettext("Unsupported BASE URL -  %s (probably bad handled)\n"),
      baseattr);
    newbase = tl_strdup(baseattr);
  }
  else
  {
    char *idx;

    newbase =
      url_to_absolute_url(hpinfo->base, hpinfo->baset, hpinfo->doc_url,
      baseattr);

    if(!newbase)
    {
      /* collect base="" and ignore it */
      /* the parsed URL must still be released on this early exit */
      free_deep_url(purl);
      _free(purl);
      return;
    }

    if((idx = strrchr(newbase, '?')))
      *idx = '\0';

    if(!tl_is_dirname(newbase))
    {
      idx = strrchr(newbase, '/');
      if(idx)
        *(idx + 1) = '\0';
    }
  }

  DEBUG_HTML("NEW BASE URL - %s\n", newbase);

  free_deep_url(purl);
  _free(purl);

  _free(hpinfo->base);
  hpinfo->base = newbase;
}

/*
 * Run every callback of the list `funcs` in list order, passing the parser,
 * the current contents of the working stack and the callback's own data.
 */
static void html_parser_call_funcs(html_parser_t * hpinfo, dllist * funcs)
{
  dllist *ptr;

  for(ptr = funcs; ptr; ptr = ptr->next)
  {
    html_parser_func_info_t *fi = (html_parser_func_info_t *) ptr->data;

    fi->func(hpinfo, hpinfo->stack, fi->data);
  }
}

/*
 * Append the NUL-terminated contents of the working stack to the output
 * buffer and reset the stack.  Does nothing (the stack offset included)
 * when the parser is not rewriting.
 */
static void html_parser_flush_stack_to_output(html_parser_t * hpinfo)
{
  int l;

  if(!hpinfo->rewrite)
    return;

  l = strlen(hpinfo->stack);

  /* html_parser_MEXPAND is invoked before the copy with the number of  */
  /* bytes about to be appended; presumably it grows out_content --     */
  /* confirm against its definition in htmlparser.h                     */
  html_parser_MEXPAND(hpinfo, l)
    memcpy(hpinfo->out_content + hpinfo->out_offset, hpinfo->stack, l);
  hpinfo->out_offset += l;
  hpinfo->stack_offset = 0;
}

/*
 * Look up the tag whose name starts at `tagstart`.
 *
 * The name is the leading run of characters accepted by tl_ascii_isalpha()
 * and is only considered when it is followed by a blank, tab, CR, LF, '>'
 * or the end of the string (strchr() also matches the terminating NUL).
 * Sets hpinfo->current_tag to the table entry, or NULL when the tag is
 * unknown, and returns non-zero when an entry was found.
 */
static int html_parser_check_tag(html_parser_t * hpinfo, char *tagstart)
{
  int tl;

  hpinfo->current_tag = NULL;

  /* measure the length of the tag name */
  for(tl = 0; tl_ascii_isalpha(tagstart[tl]); tl++);

  if(strchr(" \t\r\n>", tagstart[tl]))
  {
    char *tagname;

    tagname = tl_strndup(tagstart, tl);
    hpinfo->current_tag = (html_tag_t *) dlhash_find_by_key(hpinfo->tag_hash,
    (dllist_t) tagname);
    _free(tagname);
  }

  return (hpinfo->current_tag != NULL);
}

/*
 * Prepare the parser buffers for a parsing run: rewind the input offset,
 * allocate the output buffer (only when rewriting) with in_size +
 * html_parser_FENDER bytes, and allocate the working stack with
 * 2 * html_parser_FENDER bytes.
 */
static void html_parser_parse_init(html_parser_t * hpinfo)
{
  hpinfo->in_offset = 0;

  if(hpinfo->rewrite)
  {
    hpinfo->aout_size = hpinfo->in_size + html_parser_FENDER;
    hpinfo->out_content = _malloc(hpinfo->aout_size);
    hpinfo->out_offset = 0;
  }

  hpinfo->stack_size = 2 * html_parser_FENDER;
  hpinfo->stack = _malloc(hpinfo->stack_size);
  hpinfo->stack_offset = 0;
}

/*
 * Parse the document attached with html_parser_set_document().
 *
 * The input is scanned byte by byte.  Tags, style content and script
 * content are collected on the working stack; when one of them is complete
 * the matching callback list (tag_funcs / style_funcs / script_funcs) is
 * run on the stack and the stack is flushed to the output.  All other
 * bytes, comments included, are copied straight to the output.  Output is
 * only produced when hpinfo->rewrite is set.
 */
void html_parser_parse(html_parser_t * hpinfo)
{
  int tagstart = FALSE;
  int scriptstart = FALSE;
  int commentstart = FALSE;
  int stylestart = FALSE;
  int singlequoteintag = FALSE;
  int doublequoteintag = FALSE;
  char *p;

  html_parser_parse_init(hpinfo);

  /* pure CSS / pure script documents start directly in content mode */
  if(hpinfo->purestyle)
    stylestart = TRUE;

  if(hpinfo->purescript)
    scriptstart = TRUE;

  for(p = hpinfo->in_content; (p - hpinfo->in_content) < hpinfo->in_size;
    p++, hpinfo->in_offset++)
  {
    if(stylestart)
    {
      /* NOTE(review): the end marker is tested at p here, while the   */
      /* script branch below tests at p + 1; as written the '<' of     */
      /* "</STYLE" becomes the last byte of the style text handed to   */
      /* the callbacks -- confirm this asymmetry is intended           */
      if(!strncasecmp(p, "</STYLE", 7))
      {
        stylestart = FALSE;
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
        html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
        html_parser_flush_stack_to_output(hpinfo);
      }
      else
      {
        html_parser_SEXPAND(hpinfo, 1)
          hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack_offset++;
      }
      continue;
    }

    if(scriptstart)
    {
      /* the script ends with the byte just before "</SCRIPT" */
      if(!strncasecmp(p + 1, "</SCRIPT", 8))
      {
        scriptstart = FALSE;
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
        html_parser_flush_stack_to_output(hpinfo);
      }
      else
      {
        html_parser_SEXPAND(hpinfo, 1)
          hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack_offset++;
      }
      continue;
    }

    if(commentstart)
    {
      /* comments are copied to the output unchanged */
      if(!strncmp(p, "-->", 3))
        commentstart = FALSE;
      if(hpinfo->rewrite)
      {
        hpinfo->out_content[hpinfo->out_offset] = *p;
        hpinfo->out_offset++;
      }
      continue;
    }

    /* track quoting inside a tag so that '<' and '>' within attribute */
    /* values do not start or end a tag                                */
    if((*p == '\"') && tagstart && !singlequoteintag)
    {
      if(doublequoteintag)
      {
        doublequoteintag = FALSE;
      }
      else
      {
        doublequoteintag = TRUE;
      }
    }
    else if((*p == '\'') && tagstart && !doublequoteintag)
    {
      if(singlequoteintag)
      {
        singlequoteintag = FALSE;
      }
      else
      {
        singlequoteintag = TRUE;
      }
    }
    else if(*p == '<')
    {
      if(singlequoteintag || doublequoteintag)
      {
        /* NOTE(review): this continue bypasses the append at the    */
        /* bottom of the loop, so a '<' inside a quoted attribute    */
        /* value is not copied to the stack -- confirm intended      */
        continue;
      }
      if(tagstart)
      {
        /* previous tag was never closed; emit what was collected */
        hpinfo->stack[hpinfo->stack_offset] = '\0';
        html_parser_flush_stack_to_output(hpinfo);
      }
      tagstart = FALSE;
      if(!strncasecmp(p, "<STYLE", 6))
      {
        stylestart = TRUE;
        hpinfo->stack_offset = 0;
      }
      else if(!strncmp(p, "<!--", 4))
      {
        commentstart = TRUE;
      }
      else
      {
        hpinfo->stack_offset = 0;
        tagstart = TRUE;
        singlequoteintag = FALSE;
        doublequoteintag = FALSE;
      }
    }
    else if(*p == '>' && tagstart)
    {
      if(singlequoteintag || doublequoteintag)
      {
        /* NOTE(review): as with '<' above, a quoted '>' is skipped */
        /* without being copied to the stack -- confirm intended    */
        continue;
      }
      hpinfo->stack[hpinfo->stack_offset] = *p;
      hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
      html_parser_call_funcs(hpinfo, hpinfo->tag_funcs);
      html_parser_flush_stack_to_output(hpinfo);
      /* a SCRIPT tag without SRC attribute is followed by inline script */
      if(hpinfo->current_tag &&
        hpinfo->current_tag->type == HTML_TAG_SCRIPT &&
        !html_tag_co_elem(hpinfo->stack, "SRC"))
      {
        scriptstart = TRUE;
      }
      tagstart = FALSE;
      singlequoteintag = FALSE;
      doublequoteintag = FALSE;
      continue;
    }

    if(tagstart || stylestart || scriptstart)
    {
      hpinfo->stack[hpinfo->stack_offset] = *p;
      hpinfo->stack_offset++;
      html_parser_SEXPAND(hpinfo, 1);
    }
    else
    {
      if(hpinfo->rewrite)
      {
        hpinfo->out_content[hpinfo->out_offset] = *p;
        hpinfo->out_offset++;
      }
    }
  }

  /* pure style don't need to end with </STYLE>   */
  /* so we must parse CSS also at end of document */
  /* NOTE(review): after the loop p == in_content + in_size, so *p   */
  /* below reads the byte just past the counted input; this assumes  */
  /* the buffer carries a readable terminator there -- confirm       */
  if(stylestart && hpinfo->purestyle)
  {
    stylestart = FALSE;
    hpinfo->stack[hpinfo->stack_offset] = *p;
    hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
    html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
    html_parser_flush_stack_to_output(hpinfo);
  }

  /* pure script don't need to end with </SCRIPT> so we */
  /* must parse script patterns also at end of document */
  if(scriptstart && hpinfo->purescript)
  {
    scriptstart = FALSE;
    hpinfo->stack[hpinfo->stack_offset] = *p;
    hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
    html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
    html_parser_flush_stack_to_output(hpinfo);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?