htmlparser.c

来自「网络爬虫程序」· C语言 代码 · 共 1,536 行 · 第 1/3 页

C
1,536
字号
  }  if(tagstart || stylestart)  {    hpinfo->stack[hpinfo->stack_offset] = '\0';    html_parser_flush_stack_to_output(hpinfo);  }  if(hpinfo->rewrite)    hpinfo->out_content[hpinfo->out_offset] = '\0';}/********************************************//* functions for processing whole HTML tags *//********************************************/void html_parser_parse_tag(html_parser_t * hpinfo, char *stack, void *data){  int j;  dllist *ptr;  if(!html_parser_check_tag(hpinfo, hpinfo->stack + 1))    return;  if(hpinfo->current_tag->type == HTML_TAG_META)    return;  for(j = 0; hpinfo->current_tag->attribs[j].attrib; j++)  {    hpinfo->current_attrib = &hpinfo->current_tag->attribs[j];    if(hpinfo->current_attrib->stat & LINK_DISABLED)      continue;    hpinfo->tag_attrib = html_get_attrib_from_tag(hpinfo->stack,      hpinfo->current_attrib->attrib);    /*** -dont_touch_url_pattern support ***/    if(hpinfo->tag_attrib && cfg.dont_touch_url_pattern)    {      if(is_in_pattern_list(hpinfo->tag_attrib, cfg.dont_touch_url_pattern))      {        _free(hpinfo->tag_attrib);      }    }#ifdef HAVE_REGEX    /*** -dont_touch_url_rpattern support ***/    for(ptr = cfg.dont_touch_url_rpattern;      ptr && hpinfo->tag_attrib; ptr = ptr->next)    {      if(re_pmatch((re_entry *) ptr->data, hpinfo->tag_attrib))        _free(hpinfo->tag_attrib);    }    /*** -dont_touch_tag_rpattern support ***/    for(ptr = cfg.dont_touch_tag_rpattern;      ptr && hpinfo->tag_attrib; ptr = ptr->next)    {      if(re_pmatch((re_entry *)ptr->data, hpinfo->stack))        _free(hpinfo->tag_attrib);    }#endif    if(hpinfo->tag_attrib)    {      /* to support javascript:... URLs  */      /* inside any attribute            */      if(!strncasecmp(hpinfo->tag_attrib, "javascript:", 11))      {        char *saved_attrib = hpinfo->tag_attrib;        hpinfo->tag_attrib = tl_strdup(saved_attrib + 11);        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);        if(hpinfo->rewrite)        {          int len;          len = strlen(hpinfo->tag_attrib);          saved_attrib = _realloc(saved_attrib, 12 + len);          memcpy(saved_attrib + 11, hpinfo->tag_attrib, len + 1);          _free(hpinfo->tag_attrib);          hpinfo->tag_attrib = saved_attrib;        }        else          _free(saved_attrib);      }      else if(hpinfo->current_attrib->stat & LINK_STYLE)        html_parser_call_funcs(hpinfo, hpinfo->style_funcs);      else if(hpinfo->current_attrib->stat & LINK_JS)        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);      else        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);    }    if(hpinfo->rewrite && hpinfo->tag_attrib)    {      int l = strlen(hpinfo->tag_attrib);      html_parser_SEND(hpinfo);      html_parser_SEXPAND(hpinfo, l);      html_replace_url_in_stack(hpinfo->stack,        hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE);    }    _free(hpinfo->tag_attrib);  }}void html_parser_parse_tag_slash_a(html_parser_t * hpinfo, char *stack,  html_extract_info_t * einfo){  if(einfo->prev_a && !strcasecmp(hpinfo->stack, "</A>"))  {    einfo->prev_a = NULL;  }}void html_parser_parse_tag_meta_refresh(html_parser_t * hpinfo, char *stack,  void *data){  char *saved_meta = (char *) 0;  char *meta_type;  if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META)    return;  hpinfo->current_attrib = &hpinfo->current_tag->attribs[0];  meta_type = html_get_attrib_from_tag(hpinfo->stack, "HTTP-EQUIV");  if(!meta_type || strcasecmp(meta_type, "Refresh"))  {    _free(meta_type);    return;  }  _free(meta_type);  saved_meta = html_get_attrib_from_tag(hpinfo->stack, "CONTENT");  if(!saved_meta)    return;  hpinfo->tag_attrib = html_get_attrib_from_tag(saved_meta, "URL");  if(hpinfo->tag_attrib)  {    html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);    if(hpinfo->rewrite)    {      /* little hack to prevent writing    */      /* outside of allocated memory chunk */      saved_meta = _realloc(saved_meta,        strlen(saved_meta) + strlen(hpinfo->tag_attrib) + 4);      html_replace_url_in_stack(saved_meta, "URL", hpinfo->tag_attrib, TRUE);      _free(hpinfo->tag_attrib);      hpinfo->tag_attrib = saved_meta;      if(hpinfo->tag_attrib)      {        int l = strlen(hpinfo->tag_attrib);        html_parser_SEND(hpinfo);        html_parser_SEXPAND(hpinfo, l);        html_replace_url_in_stack(hpinfo->stack,          hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE);        hpinfo->tag_attrib = 0;      }    }    else    {      _free(hpinfo->tag_attrib);    }  }  _free(saved_meta);}void html_parser_parse_tag_meta_robots(html_parser_t * hpinfo, char *stack,  html_robots_info_t * oinfo){  char *meta_type;  char *content;  char **flags;  int i;  if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META)    return;  meta_type = html_get_attrib_from_tag(hpinfo->stack, "NAME");  if(!meta_type || strcasecmp(meta_type, "Robots"))  {    _free(meta_type);    return;  }  _free(meta_type);  content = html_get_attrib_from_tag(hpinfo->stack, "CONTENT");  if(!content)    return;  flags = tl_str_split(content, ",");  _free(content);  for(i = 0; flags && flags[i]; i++)  {    if(!strcasecmp(flags[i], "all"))    {      oinfo->index = TRUE;      oinfo->follow = TRUE;      oinfo->images = TRUE;    }    else if(!strcasecmp(flags[i], "none"))    {      oinfo->index = FALSE;      oinfo->follow = FALSE;      oinfo->images = FALSE;    }    else if(!strcasecmp(flags[i], "index"))      oinfo->index = TRUE;    else if(!strcasecmp(flags[i], "follow"))      oinfo->follow = TRUE;    else if(!strcasecmp(flags[i], "noimageindex"))      oinfo->images = FALSE;    else if(!strcasecmp(flags[i], "noindex"))      oinfo->index = FALSE;    else if(!strcasecmp(flags[i], "nofollow"))      oinfo->follow = FALSE;    _free(flags[i]);  }  _free(flags);}void html_parser_parse_tag_jstransform(html_parser_t * hpinfo, char *stack,  void *data){#ifdef HAVE_REGEX  dllist *ptr;  html_tag_t t = { HTML_TAG_HACK, "HACK",    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},      {HTML_ATTRIB_NULL, NULL, 0}}  };  for(ptr = priv_cfg.js_transform; ptr; ptr = ptr->next)  {    js_transform_t *jt = (js_transform_t *) ptr->data;    if(js_transform_match_tag(jt, hpinfo->stack))    {      int nsub, *subs;      char *attr = html_get_attrib_from_tag(hpinfo->stack,        jt->attrib);      if(!attr)        continue;      if(!re_pmatch_subs(jt->re, attr, &nsub, &subs))      {        _free(attr);        continue;      }      hpinfo->tag_attrib = js_transform_apply(jt, attr, nsub, subs);      /*****************************************/      /* quite dirty hack to make happy attrib */      /* parsing funcs which require valid     */      /* current_tag & current_attrib          */      /*****************************************/      hpinfo->current_tag = &t;      hpinfo->current_attrib = &(t.attribs[0]);      if(hpinfo->tag_attrib)        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);      if(hpinfo->rewrite && jt->type == 1 && nsub)      {        int l = strlen(hpinfo->tag_attrib);        attr = _realloc(attr, strlen(attr) + l + 1);        memmove(attr + l + subs[2], attr + subs[3],          strlen(attr + subs[3]) + 1);        memcpy(attr + subs[2], hpinfo->tag_attrib, l);        l = strlen(attr);        html_parser_SEND(hpinfo);        html_parser_SEXPAND(hpinfo, l);        html_replace_url_in_stack(hpinfo->stack, jt->attrib, attr, FALSE);      }      _free(subs);      _free(attr);      /* :-) unhack */      hpinfo->current_tag = NULL;      hpinfo->current_attrib = NULL;      _free(hpinfo->tag_attrib);    }  }#endif}/********************************************************//* functions for processing URL attributes of HTML tags *//********************************************************/void html_parser_url_to_absolute_url(html_parser_t * hpinfo, char *stack,  void *data){  char *ustr;  /*     printf("http_parser sees %s %s=\"%s\"<\n",     hpinfo->current_tag->tag,     hpinfo->current_attrib->attrib,     hpinfo->tag_attrib);   */  ustr = url_to_absolute_url(hpinfo->base, hpinfo->baset,    hpinfo->doc_url, hpinfo->tag_attrib);  if(ustr && *ustr)  {    DEBUG_HTML("Rewriting URL (to abs) - %s -> %s\n", hpinfo->tag_attrib,      ustr);    _free(hpinfo->tag_attrib);    hpinfo->tag_attrib = ustr;  }}void html_parser_process_base(html_parser_t * hpinfo, char *stack, void *data){  if(hpinfo->current_tag->type == HTML_TAG_BASE &&    hpinfo->current_attrib->type == HTML_ATTRIB_HREF)  {    int lp, ls;    html_parser_process_new_base_url(hpinfo, hpinfo->tag_attrib);    /* comment BASE tag because pavuk        */    /* overwrites URLs according to this tag */    lp = strlen(COMMENT_PREFIX);    ls = strlen(COMMENT_SUFFIX);    html_parser_SEND(hpinfo);    html_parser_SEXPAND(hpinfo, (lp + ls));    memmove(hpinfo->stack + lp, hpinfo->stack, strlen(hpinfo->stack) + 1);    memcpy(hpinfo->stack, COMMENT_PREFIX, lp);    strcat(hpinfo->stack, COMMENT_SUFFIX);  }}void html_parser_process_form(html_parser_t * hpinfo, char *stack,  dllist ** formlist){  if(hpinfo->current_attrib->stat & LINK_FORM &&    hpinfo->current_attrib->type == HTML_ATTRIB_ACTION)  {    hpinfo->doc_url->status |= URL_HAVE_FORMS;    if(formlist && hpinfo->tag_attrib)    {      *formlist = dllist_append(*formlist,      (dllist_t) tl_strdup(hpinfo->tag_attrib));    }  }}void html_parser_get_url(html_parser_t * hpinfo, char *stack,  html_extract_info_t * einfo){  if(*hpinfo->tag_attrib /* Never follow "" */  &&    (hpinfo->current_attrib->stat & LINK_DOWNLD) &&    (!einfo->only_inline ||      (einfo->only_inline &&        hpinfo->current_attrib->stat & LINK_INLINE)) &&    (!(hpinfo->current_attrib->stat & LINK_SCRIPT) ||      (einfo->enable_js && hpinfo->current_attrib->stat & LINK_SCRIPT)))  {    url *purl = (url *) 0;    cond_info_t condp;    condp.level = 0;    condp.urlnr = 0;    condp.size = 0;    condp.time = 0L;    condp.mimet = NULL;    condp.full_tag = stack;    condp.params = NULL;    condp.html_doc = hpinfo->in_content;    condp.html_doc_offset = hpinfo->in_offset;    condp.tag = hpinfo->current_tag ? hpinfo->current_tag->tag : NULL;    condp.attrib = hpinfo->current_attrib ?      hpinfo->current_attrib->attrib : NULL;    purl = url_parse(hpinfo->tag_attrib);    assert(purl->type != URLT_FROMPARENT);    url_path_abs(purl);    if(hpinfo->current_attrib->stat & LINK_INLINE)      purl->status |= URL_INLINE_OBJ;    if(hpinfo->current_attrib->stat & LINK_SCRIPT)      purl->status |= URL_ISSCRIPT;    purl->level = hpinfo->doc_url->level + 1;    purl->parent_url = dllist_append(purl->parent_url,    (dllist_t) hpinfo->doc_url);    /*****************************************************/    /* if we are in SYNC/MIRROR mode try to get original */    /* URL rather than processing it as file             */    /* (mandatory thing to get working SYNC/MIRROR mode) */    /*****************************************************/    if((cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR) &&      cfg.request && (purl->type == URLT_FILE))    {      url *pomurl = filename_to_url(purl->p.file.filename);      if(pomurl)      {        free_deep_url(purl);        _free(purl);        purl = pomurl;      }    }    /**********************************/    /* remove last anchor URL because */    /* it is server side image map    */    /**********************************/    if(einfo->prev_a &&      hpinfo->current_tag->type == HTML_TAG_IMG &&      hpinfo->current_attrib->type == HTML_ATTRIB_SRC &&      html_tag_co_elem(hpinfo->stack, "ISMAP"))    {      DEBUG_HTML("Removing server image map\n");      free_deep_url((url *) einfo->prev_a->data);      free((url *) einfo->prev_a->data);      einfo->urls = dllist_remove_entry(einfo->urls, einfo->prev_a);      einfo->prev_a = NULL;    }    if(hpinfo->current_tag->type == HTML_TAG_A &&      hpinfo->current_attrib->type == HTML_ATTRIB_HREF)    {      einfo->prev_a = NULL;    }    /* Do not accept links, which only link inside the already loaded document       like <a href="#top">. This is a local relative reference, so remove it.    */    if((hpinfo->current_attrib->type == HTML_ATTRIB_USEMAP ||    hpinfo->current_attrib->type == HTML_ATTRIB_HREF) &&    hpinfo->tag_attrib[0] == '#')    {      LOCK_REJCNT;      cfg.reject_cnt++;      UNLOCK_REJCNT;      DEBUG_HTML("Rejecting local anchor URL - %s\n", hpinfo->tag_attrib);    }    else if(einfo->no_limits || url_append_condition(purl, &condp))    {      DEBUG_HTML("Accepting URL - %s\n", hpinfo->tag_attrib);      /***************************************/      /* process special add-on tag PAVUKEXT */      /* where are stored some additional    */      /* informations about FTP URLs         */      /***************************************/      if(purl->type == URLT_FTP || purl->type == URLT_FTPS)      {        char *pext;        pext = html_get_attrib_from_tag(hpinfo->stack, "PAVUKEXT");        if(pext)        {          ftp_url_extension *uext;          uext = ftp_parse_ftpinf_ext(pext);          purl->extension = uext;          if(uext->type == FTP_TYPE_D)            purl->p.ftp.dir = TRUE;        }        _free(pext);      }      einfo->urls = dllist_append(einfo->urls, (dllist_t) purl);      if(hpinfo->current_tag->type == HTML_TAG_A &&        hpinfo->current_attrib->type == HTML_ATTRIB_HREF)      {        einfo->prev_a = dllist_last(einfo->urls);      }    }    else    {      LOCK_REJCNT;      cfg.reject_cnt++;      UNLOCK_REJCNT;      DEBUG_HTML("Rejecting URL - %s\n", hpinfo->tag_attrib);      free_deep_url(purl);      _free(purl);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?