html-parse.c

来自「Wget很好的处理了http和ftp的下载,很值得学习的经典代码」· C语言代码 · 共 1,074 行 · 第 1/3 页
1,074 行
  while (!ISSPACE (*p)) {                       \    ADVANCE (p);                                \  }                                             \} while (0)#ifdef STANDALONEstatic int tag_backout_count;#endif/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.   MAPFUN will be called with two arguments: pointer to an initialized   struct taginfo, and MAPARG.   ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of   which are the tags and attribute names that this function should   use.  If ALLOWED_TAGS is NULL, all tags are processed; if   ALLOWED_ATTRIBUTES is NULL, all attributes are returned.   (Obviously, the caller can filter out unwanted tags and attributes   just as well, but this is just an optimization designed to avoid   unnecessary copying of tags/attributes which the caller doesn't   care about.)  */voidmap_html_tags (const char *text, int size,               void (*mapfun) (struct taginfo *, void *), void *maparg,               int flags,               const struct hash_table *allowed_tags,               const struct hash_table *allowed_attributes){  /* storage for strings passed to MAPFUN callback; if 256 bytes is     too little, POOL_APPEND allocates more with malloc. */  char pool_initial_storage[256];  struct pool pool;  const char *p = text;  const char *end = text + size;  struct attr_pair attr_pair_initial_storage[8];  int attr_pair_size = countof (attr_pair_initial_storage);  bool attr_pair_resized = false;  struct attr_pair *pairs = attr_pair_initial_storage;  if (!size)    return;  POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));  {    int nattrs, end_tag;    const char *tag_name_begin, *tag_name_end;    const char *tag_start_position;    bool uninteresting_tag;  look_for_tag:    POOL_REWIND (&pool);    nattrs = 0;    end_tag = 0;    /* Find beginning of tag.  We use memchr() instead of the usual       looping with ADVANCE() for speed. */    p = memchr (p, '<', end - p);    if (!p)      goto finish;    tag_start_position = p;    ADVANCE (p);    /* Establish the type of the tag (start-tag, end-tag or       declaration).  */    if (*p == '!')      {        if (!(flags & MHT_STRICT_COMMENTS)            && p < end + 3 && p[1] == '-' && p[2] == '-')          {            /* If strict comments are not enforced and if we know               we're looking at a comment, simply look for the               terminating "-->".  Non-strict is the default because               it works in other browsers and most HTML writers can't               be bothered with getting the comments right.  */            const char *comment_end = find_comment_end (p + 3, end);            if (comment_end)              p = comment_end;          }        else          {            /* Either in strict comment mode or looking at a non-empty               declaration.  Real declarations are much less likely to               be misused the way comments are, so advance over them               properly regardless of strictness.  */            p = advance_declaration (p, end);          }        if (p == end)          goto finish;        goto look_for_tag;      }    else if (*p == '/')      {        end_tag = 1;        ADVANCE (p);      }    tag_name_begin = p;    while (NAME_CHAR_P (*p))      ADVANCE (p);    if (p == tag_name_begin)      goto look_for_tag;    tag_name_end = p;    SKIP_WS (p);    if (end_tag && *p != '>')      goto backout_tag;    if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))      /* We can't just say "goto look_for_tag" here because we need         the loop below to properly advance over the tag's attributes.  */      uninteresting_tag = true;    else      {        uninteresting_tag = false;        convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);      }    /* Find the attributes. */    while (1)      {        const char *attr_name_begin, *attr_name_end;        const char *attr_value_begin, *attr_value_end;        const char *attr_raw_value_begin, *attr_raw_value_end;        int operation = AP_DOWNCASE; /* stupid compiler. */        SKIP_WS (p);        if (*p == '/')          {            /* A slash at this point means the tag is about to be               closed.  This is legal in XML and has been popularized               in HTML via XHTML.  */            /* <foo a=b c=d /> */            /*              ^  */            ADVANCE (p);            SKIP_WS (p);            if (*p != '>')              goto backout_tag;          }        /* Check for end of tag definition. */        if (*p == '>')          break;        /* Establish bounds of attribute name. */        attr_name_begin = p;    /* <foo bar ...> */                                /*      ^        */        while (NAME_CHAR_P (*p))          ADVANCE (p);        attr_name_end = p;      /* <foo bar ...> */                                /*         ^     */        if (attr_name_begin == attr_name_end)          goto backout_tag;        /* Establish bounds of attribute value. */        SKIP_WS (p);        if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')          {            /* Minimized attribute syntax allows `=' to be omitted.               For example, <UL COMPACT> is a valid shorthand for <UL               COMPACT="compact">.  Even if such attributes are not               useful to Wget, we need to support them, so that the               tags containing them can be parsed correctly. */            attr_raw_value_begin = attr_value_begin = attr_name_begin;            attr_raw_value_end = attr_value_end = attr_name_end;          }        else if (*p == '=')          {            ADVANCE (p);            SKIP_WS (p);            if (*p == '\"' || *p == '\'')              {                bool newline_seen = false;                char quote_char = *p;                attr_raw_value_begin = p;                ADVANCE (p);                attr_value_begin = p; /* <foo bar="baz"> */                                      /*           ^     */                while (*p != quote_char)                  {                    if (!newline_seen && *p == '\n')                      {                        /* If a newline is seen within the quotes, it                           is most likely that someone forgot to close                           the quote.  In that case, we back out to                           the value beginning, and terminate the tag                           at either `>' or the delimiter, whichever                           comes first.  Such a tag terminated at `>'                           is discarded.  */                        p = attr_value_begin;                        newline_seen = true;                        continue;                      }                    else if (newline_seen && *p == '>')                      break;                    ADVANCE (p);                  }                attr_value_end = p; /* <foo bar="baz"> */                                    /*              ^  */                if (*p == quote_char)                  ADVANCE (p);                else                  goto look_for_tag;                attr_raw_value_end = p; /* <foo bar="baz"> */                                        /*               ^ */                operation = AP_DECODE_ENTITIES;                if (flags & MHT_TRIM_VALUES)                  operation |= AP_TRIM_BLANKS;              }            else              {                attr_value_begin = p; /* <foo bar=baz> */                                      /*          ^    */                /* According to SGML, a name token should consist only                   of alphanumerics, . and -.  However, this is often                   violated by, for instance, `%' in `width=75%'.                   We'll be liberal and allow just about anything as                   an attribute value.  */                while (!ISSPACE (*p) && *p != '>')                  ADVANCE (p);                attr_value_end = p; /* <foo bar=baz qux=quix> */                                    /*             ^          */                if (attr_value_begin == attr_value_end)                  /* <foo bar=> */                  /*          ^ */                  goto backout_tag;                attr_raw_value_begin = attr_value_begin;                attr_raw_value_end = attr_value_end;                operation = AP_DECODE_ENTITIES;              }          }        else          {            /* We skipped the whitespace and found something that is               neither `=' nor the beginning of the next attribute's               name.  Back out.  */            goto backout_tag;   /* <foo bar [... */                                /*          ^    */          }        /* If we're not interested in the tag, don't bother with any           of the attributes.  */        if (uninteresting_tag)          continue;        /* If we aren't interested in the attribute, skip it.  We           cannot do this test any sooner, because our text pointer           needs to correctly advance over the attribute.  */        if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))          continue;        GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,                    struct attr_pair);        pairs[nattrs].name_pool_index = pool.tail;        convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);        pairs[nattrs].value_pool_index = pool.tail;        convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);        pairs[nattrs].value_raw_beginning = attr_raw_value_begin;        pairs[nattrs].value_raw_size = (attr_raw_value_end                                        - attr_raw_value_begin);        ++nattrs;      }    if (uninteresting_tag)      {        ADVANCE (p);        goto look_for_tag;      }    /* By now, we have a valid tag with a name and zero or more       attributes.  Fill in the data and call the mapper function.  */    {      int i;      struct taginfo taginfo;      taginfo.name      = pool.contents;      taginfo.end_tag_p = end_tag;      taginfo.nattrs    = nattrs;      /* We fill in the char pointers only now, when pool can no         longer get realloc'ed.  If we did that above, we could get         hosed by reallocation.  Obviously, after this point, the pool         may no longer be grown.  */      for (i = 0; i < nattrs; i++)        {          pairs[i].name = pool.contents + pairs[i].name_pool_index;          pairs[i].value = pool.contents + pairs[i].value_pool_index;        }      taginfo.attrs = pairs;      taginfo.start_position = tag_start_position;      taginfo.end_position   = p + 1;      mapfun (&taginfo, maparg);      ADVANCE (p);    }    goto look_for_tag;  backout_tag:#ifdef STANDALONE    ++tag_backout_count;#endif    /* The tag wasn't really a tag.  Treat its contents as ordinary       data characters. */    p = tag_start_position + 1;    goto look_for_tag;  } finish:  POOL_FREE (&pool);  if (attr_pair_resized)    xfree (pairs);}#undef ADVANCE#undef SKIP_WS#undef SKIP_NON_WS#ifdef STANDALONEstatic voidtest_mapper (struct taginfo *taginfo, void *arg){  int i;  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);  for (i = 0; i < taginfo->nattrs; i++)    printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);  putchar ('\n');  ++*(int *)arg;}int main (){  int size = 256;  char *x = xmalloc (size);  int length = 0;  int read_count;  int tag_counter = 0;  while ((read_count = fread (x + length, 1, size - length, stdin)))    {      length += read_count;      size <<= 1;      x = xrealloc (x, size);    }  map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);  printf ("TAGS: %d\n", tag_counter);  printf ("Tag backouts:     %d\n", tag_backout_count);  printf ("Comment backouts: %d\n", comment_backout_count);  return 0;}#endif /* STANDALONE */
html-parse.c - 源码说明

本页面展示了「Wget很好的处理了http和ftp的下载,很值得学习的经典代码」中的 html-parse.c 源码文件，采用 C语言编程语言编写，共 1,074 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Wget相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?