📄 html-parse.c
字号:
while (!ISSPACE (*p)) {                         \
    ADVANCE (p);                                \
  }                                             \
} while (0)

#ifdef STANDALONE
/* Number of times the scanner gave up on a "<..." sequence and treated
   it as plain text.  Only reported by the standalone test driver.  */
static int tag_backout_count;
#endif

/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
   MAPFUN will be called with two arguments: pointer to an initialized
   struct taginfo, and MAPARG.

   FLAGS is a bit mask.  The bits examined here are MHT_STRICT_COMMENTS
   (do not use the lenient "<!-- ... -->" scan; parse comments as
   declarations instead) and MHT_TRIM_VALUES (request AP_TRIM_BLANKS
   processing for quoted attribute values).

   ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
   which are the tags and attribute names that this function should
   use.  If ALLOWED_TAGS is NULL, all tags are processed; if
   ALLOWED_ATTRIBUTES is NULL, all attributes are returned.

   (Obviously, the caller can filter out unwanted tags and attributes
   just as well, but this is just an optimization designed to avoid
   unnecessary copying of tags/attributes which the caller doesn't
   care about.)

   The name and attribute strings handed to MAPFUN are stored in a
   pool that is rewound before the next tag is scanned and freed on
   return, so MAPFUN should copy whatever it needs to keep.

   Nothing is done when SIZE is zero or negative.  */

void
map_html_tags (const char *text, int size,
               void (*mapfun) (struct taginfo *, void *), void *maparg,
               int flags,
               const struct hash_table *allowed_tags,
               const struct hash_table *allowed_attributes)
{
  /* storage for strings passed to MAPFUN callback; if 256 bytes is
     too little, POOL_APPEND allocates more with malloc. */
  char pool_initial_storage[256];
  struct pool pool;

  const char *p = text;
  const char *end = text + size;

  struct attr_pair attr_pair_initial_storage[8];
  int attr_pair_size = countof (attr_pair_initial_storage);
  bool attr_pair_resized = false;
  struct attr_pair *pairs = attr_pair_initial_storage;

  /* A negative SIZE would put END before TEXT and turn into a huge
     length once converted to size_t for memchr(), so reject it along
     with the empty input.  */
  if (size <= 0)
    return;

  POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));

  {
    int nattrs, end_tag;
    const char *tag_name_begin, *tag_name_end;
    const char *tag_start_position;
    bool uninteresting_tag;

  look_for_tag:
    POOL_REWIND (&pool);

    nattrs = 0;
    end_tag = 0;

    /* Find beginning of tag.  We use memchr() instead of the usual
       looping with ADVANCE() for speed. */
    p = memchr (p, '<', end - p);
    if (!p)
      goto finish;

    tag_start_position = p;
    ADVANCE (p);

    /* Establish the type of the tag (start-tag, end-tag or
       declaration).  */
    if (*p == '!')
      {
        /* P points at '!' and P < END holds here.  Reading p[1] and
           p[2] additionally requires at least three characters to be
           left in the buffer; that also keeps P + 3 within
           [TEXT, END] for find_comment_end().  */
        if (!(flags & MHT_STRICT_COMMENTS)
            && end - p >= 3 && p[1] == '-' && p[2] == '-')
          {
            /* If strict comments are not enforced and if we know
               we're looking at a comment, simply look for the
               terminating "-->".  Non-strict is the default because
               it works in other browsers and most HTML writers can't
               be bothered with getting the comments right.  */
            const char *comment_end = find_comment_end (p + 3, end);
            if (comment_end)
              p = comment_end;
          }
        else
          {
            /* Either in strict comment mode or looking at a non-empty
               declaration.  Real declarations are much less likely to
               be misused the way comments are, so advance over them
               properly regardless of strictness.  */
            p = advance_declaration (p, end);
          }
        if (p == end)
          goto finish;
        goto look_for_tag;
      }
    else if (*p == '/')
      {
        end_tag = 1;
        ADVANCE (p);
      }
    tag_name_begin = p;
    while (NAME_CHAR_P (*p))
      ADVANCE (p);
    if (p == tag_name_begin)
      goto look_for_tag;
    tag_name_end = p;
    SKIP_WS (p);

    /* An end tag may carry nothing but its name.  */
    if (end_tag && *p != '>')
      goto backout_tag;

    if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
      /* We can't just say "goto look_for_tag" here because we need
         the loop below to properly advance over the tag's attributes.  */
      uninteresting_tag = true;
    else
      {
        uninteresting_tag = false;
        convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
      }

    /* Find the attributes. */
    while (1)
      {
        const char *attr_name_begin, *attr_name_end;
        const char *attr_value_begin, *attr_value_end;
        const char *attr_raw_value_begin, *attr_raw_value_end;
        int operation = AP_DOWNCASE;            /* stupid compiler. */

        SKIP_WS (p);

        if (*p == '/')
          {
            /* A slash at this point means the tag is about to be
               closed.  This is legal in XML and has been popularized
               in HTML via XHTML.  */
            /* <foo a=b c=d /> */
            /*              ^  */
            ADVANCE (p);
            SKIP_WS (p);
            if (*p != '>')
              goto backout_tag;
          }

        /* Check for end of tag definition. */
        if (*p == '>')
          break;

        /* Establish bounds of attribute name. */
        attr_name_begin = p;                    /* <foo bar ...> */
                                                /*      ^        */
        while (NAME_CHAR_P (*p))
          ADVANCE (p);
        attr_name_end = p;                      /* <foo bar ...> */
                                                /*         ^     */
        if (attr_name_begin == attr_name_end)
          goto backout_tag;

        /* Establish bounds of attribute value. */
        SKIP_WS (p);
        if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
          {
            /* Minimized attribute syntax allows `=' to be omitted.
               For example, <UL COMPACT> is a valid shorthand for <UL
               COMPACT="compact">.  Even if such attributes are not
               useful to Wget, we need to support them, so that the
               tags containing them can be parsed correctly. */
            attr_raw_value_begin = attr_value_begin = attr_name_begin;
            attr_raw_value_end = attr_value_end = attr_name_end;
          }
        else if (*p == '=')
          {
            ADVANCE (p);
            SKIP_WS (p);
            if (*p == '\"' || *p == '\'')
              {
                bool newline_seen = false;
                char quote_char = *p;
                attr_raw_value_begin = p;
                ADVANCE (p);
                attr_value_begin = p;           /* <foo bar="baz"> */
                                                /*           ^     */
                while (*p != quote_char)
                  {
                    if (!newline_seen && *p == '\n')
                      {
                        /* If a newline is seen within the quotes, it
                           is most likely that someone forgot to close
                           the quote.  In that case, we back out to
                           the value beginning, and terminate the tag
                           at either `>' or the delimiter, whichever
                           comes first.  Such a tag terminated at `>'
                           is discarded.  */
                        p = attr_value_begin;
                        newline_seen = true;
                        continue;
                      }
                    else if (newline_seen && *p == '>')
                      break;
                    ADVANCE (p);
                  }
                attr_value_end = p;             /* <foo bar="baz"> */
                                                /*              ^  */
                if (*p == quote_char)
                  ADVANCE (p);
                else
                  goto look_for_tag;
                attr_raw_value_end = p;         /* <foo bar="baz"> */
                                                /*               ^ */
                operation = AP_DECODE_ENTITIES;
                if (flags & MHT_TRIM_VALUES)
                  operation |= AP_TRIM_BLANKS;
              }
            else
              {
                attr_value_begin = p;           /* <foo bar=baz> */
                                                /*          ^    */
                /* According to SGML, a name token should consist only
                   of alphanumerics, . and -.  However, this is often
                   violated by, for instance, `%' in `width=75%'.
                   We'll be liberal and allow just about anything as
                   an attribute value.  */
                while (!ISSPACE (*p) && *p != '>')
                  ADVANCE (p);
                attr_value_end = p;             /* <foo bar=baz qux=quix> */
                                                /*             ^          */
                if (attr_value_begin == attr_value_end)
                  /* <foo bar=> */
                  /*          ^ */
                  goto backout_tag;
                attr_raw_value_begin = attr_value_begin;
                attr_raw_value_end = attr_value_end;
                operation = AP_DECODE_ENTITIES;
              }
          }
        else
          {
            /* We skipped the whitespace and found something that is
               neither `=' nor the beginning of the next attribute's
               name.  Back out.  */
            goto backout_tag;                   /* <foo bar [... */
                                                /*          ^    */
          }

        /* If we're not interested in the tag, don't bother with any
           of the attributes.  */
        if (uninteresting_tag)
          continue;

        /* If we aren't interested in the attribute, skip it.  We
           cannot do this test any sooner, because our text pointer
           needs to correctly advance over the attribute.  */
        if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
          continue;

        GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
                    struct attr_pair);

        /* Record pool offsets rather than pointers: the pool may still
           move while further attributes are appended.  */
        pairs[nattrs].name_pool_index = pool.tail;
        convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);

        pairs[nattrs].value_pool_index = pool.tail;
        convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
        pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
        pairs[nattrs].value_raw_size = (attr_raw_value_end
                                        - attr_raw_value_begin);
        ++nattrs;
      }

    if (uninteresting_tag)
      {
        ADVANCE (p);
        goto look_for_tag;
      }

    /* By now, we have a valid tag with a name and zero or more
       attributes.  Fill in the data and call the mapper function.  */
    {
      int i;
      struct taginfo taginfo;

      taginfo.name = pool.contents;
      taginfo.end_tag_p = end_tag;
      taginfo.nattrs = nattrs;
      /* We fill in the char pointers only now, when pool can no
         longer get realloc'ed.  If we did that above, we could get
         hosed by reallocation.  Obviously, after this point, the pool
         may no longer be grown.  */
      for (i = 0; i < nattrs; i++)
        {
          pairs[i].name = pool.contents + pairs[i].name_pool_index;
          pairs[i].value = pool.contents + pairs[i].value_pool_index;
        }
      taginfo.attrs = pairs;
      taginfo.start_position = tag_start_position;
      /* P still points at the closing `>'; the end position is one
         past it.  */
      taginfo.end_position = p + 1;
      mapfun (&taginfo, maparg);
      ADVANCE (p);
    }
    goto look_for_tag;

  backout_tag:
#ifdef STANDALONE
    ++tag_backout_count;
#endif
    /* The tag wasn't really a tag.  Treat its contents as ordinary
       data characters.  */
    p = tag_start_position + 1;
    goto look_for_tag;
  }

 finish:
  POOL_FREE (&pool);
  if (attr_pair_resized)
    xfree (pairs);
}

#undef ADVANCE
#undef SKIP_WS
#undef SKIP_NON_WS

#ifdef STANDALONE
/* Callback for the standalone driver: print the tag (prefixed with "/"
   for end tags) followed by its name=value pairs, one tag per line,
   and bump the int counter that ARG points to.  */
static void
test_mapper (struct taginfo *taginfo, void *arg)
{
  int i;

  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
  for (i = 0; i < taginfo->nattrs; i++)
    printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
  putchar ('\n');
  ++*(int *)arg;
}

/* Standalone driver: read all of stdin into a growing buffer, run
   map_html_tags over it with no filtering, and print the tag and
   backout counters.  */
int main (void)
{
  int size = 256;
  char *x = xmalloc (size);
  int length = 0;
  int read_count;
  int tag_counter = 0;

  /* The buffer is doubled after every successful read, so there is
     always room for the next fread.  */
  while ((read_count = fread (x + length, 1, size - length, stdin)))
    {
      length += read_count;
      size <<= 1;
      x = xrealloc (x, size);
    }

  map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
  printf ("TAGS: %d\n", tag_counter);
  printf ("Tag backouts:     %d\n", tag_backout_count);
  printf ("Comment backouts: %d\n", comment_backout_count);
  return 0;
}
#endif /* STANDALONE */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -