📄 html-parse.c
字号:
* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end of
  text, as well as embedded newlines.  */

static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
  /* Offset in POOL at which the text written by this call begins;
     the down-casing pass at the bottom starts from here.  */
  int start_offset = pool->tail;

  /* Trim before decoding entities, so that a blank spelled as an
     entity is still inserted into the result.  */
  if (flags & AP_TRIM_BLANKS)
    {
      for (; beg < end && ISSPACE (*beg); ++beg)
        ;
      for (; end > beg && ISSPACE (end[-1]); --end)
        ;
    }

  if (!(flags & AP_DECODE_ENTITIES))
    {
      /* No decoding requested: append the text verbatim, followed by
         the terminating NUL.  */
      POOL_APPEND (pool, beg, end);
      POOL_APPEND_CHR (pool, '\0');
    }
  else
    {
      /* Reserve room for the whole input first, then copy it one
         character at a time, decoding entities along the way.
         Growing in advance is both safe and necessary: decoding an
         entity can only shorten the text, never lengthen it.  */
      const bool drop_newlines = (flags & AP_TRIM_BLANKS) != 0;
      const char *src = beg;
      char *dst;

      POOL_GROW (pool, end - beg);
      dst = pool->contents + pool->tail;

      while (src < end)
        {
          if (*src == '&')
            {
              int entity = decode_entity (&src, end);
              if (entity == -1)
                /* Not a recognized entity: copy the character SRC
                   points at now.  */
                *dst++ = *src++;
              else
                *dst++ = entity;
            }
          else if (drop_newlines && (*src == '\n' || *src == '\r'))
            ++src;
          else
            *dst++ = *src++;
        }

      /* The output must fit in the space reserved above.  (It
         shouldn't be otherwise, hence the assert.)  */
      assert (dst - (pool->contents + pool->tail) <= end - beg);

      /* Move POOL's tail just past what was written, then terminate
         the string.  */
      pool->tail = dst - pool->contents;
      POOL_APPEND_CHR (pool, '\0');
    }

  if (flags & AP_DOWNCASE)
    {
      char *p;
      for (p = pool->contents + start_offset; *p != '\0'; p++)
        *p = TOLOWER (*p);
    }
}

/* Originally we used to adhere to rfc 1866 here, and allowed only
   letters, digits, periods, and hyphens as names (of tags or
   attributes).  However, this broke too many pages which used
   proprietary or strange attributes, e.g. <img src="a.gif"
   v:shapes="whatever">.
So now we allow any character except: * whitespace * 8-bit and control chars * characters that clearly cannot be part of name: '=', '>', '/'. This only affects attribute and tag names; attribute values allow an even greater variety of characters. */#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ && (x) != '=' && (x) != '>' && (x) != '/')#ifdef STANDALONEstatic int comment_backout_count;#endif/* Advance over an SGML declaration, such as <!DOCTYPE ...>. In strict comments mode, this is used for skipping over comments as well. To recap: any SGML declaration may have comments associated with it, e.g. <!MY-DECL -- isn't this fun? -- foo bar> An HTML comment is merely an empty declaration (<!>) with a comment attached, like this: <!-- some stuff here --> Several comments may be embedded in one comment declaration: <!-- have -- -- fun --> Whitespace is allowed between and after the comments, but not before the first comment. Additionally, this function attempts to handle double quotes in SGML declarations correctly. */static const char *advance_declaration (const char *beg, const char *end){ const char *p = beg; char quote_char = '\0'; /* shut up, gcc! */ char ch; enum { AC_S_DONE, AC_S_BACKOUT, AC_S_BANG, AC_S_DEFAULT, AC_S_DCLNAME, AC_S_DASH1, AC_S_DASH2, AC_S_COMMENT, AC_S_DASH3, AC_S_DASH4, AC_S_QUOTE1, AC_S_IN_QUOTE, AC_S_QUOTE2 } state = AC_S_BANG; if (beg == end) return beg; ch = *p++; /* It looked like a good idea to write this as a state machine, but now I wonder... 
*/ while (state != AC_S_DONE && state != AC_S_BACKOUT) { if (p == end) state = AC_S_BACKOUT; switch (state) { case AC_S_DONE: case AC_S_BACKOUT: break; case AC_S_BANG: if (ch == '!') { ch = *p++; state = AC_S_DEFAULT; } else state = AC_S_BACKOUT; break; case AC_S_DEFAULT: switch (ch) { case '-': state = AC_S_DASH1; break; case ' ': case '\t': case '\r': case '\n': ch = *p++; break; case '>': state = AC_S_DONE; break; case '\'': case '\"': state = AC_S_QUOTE1; break; default: if (NAME_CHAR_P (ch)) state = AC_S_DCLNAME; else state = AC_S_BACKOUT; break; } break; case AC_S_DCLNAME: if (ch == '-') state = AC_S_DASH1; else if (NAME_CHAR_P (ch)) ch = *p++; else state = AC_S_DEFAULT; break; case AC_S_QUOTE1: /* We must use 0x22 because broken assert macros choke on '"' and '\"'. */ assert (ch == '\'' || ch == 0x22); quote_char = ch; /* cheating -- I really don't feel like introducing more different states for different quote characters. */ ch = *p++; state = AC_S_IN_QUOTE; break; case AC_S_IN_QUOTE: if (ch == quote_char) state = AC_S_QUOTE2; else ch = *p++; break; case AC_S_QUOTE2: assert (ch == quote_char); ch = *p++; state = AC_S_DEFAULT; break; case AC_S_DASH1: assert (ch == '-'); ch = *p++; state = AC_S_DASH2; break; case AC_S_DASH2: switch (ch) { case '-': ch = *p++; state = AC_S_COMMENT; break; default: state = AC_S_BACKOUT; } break; case AC_S_COMMENT: switch (ch) { case '-': state = AC_S_DASH3; break; default: ch = *p++; break; } break; case AC_S_DASH3: assert (ch == '-'); ch = *p++; state = AC_S_DASH4; break; case AC_S_DASH4: switch (ch) { case '-': ch = *p++; state = AC_S_DEFAULT; break; default: state = AC_S_COMMENT; break; } break; } } if (state == AC_S_BACKOUT) {#ifdef STANDALONE ++comment_backout_count;#endif return beg + 1; } return p;}/* Find the first occurrence of the substring "-->" in [BEG, END) and return the pointer to the character after the substring. If the substring is not found, return NULL. 
*/

static const char *
find_comment_end (const char *beg, const char *end)
{
  /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
     if it's not '>' or '-', advance by three characters.  Otherwise,
     look at the preceding characters and try to find a match.

     Every advance of P is preceded by a distance check against END,
     so P always stays inside [BEG, END]; no pointer before BEG or
     more than one past END is ever formed.  */
  const char *p;

  /* "-->" is three characters long; a shorter range cannot match.  */
  if (end - beg < 3)
    return NULL;

  p = beg + 2;
  for (;;)
    {
      switch (p[0])
        {
        case '>':
          if (p[-1] == '-' && p[-2] == '-')
            return p + 1;
          break;
        case '-':
        at_dash:
          if (p[-1] == '-')
            {
              /* Two dashes in a row: scan forward until something
                 other than a dash shows up.  */
            at_dash_dash:
              if (++p == end)
                return NULL;
              switch (p[0])
                {
                case '>':
                  return p + 1;
                case '-':
                  goto at_dash_dash;
                default:
                  break;
                }
            }
          else
            {
              /* A lone dash at P: if it starts "-->", the '>' is two
                 characters ahead.  */
              if (end - p <= 2)
                return NULL;
              p += 2;
              switch (p[0])
                {
                case '>':
                  if (p[-1] == '-')
                    return p + 1;
                  break;
                case '-':
                  goto at_dash;
                default:
                  break;
                }
            }
          break;
        default:
          break;
        }

      /* No match around P: skip ahead by the length of the pattern,
         unless that would reach or pass END.  */
      if (end - p <= 3)
        return NULL;
      p += 3;
    }
}

/* Return true if the string consisting of the characters inside
   [B, E) is present in hash table HT.  If HT is NULL, every name is
   allowed and true is returned without any lookup.  */

static bool
name_allowed (const struct hash_table *ht, const char *b, const char *e)
{
  char *copy;
  if (!ht)
    return true;
  /* Presumably stores a NUL-terminated copy of [B, E) in COPY so it
     can be used as a lookup key -- confirm against the macro.  */
  BOUNDED_TO_ALLOCA (b, e, copy);
  return hash_table_get (ht, copy) != NULL;
}

/* Advance P (a char pointer), with the explicit intent of being able
   to read the next character.  If this is not possible, go to finish.
   The expansion site must have a variable `end' and a label `finish'
   in scope.  */

#define ADVANCE(p) do {                         \
  ++p;                                          \
  if (p >= end)                                 \
    goto finish;                                \
} while (0)

/* Skip whitespace, if any.  Uses ADVANCE, so it may jump to finish.  */

#define SKIP_WS(p) do {                         \
  while (ISSPACE (*p)) {                        \
    ADVANCE (p);                                \
  }                                             \
} while (0)

/* Skip non-whitespace, if any.  */

#define SKIP_NON_WS(p) do {                     \
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -