📄 lexer.c
字号:
static void MapStr(char *str, uint code){ uint i; while (*str) { i = (uint)(*str++); lexmap[i] |= code; }}void InitMap(void){ MapStr("\r\n\f", newline|white); MapStr(" \t", white); MapStr("-.:_", namechar); MapStr("0123456789", digit|namechar); MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar); MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);}/* parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to tailor the attribute value. Here is an example of a work around for using ASP in attribute values: href="<%=rsSchool.Fields("ID").Value%>" where the ASP that generates the attribute value is masked from Tidy by the quotemarks.*/Node *ParseAsp(Lexer *lexer){ uint c; Node *asp = null; lexer->txtstart = lexer->lexsize; for (;;) { c = ReadChar(lexer->in); AddCharToLexer(lexer, c); if (c != '%') continue; c = ReadChar(lexer->in); AddCharToLexer(lexer, c); if (c == '>') break; } lexer->lexsize -= 2; lexer->txtend = lexer->lexsize; if (lexer->txtend > lexer->txtstart) asp = AspToken(lexer); lexer->txtstart = lexer->txtend; return asp;} /* PHP is like ASP but is based upon XML processing instructions, e.g. <?php ... ?>*/Node *ParsePhp(Lexer *lexer){ uint c; Node *php = null; lexer->txtstart = lexer->lexsize; for (;;) { c = ReadChar(lexer->in); AddCharToLexer(lexer, c); if (c != '?') continue; c = ReadChar(lexer->in); AddCharToLexer(lexer, c); if (c == '>') break; } lexer->lexsize -= 2; lexer->txtend = lexer->lexsize; if (lexer->txtend > lexer->txtstart) php = PhpToken(lexer); lexer->txtstart = lexer->txtend; return php;} /* consumes the '>' terminating start tags */char *ParseAttribute(Lexer *lexer, Bool *isempty, Node **asp, Node **php){ int map, start, len = 0; char *attr; uint c; *asp = null; /* clear asp pointer */ *php = null; /* clear php pointer */ /* skip white space before the attribute */ for (;;) { c = ReadChar(lexer->in); if (c == '/') { c = ReadChar(lexer->in); if (c == '>') { *isempty = yes; return null; } UngetChar(c, lexer->in); c = '/'; break; } if (c == '>') return null; if (c =='<') { c = ReadChar(lexer->in); if (c == '%') { *asp = ParseAsp(lexer); return null; } else if (c == '?') { *php = ParsePhp(lexer); return null; } UngetChar(c, lexer->in); ReportAttrError(lexer, lexer->token, null, UNEXPECTED_GT); return null; } if (c == '"' || c == '\'') { ReportAttrError(lexer, lexer->token, null, UNEXPECTED_QUOTEMARK); continue; } if (c == EndOfStream) { ReportAttrError(lexer, lexer->token, null, UNEXPECTED_END_OF_FILE); UngetChar(c, lexer->in); return null; } map = MAP(c); if ((map & white) == 0) break; } start = lexer->lexsize; for (;;) { /* but push back '=' for parseValue() */ if (c == '=' || c == '>') { UngetChar(c, lexer->in); break; } if (c == '<' || c == EndOfStream) { UngetChar(c, lexer->in); break; } map = MAP(c); if ((map & white) != 0) break; /* what should be done about non-namechar characters? */ /* currently these are incorporated into the attr name */ if (!XmlTags && (map & uppercase) != 0) c += (uint)('a' - 'A'); ++len; AddCharToLexer(lexer, c); c = ReadChar(lexer->in); } attr = (len > 0 ? wstrndup(lexer->lexbuf+start, len) : null); lexer->lexsize = start; return attr;}/* invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this routine recognizes ' and " quoted strings*/int ParseServerInstruction(Lexer *lexer){ int c, map, delim = '"'; Bool isrule = no; c = ReadChar(lexer->in); AddCharToLexer(lexer, c); /* check for ASP, PHP or Tango */ if (c == '%' || c == '?' || c == '@') isrule = yes; for (;;) { c = ReadChar(lexer->in); if (c == EndOfStream) break; if (c == '>') { if (isrule) AddCharToLexer(lexer, c); else UngetChar(c, lexer->in); break; } /* if not recognized as ASP, PHP or Tango */ /* then also finish value on whitespace */ if (!isrule) { map = MAP(c); if ((map & white) != 0) break; } AddCharToLexer(lexer, c); if (c == '"') { do { c = ReadChar(lexer->in); AddCharToLexer(lexer, c); } while (c != '"'); delim = '\''; continue; } if (c == '\'') { do { c = ReadChar(lexer->in); AddCharToLexer(lexer, c); } while (c != '\''); } } return delim;}/* values start with "=" or " = " etc. *//* doesn't consume the ">" at end of start tag */char *ParseValue(Lexer *lexer, char *name, Bool foldCase, Bool *isempty, int *pdelim){ int len = 0, start, map; Bool seen_gt = no; uint c, lastc, delim, quotewarning; char *value; delim = (char) 0; *pdelim = '"'; /* skip white space before the '=' */ for (;;) { c = ReadChar(lexer->in); if (c == EndOfStream) { UngetChar(c, lexer->in); break; } map = MAP(c); if ((map & white) == 0) break; }/* c should be '=' if there is a value other legal possibilities are white space, '/' and '>'*/ if (c != '=') { UngetChar(c, lexer->in); return null; } /* skip white space after '=' */ for (;;) { c = ReadChar(lexer->in); if (c == EndOfStream) { UngetChar(c, lexer->in); break; } map = MAP(c); if ((map & white) == 0) break; } /* check for quote marks */ if (c == '"' || c == '\'') delim = c; else if (c == '<') { start = lexer->lexsize; AddCharToLexer(lexer, c); *pdelim = ParseServerInstruction(lexer); len = lexer->lexsize - start; lexer->lexsize = start; return (len > 0 ? wstrndup(lexer->lexbuf+start, len) : null); } else UngetChar(c, lexer->in); /* and read the value string check for quote mark if needed */ quotewarning = 0; start = lexer->lexsize; c = '\0'; for (;;) { lastc = c; /* track last character */ c = ReadChar(lexer->in); if (c == EndOfStream) { ReportAttrError(lexer, lexer->token, null, UNEXPECTED_END_OF_FILE); UngetChar(c, lexer->in); break; } if (delim == (char)0) { if (c == '>') { UngetChar(c, lexer->in); break; } if (c == '<') { /* UngetChar(c, lexer->in); */ ReportAttrError(lexer, lexer->token, null, UNEXPECTED_GT); /* break; */ } /* For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however care is needed to avoid so treating <a href=http://www.acme.com/> in this way, which would map the <a> tag to <a href="http://www.acme.com"/> */ if (c == '/') { /* peek ahead in case of /> */ c = ReadChar(lexer->in); if (c == '>' && !IsUrl(name)) { *isempty = yes; UngetChar(c, lexer->in); break; } /* unget peeked char */ UngetChar(c, lexer->in); c = '/'; } } else /* delim is '\'' or '"' */ { if (c == delim) break; /* treat CRLF, CR and LF as single line break */ if (c == '\r') { if ((c = ReadChar(lexer->in)) != '\n') UngetChar(c, lexer->in); c = '\n'; } if (c == '\n' || c == '<' || c == '>') ++quotewarning; if (c == '>') seen_gt = yes; } if (c == '&') { AddCharToLexer(lexer, c); ParseEntity(lexer, null); continue; } /* kludge for JavaScript attribute values with line continuations in string literals */ if (c == '\\') { c = ReadChar(lexer->in); if (c != '\n') { UngetChar(c, lexer->in); c = '\\'; } } map = MAP(c); if (map & white) { if (delim == (char)0) break; c = ' '; if (lastc == ' ') continue; } else if (foldCase && (map & uppercase) != 0) c += (uint)('a' - 'A'); AddCharToLexer(lexer, c); } if (quotewarning > 10 && seen_gt) { /* there is almost certainly a missing trailling quote mark as we have see too many newlines, < or > characters. an exception is made for Javascript attributes and the javascript URL scheme which may legitimately include < and > */ if (!IsScript(name) && !(IsUrl(name) && wstrncmp(lexer->lexbuf+start, "javascript:", 11) == 0)) ReportError(lexer, null, null, SUSPECTED_MISSING_QUOTE); } len = lexer->lexsize - start; lexer->lexsize = start; if (len > 0 || del
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -