📄 lexer.c
字号:
/*
  lexer.c - Lexer for html parser

  (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  See tidy.c for the copyright notice.
*/

/*
  Given a file stream fp it returns a sequence of tokens.

     GetToken(fp) gets the next token
     UngetToken(fp) provides one level undo

  The tags include an attribute list:

    - linked list of attribute/value nodes
    - each node has 2 null-terminated strings.
    - entities are replaced in attribute values

  white space is compacted if not in preformatted mode
  If not in preformatted mode then leading white space
  is discarded and subsequent white space sequences
  compacted to single space chars.

  If XmlTags is no then Tag names are folded to upper
  case and attribute names to lower case.

  Not yet done:
    - Doctype subset and marked sections
*/

#include "platform.h"
#include "html.h"

AttVal *ParseAttrs(Lexer *lexer, Bool *isempty);  /* forward references */
Node *CommentToken(Lexer *lexer);

/*
  Used to classify chars for lexical purposes.

  MAP(c) yields the lexmap[] flags for c when c is below 128 and 0
  otherwise. The argument is evaluated twice, so it must be free of
  side effects.
*/
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
uint lexmap[128];

#define XHTML_NAMESPACE "http://www.w3.org/1999/xhtml"
#define MML_NAMESPACE "http://www.w3.org/1998/Math/MathML"
#define XLINK_NAMESPACE "http://www.w3.org/1999/xlink"

/* the 3 URIs for the XHTML 1.0 DTDs */
#define voyager_loose "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
#define voyager_strict "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
#define voyager_frameset "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"

/* number of entries in W3C_Version[]; keep in step with the table */
#define W3C_VERSIONS 8

/*
  Table pairing each HTML version name with the matching XHTML 1.0
  name, the URI of that XHTML DTD and the VERS_xxx code.
*/
struct _vers
{
    char *name;
    char *voyager_name;
    char *profile;
    int code;
} W3C_Version[] =
{
    {"HTML 2.0", "XHTML 1.0 Strict", voyager_strict, VERS_HTML20},
    {"HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML32},
    {"HTML 4.0", "XHTML 1.0 Strict", voyager_strict, VERS_HTML40_STRICT},
    {"HTML 4.0 Transitional", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML40_LOOSE},
    {"HTML 4.0 Frameset", "XHTML 1.0 Frameset", voyager_frameset, VERS_FRAMES},
    {"HTML 4.01", "XHTML 1.0 Strict", voyager_strict, VERS_HTML40_STRICT},
    {"HTML 4.01 Transitional", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML40_LOOSE},
    {"HTML 4.01 Frameset", "XHTML 1.0 Frameset", voyager_frameset, VERS_FRAMES}
};

/* non-zero when the lexmap entry for c has the `white` flag set */
Bool IsWhite(uint c)
{
    uint map = MAP(c);

    return (Bool)(map & white);
}

/* non-zero when the lexmap entry for c has the `digit` flag set */
Bool IsDigit(uint c)
{
    uint map;

    map = MAP(c);

    return (Bool)(map & digit);
}

/* non-zero when the lexmap entry for c has the `letter` flag set */
Bool IsLetter(uint c)
{
    uint map;

    map = MAP(c);

    return (Bool)(map & letter);
}

/* maps chars flagged `uppercase` to lower case; others are returned as is */
uint ToLower(uint c)
{
    uint map = MAP(c);

    if (map & uppercase)
        c += 'a' - 'A';

    return c;
}

/* maps chars flagged `lowercase` to upper case; others are returned as is */
uint ToUpper(uint c)
{
    uint map = MAP(c);

    if (map & lowercase)
        c += 'A' - 'a';

    return c;
}

/*
  Folds c to upper case when tocaps is set and to lower case
  otherwise. When XmlTags is set c is returned unchanged.
*/
char FoldCase(char c, Bool tocaps)
{
    uint map;

    if (!XmlTags)
    {
        map = MAP(c);

        if (tocaps)
        {
            if (map & lowercase)
                c += 'A' - 'a';
        }
        else /* force to lower case */
        {
            if (map & uppercase)
                c += 'a' - 'A';
        }
    }

    return c;
}

/*
 node->type is one of these:

    #define TextNode    1
    #define StartTag    2
    #define EndTag      3
    #define StartEndTag 4
*/

/*
  Allocates a lexer reading from `in` and resets all of its state:
  position at line 1, column 1, state LEX_CONTENT, empty lexbuf and
  empty inline stack. Returns null when MemAlloc returns null.
*/
Lexer *NewLexer(StreamIn *in)
{
    Lexer *lexer;

    lexer = (Lexer *)MemAlloc(sizeof(Lexer));

    if (lexer != null)
    {
        lexer->in = in;
        lexer->lines = 1;
        lexer->columns = 1;
        lexer->state = LEX_CONTENT;
        lexer->badAccess = 0;
        lexer->badLayout = 0;
        lexer->badChars = 0;
        lexer->badForm = 0;
        lexer->warnings = 0;
        lexer->errors = no;
        lexer->waswhite = no;
        lexer->pushed = no;
        lexer->insertspace = no;
        lexer->exiled = no;
        lexer->isvoyager = no;
        lexer->versions = VERS_EVERYTHING;
        lexer->doctype = VERS_UNKNOWN;
        lexer->bad_doctype = no;
        lexer->txtstart = 0;
        lexer->txtend = 0;
        lexer->token = null;
        lexer->lexbuf = null;
        lexer->lexlength = 0;
        lexer->lexsize = 0;
        lexer->inode = null;
        lexer->insert = null;
        lexer->istack = null;
        lexer->istacklength = 0;
        lexer->istacksize = 0;
        lexer->istackbase = 0;
        lexer->styles = null;
    }

    return lexer;
}

/* reports feof() for the file underlying the lexer's input stream */
Bool EndOfInput(Lexer *lexer)
{
    return (feof(lexer->in->file));
}

/*
  Releases everything the lexer owns: the pushed-back token (if any),
  the text buffer, the inline stack and its entries, the style list,
  and finally the lexer itself.
*/
void FreeLexer(Lexer *lexer)
{
    if (lexer->pushed)
        FreeNode(lexer->token);

    if (lexer->lexbuf != null)
        MemFree(lexer->lexbuf);

    while (lexer->istacksize > 0)
        PopInline(lexer, null);

    if (lexer->istack)
        MemFree(lexer->istack);

    if (lexer->styles)
        FreeStyles(lexer);

    MemFree(lexer);
}

/*
  Appends one byte to lexbuf, growing the buffer (8192 bytes at
  first, then doubling) so that there is always room for the byte
  plus a terminating '\0'.
*/
static void AddByte(Lexer *lexer, uint c)
{
    if (lexer->lexsize + 1 >= lexer->lexlength)
    {
        while (lexer->lexsize + 1 >= lexer->lexlength)
        {
            if (lexer->lexlength == 0)
                lexer->lexlength = 8192;
            else
                lexer->lexlength = lexer->lexlength * 2;
        }

        lexer->lexbuf = (char *)MemRealloc(lexer->lexbuf, lexer->lexlength*sizeof(char));
    }

    lexer->lexbuf[lexer->lexsize++] = (char)c;
    lexer->lexbuf[lexer->lexsize] = '\0';  /* debug */
}

/* overwrites the last byte in lexbuf with c; no-op on an empty buffer */
void ChangeChar(Lexer *lexer, char c)
{
    if (lexer->lexsize > 0)
    {
        lexer->lexbuf[lexer->lexsize-1] = c;
    }
}

/*
  Store char c as UTF-8 encoded byte stream.

  Uses the 1 to 6 byte forms, which cover values up to 0x7FFFFFFF.
  Previously everything above 0x1FFFFF went through the 5 byte form,
  so values above 0x3FFFFFF spilled into the lead byte and produced
  an undecodable sequence.

  NOTE(review): values above 0x7FFFFFFF have no UTF-8 form and still
  yield an invalid lead byte; confirm no caller can pass one.
*/
void AddCharToLexer(Lexer *lexer, uint c)
{
    if (c < 128)
        AddByte(lexer, c);
    else if (c <= 0x7FF)
    {
        AddByte(lexer, 0xC0 | (c >> 6));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else if (c <= 0xFFFF)
    {
        AddByte(lexer, 0xE0 | (c >> 12));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else if (c <= 0x1FFFFF)
    {
        AddByte(lexer, 0xF0 | (c >> 18));
        AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else if (c <= 0x3FFFFFF)
    {
        AddByte(lexer, 0xF8 | (c >> 24));
        AddByte(lexer, 0x80 | ((c >> 18) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else
    {
        AddByte(lexer, 0xFC | (c >> 30));
        AddByte(lexer, 0x80 | ((c >> 24) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 18) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
}

/*
  Appends each char of the null-terminated string str to the lexer
  via AddCharToLexer.

  Each char is converted through unsigned char first: where plain
  char is signed, a byte >= 0x80 would otherwise sign-extend to a
  huge uint and be encoded as a bogus multi-byte sequence.
*/
void AddStringToLexer(Lexer *lexer, char *str)
{
    uint c;

    while ((c = (unsigned char)*str++) != 0)
        AddCharToLexer(lexer, c);
}

/*
  No longer attempts to insert missing ';' for unknown
  entities unless one was present already, since this
  gives unexpected results.

For example: <a href="something.htm?foo&bar&fred"> was tidied to: <a href="something.htm?foo&bar;&fred;"> rather than: <a href="something.htm?foo&bar&fred"> My thanks for Maurice Buxton for spotting this.*/static void ParseEntity(Lexer *lexer, int mode){ uint start, map; Bool first = yes, semicolon = no; int c, ch, startcol; start = lexer->lexsize - 1; /* to start at "&" */ startcol = lexer->in->curcol - 1; while ((c = ReadChar(lexer->in)) != EndOfStream) { if (c == ';') { semicolon = yes; break; } if (first && c == '#') { AddCharToLexer(lexer, c); first = no; continue; } first = no; map = MAP(c); if (map & namechar) { AddCharToLexer(lexer, c); continue; } /* otherwise put it back */ UngetChar(c, lexer->in); break; } /* make sure entity is null terminated */ lexer->lexbuf[lexer->lexsize] = '\0'; ch = EntityCode(lexer->lexbuf+start); /* deal with unrecognized entities */ if (ch <= 0) { /* set error position just before offending chararcter */ lexer->lines = lexer->in->curline; lexer->columns = startcol; if (lexer->lexsize > start +1 ) { ReportEntityError(lexer, UNKNOWN_ENTITY, lexer->lexbuf+start, ch); if (semicolon) AddCharToLexer(lexer, ';'); } else /* naked & */ ReportEntityError(lexer, UNESCAPED_AMPERSAND, lexer->lexbuf+start, ch); } else { if (c != ';') /* issue warning if not terminated by ';' */ { /* set error position just before offending chararcter */ lexer->lines = lexer->in->curline; lexer->columns = startcol; ReportEntityError(lexer, MISSING_SEMICOLON, lexer->lexbuf+start, c); } lexer->lexsize = start; if (ch == 160 && (mode & Preformatted)) ch = ' '; AddCharToLexer(lexer, ch); if (ch == '&' && !QuoteAmpersand) { AddCharToLexer(lexer, 'a'); AddCharToLexer(lexer, 'm'); AddCharToLexer(lexer, 'p'); AddCharToLexer(lexer, ';'); } }}static char ParseTagName(Lexer *lexer){ int map; uint c; /* fold case of first char in buffer */ c = lexer->lexbuf[lexer->txtstart]; map = MAP(c); if (!XmlTags && (map & uppercase) != 0) { c -= (uint)('A' - 'a'); 
lexer->lexbuf[lexer->txtstart] = c; } while ((c = ReadChar(lexer->in)) != EndOfStream) { map = MAP(c); if ((map & namechar) == 0) break; /* fold case of subsequent chars */ if (!XmlTags && (map & uppercase) != 0) c -= (uint)('A' - 'a'); AddCharToLexer(lexer, c); } lexer->txtend = lexer->lexsize; return c;}/* Used for elements and text nodes element name is null for text nodes start and end are offsets into lexbuf which contains the textual content of all elements in the parse tree. parent and content allow traversal of the parse tree in any direction. attributes are represented as a linked list of AttVal nodes which hold the strings for attribute/value pairs.*/Node *NewNode(void){ Node *node; node = (Node *)MemAlloc(sizeof(Node)); node->parent = null; node->prev = null; node->next = null; node->last = null; node->start = 0; node->end = 0; node->type = TextNode; node->closed = no; node->implicit = no; node->tag = null; node->was = null; node->element = null; node->attributes = null; node->content = null; return node;}/* used to clone heading nodes when split by an <HR> */Node *CloneNode(Lexer *lexer, Node *element){ Node *node; node = NewNode(); node->parent = element->parent; node->start = lexer->lexsize; node->end = lexer->lexsize; node->type = element->type; node->closed = element->closed; node->implicit = element->implicit; node->tag = element->tag; node->element = wstrdup(element->element); node->attributes = DupAttrs(element->attributes); return node;}/* free node's attributes */void FreeAttrs(Node *node){ AttVal *av; while (node->attributes) { av = node->attributes; if (av->attribute) MemFree(av->attribute); if (av->value) MemFree(av->value); node->attributes = av->next; MemFree(av); }}/* doesn't repair attribute list linkage */void FreeAttribute(AttVal *av){ if (av->attribute) MemFree(av->attribute); if (av->value) MemFree(av->value); MemFree(av);}/* Free document nodes by iterating through peers and recursing through children. 
Set next to null before calling FreeNode() to avoid freeing peer nodes. Doesn't patch up prev/next links. */void FreeNode(Node *node){ AttVal *av; Node *next;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -