⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lexer.c

📁 我搜集到的一个java常用类库的源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
/*
  lexer.c - Lexer for html parser

  (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  See tidy.c for the copyright notice.
*/

/*
  Given a file stream fp it returns a sequence of tokens.

     GetToken(fp) gets the next token
     UngetToken(fp) provides one level undo

  The tags include an attribute list:

    - linked list of attribute/value nodes
    - each node has 2 null-terminated strings.
    - entities are replaced in attribute values

  white space is compacted if not in preformatted mode
  If not in preformatted mode then leading white space
  is discarded and subsequent white space sequences
  compacted to single space chars.

  If XmlTags is no then Tag names are folded to upper
  case and attribute names to lower case.

  Not yet done:
    -   Doctype subset and marked sections
*/

#include "platform.h"
#include "html.h"

AttVal *ParseAttrs(Lexer *lexer, Bool *isempty);  /* forward references */
Node *CommentToken(Lexer *lexer);

/*
  used to classify chars for lexical purposes

  Values below 128 are looked up in lexmap; everything else
  classifies as 0 (no flags). The argument is parenthesized so
  that the cast applies to the whole expression passed in.
*/
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
uint lexmap[128];

#define XHTML_NAMESPACE "http://www.w3.org/1999/xhtml"
#define MML_NAMESPACE  "http://www.w3.org/1998/Math/MathML"
#define XLINK_NAMESPACE "http://www.w3.org/1999/xlink"

/* the 3 URIs  for the XHTML 1.0 DTDs */
#define voyager_loose    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
#define voyager_strict   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
#define voyager_frameset "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"

/* number of entries in W3C_Version[] below; keep the two in step */
#define W3C_VERSIONS 8

/*
  Each entry pairs an HTML version name with the name of the
  corresponding XHTML 1.0 flavour, that flavour's DTD URI and
  a VERS_* code.
*/
struct _vers
{
    char *name;
    char *voyager_name;
    char *profile;
    int code;
} W3C_Version[] =
{
    {"HTML 2.0", "XHTML 1.0 Strict", voyager_strict, VERS_HTML20},
    {"HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML32},
    {"HTML 4.0", "XHTML 1.0 Strict", voyager_strict, VERS_HTML40_STRICT},
    {"HTML 4.0 Transitional", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML40_LOOSE},
    {"HTML 4.0 Frameset", "XHTML 1.0 Frameset", voyager_frameset, VERS_FRAMES},
    {"HTML 4.01", "XHTML 1.0 Strict", voyager_strict, VERS_HTML40_STRICT},
    {"HTML 4.01 Transitional", "XHTML 1.0 Transitional", voyager_loose, VERS_HTML40_LOOSE},
    {"HTML 4.01 Frameset", "XHTML 1.0 Frameset", voyager_frameset, VERS_FRAMES}
};

/* true if c carries the `white' flag in lexmap (always false for c >= 128) */
Bool IsWhite(uint c)
{
    uint map = MAP(c);

    return (Bool)(map & white);
}

/* true if c carries the `digit' flag in lexmap (always false for c >= 128) */
Bool IsDigit(uint c)
{
    uint map;

    map = MAP(c);

    return (Bool)(map & digit);
}

/* true if c carries the `letter' flag in lexmap (always false for c >= 128) */
Bool IsLetter(uint c)
{
    uint map;

    map = MAP(c);

    return (Bool)(map & letter);
}

/* returns c shifted by 'a' - 'A' if lexmap flags it uppercase, else c unchanged */
uint ToLower(uint c)
{
    uint map = MAP(c);

    if (map & uppercase)
        c += 'a' - 'A';

    return c;
}

/* returns c shifted by 'A' - 'a' if lexmap flags it lowercase, else c unchanged */
uint ToUpper(uint c)
{
    uint map = MAP(c);

    if (map & lowercase)
        c += 'A' - 'a';

    return c;
}

/*
  Folds the case of c unless XmlTags is set, in which case c
  is returned untouched. tocaps selects the direction: non-zero
  folds lowercase to upper, zero folds uppercase to lower.
*/
char FoldCase(char c, Bool tocaps)
{
    uint map;

    if (!XmlTags)
    {
        map = MAP(c);

        if (tocaps)
        {
            if (map & lowercase)
                c += 'A' - 'a';
        }
        else /* force to lower case */
        {
            if (map & uppercase)
                c += 'a' - 'A';
        }
    }

    return c;
}

/*
   node->type is one of these:

    #define TextNode    1
    #define StartTag    2
    #define EndTag      3
    #define StartEndTag 4
*/

/*
  Allocates a Lexer reading from `in' and sets every field to
  its initial value: position at line 1 / column 1, state
  LEX_CONTENT, all counters and flags cleared, no lexbuf and no
  inline stack yet. Returns whatever MemAlloc returned; the
  fields are only initialised when that is non-null.
*/
Lexer *NewLexer(StreamIn *in)
{
    Lexer *lexer;

    lexer = (Lexer *)MemAlloc(sizeof(Lexer));

    if (lexer != null)
    {
        lexer->in = in;
        lexer->lines = 1;
        lexer->columns = 1;
        lexer->state = LEX_CONTENT;
        lexer->badAccess = 0;
        lexer->badLayout = 0;
        lexer->badChars = 0;
        lexer->badForm = 0;
        lexer->warnings = 0;
        lexer->errors = no;
        lexer->waswhite = no;
        lexer->pushed = no;
        lexer->insertspace = no;
        lexer->exiled = no;
        lexer->isvoyager = no;
        lexer->versions = VERS_EVERYTHING;
        lexer->doctype = VERS_UNKNOWN;
        lexer->bad_doctype = no;
        lexer->txtstart = 0;
        lexer->txtend = 0;
        lexer->token = null;
        lexer->lexbuf =  null;
        lexer->lexlength = 0;
        lexer->lexsize = 0;
        lexer->inode = null;
        lexer->insert = null;
        lexer->istack = null;
        lexer->istacklength = 0;
        lexer->istacksize = 0;
        lexer->istackbase = 0;
        lexer->styles = null;
    }

    return lexer;
}

/* reports feof() on the lexer's underlying input file */
Bool EndOfInput(Lexer *lexer)
{
    return  (feof(lexer->in->file));
}

/*
  Releases everything the lexer holds: a pushed-back token (if
  any), the text buffer, all entries on the inline stack and the
  stack itself, the style list, and finally the Lexer struct.
*/
void FreeLexer(Lexer *lexer)
{
    if (lexer->pushed)
        FreeNode(lexer->token);

    if (lexer->lexbuf != null)
        MemFree(lexer->lexbuf);

    while (lexer->istacksize > 0)
        PopInline(lexer, null);

    if (lexer->istack)
        MemFree(lexer->istack);

    if (lexer->styles)
        FreeStyles(lexer);

    MemFree(lexer);
}

/*
  Appends one byte to lexbuf and keeps the buffer null
  terminated. The buffer starts at 8192 bytes and doubles
  whenever there is no room for the byte plus the terminator.
*/
static void AddByte(Lexer *lexer, uint c)
{
    if (lexer->lexsize + 1 >= lexer->lexlength)
    {
        while (lexer->lexsize + 1 >= lexer->lexlength)
        {
            if (lexer->lexlength == 0)
                lexer->lexlength = 8192;
            else
                lexer->lexlength = lexer->lexlength * 2;
        }

        lexer->lexbuf = (char *)MemRealloc(lexer->lexbuf, lexer->lexlength*sizeof(char));
    }

    lexer->lexbuf[lexer->lexsize++] = (char)c;
    lexer->lexbuf[lexer->lexsize] = '\0';  /* debug */
}

/* overwrites the last byte in lexbuf with c; no-op on an empty buffer */
void ChangeChar(Lexer *lexer, char c)
{
    if (lexer->lexsize > 0)
    {
        lexer->lexbuf[lexer->lexsize-1] = c;
    }
}

/*
  store char c as UTF-8 encoded byte stream

  One byte below 128, two up to 0x7FF, three up to 0xFFFF,
  four up to 0x1FFFFF and five bytes for anything larger.
*/
void AddCharToLexer(Lexer *lexer, uint c)
{
    if (c < 128)
        AddByte(lexer, c);
    else if (c <= 0x7FF)
    {
        AddByte(lexer, 0xC0 | (c >> 6));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else if (c <= 0xFFFF)
    {
        AddByte(lexer, 0xE0 | (c >> 12));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else if (c <= 0x1FFFFF)
    {
        AddByte(lexer, 0xF0 | (c >> 18));
        AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
    else
    {
        AddByte(lexer, 0xF8 | (c >> 24));
        AddByte(lexer, 0x80 | ((c >> 18) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 12) & 0x3F));
        AddByte(lexer, 0x80 | ((c >> 6) & 0x3F));
        AddByte(lexer, 0x80 | (c & 0x3F));
    }
}

/*
  Appends each char of the null terminated string str to the
  lexer via AddCharToLexer.
*/
void AddStringToLexer(Lexer *lexer, char *str)
{
    uint c;

    /*
      Plain char may be signed. Converting a negative char
      straight to uint sign-extends it into a value above
      0x1FFFFF, which AddCharToLexer would emit as a bogus
      5 byte sequence. Going through unsigned char keeps
      every byte in the range 0..255.
    */
    while ((c = (unsigned char)*str++) != 0)
        AddCharToLexer(lexer, c);
}

/*
  No longer attempts to insert missing ';' for unknown
  entities unless one was present already, since this
  gives unexpected results.
For example:   <a href="something.htm?foo&bar&fred">
  was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
  rather than:   <a href="something.htm?foo&amp;bar&amp;fred">

  My thanks to Maurice Buxton for spotting this.
*/

/*
  Called with the "&" already at the end of lexbuf. Reads the
  entity name (optionally starting with '#') from the input,
  appending it to lexbuf, and stops at ';', at the first char
  that is not a namechar (which is pushed back), or at end of
  stream.

  If EntityCode() does not recognise the text, an error is
  reported and the text is left in lexbuf, with the ';' re-added
  only if the source had one. Otherwise the text is replaced by
  the character itself; a missing ';' is reported as a warning,
  code 160 becomes a space when mode has the Preformatted bit,
  and "&" is followed by "amp;" unless QuoteAmpersand is set.
*/
static void ParseEntity(Lexer *lexer, int mode)
{
    uint start;
    Bool first = yes, semicolon = no;
    int c, ch, startcol;

    startcol = lexer->in->curcol - 1;
    start = lexer->lexsize - 1;  /* to start at "&" */

    for (;;)
    {
        c = ReadChar(lexer->in);

        if (c == EndOfStream)
            break;

        if (c == ';')
        {
            semicolon = yes;
            break;
        }

        /* '#' is only accepted as the very first char */
        if (first && c == '#')
        {
            first = no;
            AddCharToLexer(lexer, c);
            continue;
        }

        first = no;

        if ((MAP(c) & namechar) == 0)
        {
            /* not part of the name: put it back */
            UngetChar(c, lexer->in);
            break;
        }

        AddCharToLexer(lexer, c);
    }

    /* make sure entity is null terminated */
    lexer->lexbuf[lexer->lexsize] = '\0';

    ch = EntityCode(lexer->lexbuf+start);

    /* deal with unrecognized entities */
    if (ch <= 0)
    {
        /* set error position just before offending character */
        lexer->lines = lexer->in->curline;
        lexer->columns = startcol;

        if (lexer->lexsize > start +1 )
        {
            ReportEntityError(lexer, UNKNOWN_ENTITY, lexer->lexbuf+start, ch);

            if (semicolon)
                AddCharToLexer(lexer, ';');
        }
        else /* naked & */
        {
            ReportEntityError(lexer, UNESCAPED_AMPERSAND, lexer->lexbuf+start, ch);
        }

        return;
    }

    /*
      recognized entity: semicolon is set exactly when the loop
      stopped on ';', so this is the "not terminated by ';'" case
    */
    if (!semicolon)
    {
        /* set error position just before offending character */
        lexer->lines = lexer->in->curline;
        lexer->columns = startcol;
        ReportEntityError(lexer, MISSING_SEMICOLON, lexer->lexbuf+start, c);
    }

    /* drop "&name" and emit the character it stands for */
    lexer->lexsize = start;

    if (ch == 160 && (mode & Preformatted))
        ch = ' ';

    AddCharToLexer(lexer, ch);

    if (ch == '&' && !QuoteAmpersand)
    {
        AddCharToLexer(lexer, 'a');
        AddCharToLexer(lexer, 'm');
        AddCharToLexer(lexer, 'p');
        AddCharToLexer(lexer, ';');
    }
}

/*
  Reads the rest of a tag name whose first char is already in
  lexbuf at txtstart. Unless XmlTags is set, uppercase chars
  (including that first one) are folded to lower case. Sets
  txtend to the end of the name and returns the char that
  terminated it, converted to char.
*/
static char ParseTagName(Lexer *lexer)
{
    int flags;
    uint c;

    /* fold case of first char in buffer */
    c = lexer->lexbuf[lexer->txtstart];
    flags = MAP(c);

    if (!XmlTags && (flags & uppercase) != 0)
    {
        c += 'a' - 'A';
        lexer->lexbuf[lexer->txtstart] = c;
    }

    for (;;)
    {
        c = ReadChar(lexer->in);

        if (c == EndOfStream)
            break;

        flags = MAP(c);

        if ((flags & namechar) == 0)
            break;

        /* fold case of subsequent chars */
        if (!XmlTags && (flags & uppercase) != 0)
            c += 'a' - 'A';

        AddCharToLexer(lexer, c);
    }

    lexer->txtend = lexer->lexsize;
    return c;
}

/*
  Used for elements and text nodes
  element name is null for text nodes
  start and end are offsets into lexbuf
  which contains the textual content of
  all elements in the parse tree.

  parent and content allow traversal
  of the parse tree in any direction.
attributes are represented as a linked  list of AttVal nodes which hold the  strings for attribute/value pairs.*/Node *NewNode(void){    Node *node;    node = (Node *)MemAlloc(sizeof(Node));    node->parent = null;    node->prev = null;    node->next = null;    node->last = null;    node->start = 0;    node->end = 0;    node->type = TextNode;    node->closed = no;    node->implicit = no;    node->tag = null;    node->was = null;    node->element = null;    node->attributes = null;    node->content = null;    return node;}/* used to clone heading nodes when split by an <HR> */Node *CloneNode(Lexer *lexer, Node *element){    Node *node;    node = NewNode();    node->parent = element->parent;    node->start = lexer->lexsize;    node->end = lexer->lexsize;    node->type = element->type;    node->closed = element->closed;    node->implicit = element->implicit;    node->tag = element->tag;    node->element = wstrdup(element->element);    node->attributes = DupAttrs(element->attributes);    return node;}/* free node's attributes */void FreeAttrs(Node *node){    AttVal *av;    while (node->attributes)    {        av = node->attributes;        if (av->attribute)            MemFree(av->attribute);        if (av->value)            MemFree(av->value);        node->attributes = av->next;        MemFree(av);    }}/* doesn't repair attribute list linkage */void FreeAttribute(AttVal *av){    if (av->attribute)        MemFree(av->attribute);    if (av->value)        MemFree(av->value);    MemFree(av);}/*  Free document nodes by iterating through peers and recursing  through children. Set next to null before calling FreeNode()  to avoid freeing peer nodes. Doesn't patch up prev/next links. */void FreeNode(Node *node){    AttVal *av;    Node *next;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -