📄 lexer.java

📁 windows 代码
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/*
 * @(#)Lexer.java   1.11 2000/08/16
 *
 */

package org.w3c.tidy;

/**
 *
 * Lexer for html parser
 *
 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
 * See Tidy.java for the copyright notice.
 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
 * HTML Tidy Release 4 Aug 2000</a>
 *
 * @author  Dave Raggett <dsr@w3.org>
 * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
 * @version 1.0, 1999/05/22
 * @version 1.0.1, 1999/05/29
 * @version 1.1, 1999/06/18 Java Bean
 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
 * @version 1.4, 1999/09/04 DOM support
 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
 */

/*
  Given a file stream fp it returns a sequence of tokens.

     GetToken(fp) gets the next token
     UngetToken(fp) provides one level undo

  The tags include an attribute list:

    - linked list of attribute/value nodes
    - each node has 2 null-terminated strings.
    - entities are replaced in attribute values

  white space is compacted if not in preformatted mode
  If not in preformatted mode then leading white space
  is discarded and subsequent white space sequences
  compacted to single space chars.

  If XmlTags is no then Tag names are folded to upper
  case and attribute names to lower case.

 Not yet done:
    -   Doctype subset and marked sections
*/

import java.io.PrintWriter;
import java.util.Stack;
import java.util.Vector;

public class Lexer {


    /*
      Lexer state. Apart from errout and excludeBlocks, every field
      below is given its starting value by the constructor.
    */
    public StreamIn in;   /* input stream the lexer reads characters from */
    public PrintWriter errout;   /* error output stream */
    public short badAccess; /* for accessibility errors */
    public short badLayout; /* for bad style errors */
    public short badChars;  /* for bad char encodings */
    public short badForm;   /* for mismatched/mispositioned form tags */
    public short warnings;  /* count of warnings in this document */
    public short errors;    /* count of errors */
    public int   lines;     /* lines seen (starts at 1) */
    public int   columns;   /* column at start of current token (starts at 1) */
    public boolean waswhite;  /* used to collapse contiguous white space */
    public boolean pushed;    /* true after token has been pushed back */
    public boolean insertspace;   /* when space is moved after end tag */
    public boolean excludeBlocks;  /* Netscape compatibility */
    public boolean exiled;    /* true if moved out of table */
    public boolean isvoyager; /* true if xmlns attribute on html element */
    public short versions;  /* bit vector of HTML versions */
    public int doctype;    /* version as given by doctype (if any) */
    public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
    public int txtstart;  /* start of current node */
    public int txtend;    /* end of current node */
    public short state;     /* state of lexer's finite state machine */
    public Node token;      /* null until set; presumably the current token -- confirm against the parser */

    /* 
      lexer character buffer

      parse tree nodes span onto this buffer
      which contains the concatenated text
      contents of all of the elements.

      The buffer is replaced by a larger array when it fills up
      (see addByte), so nodes must not cache a stale reference.

     lexsize must be reset for each file.
    */
    public byte[] lexbuf;   /* byte buffer of UTF-8 chars */
    public int lexlength;   /* number of bytes allocated in lexbuf */
    public int lexsize;     /* number of bytes of lexbuf in use */

    /* Inline stack for compatibility with Mosaic */
    public Node inode;        /* for deferring text node */
    public int insert;        /* for inferring inline tags (starts at -1) */
    public Stack istack;
    public int istackbase;    /* start of frame */

    public Style styles;      /* used for cleaning up presentation markup */

    public Configuration configuration;
    protected int seenBodyEndTag; /* used by parser */
    /* nodes created or cloned through this lexer; scanned by
       updateNodeTextArrays when lexbuf is reallocated */
    private Vector nodeList;

    /**
     * Creates a lexer that reads from the given stream.
     *
     * The lexer starts in the LEX_CONTENT state at line 1, column 1, with
     * all counters at zero, no character buffer allocated and an empty
     * inline stack.
     *
     * @param in            stream the lexer reads its characters from
     * @param configuration configuration stored on the lexer
     */
    public Lexer(StreamIn in, Configuration configuration)
    {
        /* collaborators supplied by the caller */
        this.in = in;
        this.configuration = configuration;

        /* finite state machine and position tracking */
        state = LEX_CONTENT;
        lines = 1;
        columns = 1;

        /* diagnostic counters */
        badAccess = badLayout = badChars = badForm = 0;
        warnings = errors = 0;

        /* flags */
        waswhite = pushed = insertspace = false;
        exiled = isvoyager = badDoctype = false;

        /* document version information */
        versions = Dict.VERS_EVERYTHING;
        doctype = Dict.VERS_UNKNOWN;

        /* current token and its text span */
        token = null;
        txtstart = 0;
        txtend = 0;

        /* character buffer is allocated lazily on the first stored byte */
        lexbuf = null;
        lexlength = 0;
        lexsize = 0;

        /* inline stack */
        inode = null;
        insert = -1;
        istack = new Stack();
        istackbase = 0;

        styles = null;
        seenBodyEndTag = 0;
        nodeList = new Vector();
    }

    /**
     * Creates an empty node and records it in this lexer's node list.
     *
     * @return the newly created node
     */
    public Node newNode()
    {
        final Node created = new Node();
        this.nodeList.addElement(created);
        return created;
    }

    /**
     * Creates a node from the given type and text span and records it in
     * this lexer's node list.
     *
     * @param type      node type passed to the Node constructor
     * @param textarray byte array passed to the Node constructor
     * @param start     start position passed to the Node constructor
     * @param end       end position passed to the Node constructor
     * @return the newly created node
     */
    public Node newNode(short type, byte[] textarray, int start, int end)
    {
        final Node created = new Node(type, textarray, start, end);
        this.nodeList.addElement(created);
        return created;
    }

    /**
     * Creates a node from the given type, text span and element name and
     * records it in this lexer's node list.
     *
     * @param type      node type passed to the Node constructor
     * @param textarray byte array passed to the Node constructor
     * @param start     start position passed to the Node constructor
     * @param end       end position passed to the Node constructor
     * @param element   element name passed to the Node constructor
     * @return the newly created node
     */
    public Node newNode(short type, byte[] textarray, int start, int end, String element)
    {
        final Node created = new Node(type, textarray, start, end, element);
        this.nodeList.addElement(created);
        return created;
    }

    /**
     * Clones a node and records the copy in this lexer's node list.
     *
     * Any non-null asp or php node hanging off the copy's attributes is
     * recorded in the node list as well.
     *
     * @param node the node to clone; must not be null
     * @return the clone
     */
    public Node cloneNode(Node node)
    {
        final Node copy = (Node) node.clone();
        this.nodeList.addElement(copy);

        AttVal attr = copy.attributes;
        while (attr != null)
        {
            if (attr.asp != null)
            {
                this.nodeList.addElement(attr.asp);
            }
            if (attr.php != null)
            {
                this.nodeList.addElement(attr.php);
            }
            attr = attr.next;
        }
        return copy;
    }

    /**
     * Clones an attribute list and records any asp or php node attached to
     * the cloned attributes in this lexer's node list.
     *
     * @param attrs head of the attribute list to clone; may be null
     * @return the head of the cloned list, or null when attrs is null
     */
    public AttVal cloneAttributes(AttVal attrs)
    {
        /* an element without attributes has nothing to clone */
        if (attrs == null)
            return null;

        AttVal cattrs = (AttVal)attrs.clone();
        for (AttVal att = cattrs; att != null; att = att.next) {
            if (att.asp != null)
                nodeList.addElement(att.asp);
            if (att.php != null)
                nodeList.addElement(att.php);
        }
        return cattrs;
    }

    /**
     * Re-points every recorded node that still refers to the old text
     * array at the new one. Nodes referring to any other array are left
     * alone.
     *
     * @param oldtextarray array being replaced (compared by identity)
     * @param newtextarray array that takes its place
     */
    protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
    {
        final int count = nodeList.size();
        for (int index = 0; index < count; index++)
        {
            final Node candidate = (Node) nodeList.elementAt(index);
            if (candidate.textarray != oldtextarray)
            {
                continue;
            }
            candidate.textarray = newtextarray;
        }
    }

    /**
     * Creates a node whose text is a single newline appended to the lexer
     * buffer. Used for creating preformatted text from Word2000.
     *
     * @return a node spanning the newline just added to the lexer buffer
     */
    public Node newLineNode()
    {
        Node node = newNode();

        node.start = this.lexsize;
        addCharToLexer((int)'\n');
        /* Capture the buffer only after the append: the append may allocate
           the buffer for the first time, in which case no existing node
           references are rewritten and a reference taken earlier would
           stay null. */
        node.textarray = this.lexbuf;
        node.end = this.lexsize;
        return node;
    }

    /**
     * Encodes a string as UTF-8 bytes.
     *
     * It should always be possible to convert to and from UTF-8, so an
     * encoding exception is converted to an Error to avoid adding throws
     * declarations to lots of methods.
     *
     * @param str the string to encode; must not be null
     * @return the UTF-8 encoding of str
     */
    public static byte[] getBytes(String str) {
        byte[] encoded;
        try {
            encoded = str.getBytes("UTF8");
        } catch (java.io.UnsupportedEncodingException e) {
            final String reason = e.getMessage();
            throw new Error("string to UTF-8 conversion failed: " + reason);
        }
        return encoded;
    }

    /**
     * Decodes a range of UTF-8 bytes into a string.
     *
     * An encoding exception is converted to an Error so that callers do
     * not need a throws declaration.
     *
     * @param bytes  array holding the UTF-8 data
     * @param offset index of the first byte to decode
     * @param length number of bytes to decode
     * @return the decoded string
     */
    public static String getString(byte[] bytes, int offset, int length) {
        String decoded;
        try {
            decoded = new String(bytes, offset, length, "UTF8");
        } catch (java.io.UnsupportedEncodingException e) {
            final String reason = e.getMessage();
            throw new Error("UTF-8 to string conversion failed: " + reason);
        }
        return decoded;
    }

    /**
     * Reports whether the input stream has reached its end.
     *
     * @return whatever the input stream's isEndOfStream() reports
     */
    public boolean endOfInput()
    {
        final boolean exhausted = in.isEndOfStream();
        return exhausted;
    }

    /**
     * Appends one byte to the lexer buffer, growing the buffer first when
     * there is no room for the byte plus a trailing zero byte.
     *
     * The buffer starts at 8192 bytes and doubles on each growth step.
     * When an existing buffer is replaced, its contents are copied over
     * and every recorded node that pointed at the old array is re-pointed
     * at the new one.
     *
     * @param c value to store; only the low 8 bits are kept
     */
    public void addByte(int c)
    {
        final int needed = this.lexsize + 1;
        if (needed >= this.lexlength)
        {
            /* work out the new capacity: 8192 to begin with, then doubling */
            int capacity = this.lexlength;
            do
            {
                capacity = (capacity == 0) ? 8192 : capacity * 2;
            }
            while (needed >= capacity);
            this.lexlength = capacity;

            final byte[] previous = this.lexbuf;
            this.lexbuf = new byte[ capacity ];
            if (previous != null)
            {
                System.arraycopy( previous, 0, this.lexbuf, 0, previous.length );
                updateNodeTextArrays(previous, this.lexbuf);
            }
        }

        this.lexbuf[this.lexsize] = (byte)c;
        this.lexsize++;
        this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
    }

    /**
     * Overwrites the last byte stored in the lexer buffer. Does nothing
     * when the buffer holds no bytes.
     *
     * @param c replacement byte
     */
    public void changeChar(byte c)
    {
        if (this.lexsize <= 0)
        {
            return;
        }
        final int last = this.lexsize - 1;
        this.lexbuf[last] = c;
    }

    /**
     * Stores char c in the lexer buffer as a UTF-8 encoded byte stream.
     *
     * Values below 128 are stored as a single byte. Larger values are
     * stored as a lead byte followed by one to four continuation bytes of
     * six bits each; values above 0x1FFFFF use the five byte form with a
     * 0xF8 lead byte.
     *
     * @param c the character value to store
     */
    public void addCharToLexer(int c)
    {
        if (c < 128)
        {
            addByte(c);
            return;
        }

        /* choose the lead byte marker and the number of continuation bytes */
        final int marker;
        final int trailing;
        if (c <= 0x7FF)
        {
            marker = 0xC0;
            trailing = 1;
        }
        else if (c <= 0xFFFF)
        {
            marker = 0xE0;
            trailing = 2;
        }
        else if (c <= 0x1FFFFF)
        {
            marker = 0xF0;
            trailing = 3;
        }
        else
        {
            marker = 0xF8;
            trailing = 4;
        }

        /* lead byte carries the bits above all continuation bytes */
        addByte(marker | (c >> (6 * trailing)));

        /* continuation bytes, most significant six-bit group first */
        for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
        {
            addByte(0x80 | ((c >> shift) & 0x3F));
        }
    }

    /**
     * Appends a string to the lexer buffer as UTF-8.
     *
     * A high surrogate immediately followed by a low surrogate is combined
     * into a single code point before it is stored, so characters outside
     * the Basic Multilingual Plane are written as one four byte sequence
     * rather than as two separately encoded surrogates. An unpaired
     * surrogate is stored as is.
     *
     * @param str the string to append; must not be null
     */
    public void addStringToLexer(String str)
    {
        final int length = str.length();
        for ( int i = 0; i < length; i++ ) {
            int c = (int)str.charAt(i);

            /* high surrogate with a following char: try to form a pair */
            if (c >= 0xD800 && c <= 0xDBFF && i + 1 < length) {
                int low = (int)str.charAt(i + 1);
                if (low >= 0xDC00 && low <= 0xDFFF) {
                    c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                    i++;  /* the low surrogate has been consumed */
                }
            }

            addCharToLexer( c );
        }
    }

    /*
      No longer attempts to insert missing ';' for unknown
      entities unless one was present already, since this
      gives unexpected results.

      For example:   <a href="something.htm?foo&bar&fred">
      was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
      rather than:   <a href="something.htm?foo&amp;bar&amp;fred">

      My thanks to Maurice Buxton for spotting this.
    */
    public void parseEntity(short mode)
    {
        short map;
        int start;
        boolean first = true;
        boolean semicolon = false;
        boolean numeric = false;
        int c, ch, startcol;
        String str;

        start = this.lexsize - 1;  /* to start at "&" */
        startcol = this.in.curcol - 1;

        while (true)
        {
            c = this.in.readChar();
            if (c == StreamIn.EndOfStream) break;
            if (c == ';')
            {
                semicolon = true;
                break;
            }

            if (first && c == '#')
            {
                addCharToLexer(c);
                first = false;
                numeric = true;
                continue;
            }

            first = false;
            map = MAP((char)c);

            /* AQ: Added flag for numeric entities so that numeric entities
               with missing semi-colons are recognized.
               Eg. "&#114e&#112;..." is recognized as "rep"
            */
            if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
            {
                addCharToLexer(c);
                continue;
            }
            if (!numeric && ((map & NAMECHAR) != 0))
            {
                addCharToLexer(c);
                continue;
            }

            /* otherwise put it back */

            this.in.ungetChar(c);
            break;
        }

        str = getString( this.lexbuf, start, this.lexsize - start );
        ch = EntityTable.getDefaultEntityTable().entityCode( str );

        /* deal with unrecognized entities */
        if (ch <= 0)
        {
            /* set error position just before offending chararcter */
            this.lines = this.in.curline;
            this.columns = startcol;

            if (this.lexsize > start +1 )
            {
                Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);

                if (semicolon)
                    addCharToLexer(';');
            }
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -