📄 lexer.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
            // got some characters            ret = getNodeFactory ().createStringNode (                this.getPage (), start, end);        else            ret = null;        return (ret);    }    /**     * Generate a whitespace 'attribute',     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void whitespace (Vector attributes, int[] bookmarks)    {        if (bookmarks[1] > bookmarks[0])            attributes.addElement (new PageAttribute (                mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0));    }    /**     * Generate a standalone attribute -- font.     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void standalone (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0));    }    /**     * Generate an empty attribute -- color=.     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void empty (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0));    }    /**     * Generate an unquoted attribute -- size=1.     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void naked (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[3],            bookmarks[4], (char)0));    }    /**     * Generate an single quoted attribute -- width='100%'.     * @param attributes The list so far.     * @param bookmarks The array of positions.     
*/    private void single_quote (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1,            bookmarks[5], '\''));    }    /**     * Generate an double quoted attribute -- CONTENT="Test Development".     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void double_quote (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1,            bookmarks[6], '"'));    }    /**     * Parse a tag.     * Parse the name and attributes from a start tag.<p>     * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">     * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p>     * <cite>     * 3.2.2 Attributes<p>     * Elements may have associated properties, called attributes, which may     * have values (by default, or set by authors or scripts). Attribute/value     * pairs appear before the final ">" of an element's start tag. Any number     * of (legal) attribute value pairs, separated by spaces, may appear in an     * element's start tag. They may appear in any order.<p>     * In this example, the id attribute is set for an H1 element:     * <code>     * &lt;H1 id="section1"&gt;     * </code>     * This is an identified heading thanks to the id attribute     * <code>     * &lt;/H1&gt;     * </code>     * By default, SGML requires that all attribute values be delimited using     * either double quotation marks (ASCII decimal 34) or single quotation     * marks (ASCII decimal 39). Single quote marks can be included within the     * attribute value when the value is delimited by double quote marks, and     * vice versa. 
Authors may also use numeric character references to
     * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
     * For doublequotes authors can also use the character entity reference
     * &amp;quot;.<p>
     * In certain cases, authors may specify the value of an attribute without
     * any quotation marks. The attribute value may only contain letters
     * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
     * periods (ASCII decimal 46), underscores (ASCII decimal 95),
     * and colons (ASCII decimal 58). We recommend using quotation marks even
     * when it is possible to eliminate them.<p>
     * Attribute names are always case-insensitive.<p>
     * Attribute values are generally case-insensitive. The definition of each
     * attribute in the reference manual indicates whether its value is
     * case-insensitive.<p>
     * All the attributes defined by this specification are listed in the
     * attribute index.<p>
     * </cite>
     * <p>
     * This method uses a state machine with the following states:
     * <ol>
     * <li>state 0 - outside of any attribute</li>
     * <li>state 1 - within attribute name</li>
     * <li>state 2 - equals hit</li>
     * <li>state 3 - within naked attribute value.</li>
     * <li>state 4 - within single quoted attribute value</li>
     * <li>state 5 - within double quoted attribute value</li>
     * <li>state 6 - whitespace after attribute name, could lead to state 2 (=) or state 0</li>
     * </ol>
     * <p>
     * The starting point for the various components is stored in an array
     * of integers that match the initiation point for the states one-for-one,
     * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1
     * began, etc.
     * On every pass through the loop the cursor position is written to
     * bookmarks[state + 1] before the next character is read, so that slot
     * always holds the position of the character being examined.
     * Attributes are stored in a <code>Vector</code> having
     * one slot for each whitespace or attribute/value pair.
     * The first slot is for attribute name (kind of like a standalone attribute).
     * <p>
     * Scanning proceeds from the current cursor position and stops at end of
     * input or '&gt;' in states 0 to 3, at '&lt;' (which is pushed back, not
     * consumed) in states 0 and 1, and at end of input only in states 4 and 5.
     * @param start The position at which to start scanning. It is only
     * handed on to <code>makeTag</code> as the start of the node.
     * @return The parsed tag, as built by <code>makeTag</code> from
     * <code>start</code> up to the cursor position reached when scanning stops.
     * @exception ParserException If a problem occurs reading from the source.
     */
    protected Node parseTag (int start)
        throws
            ParserException
    {
        boolean done;
        char ch;
        int state;
        int[] bookmarks;
        Vector attributes;

        done = false;
        attributes = new Vector ();
        state = 0;
        // eight slots: states 0..6 each write to bookmarks[state + 1]
        bookmarks = new int[8];
        bookmarks[0] = mCursor.getPosition ();
        while (!done)
        {
            // remember where the character about to be read sits
            bookmarks[state + 1] = mCursor.getPosition ();
            ch = mPage.getCharacter (mCursor);
            switch (state)
            {
                case 0: // outside of any attribute
                    if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
                    {
                        if ('<' == ch)
                        {
                            // don't consume the opening angle
                            mPage.ungetCharacter (mCursor);
                            // refresh the bookmark after pushing the character back
                            bookmarks[state + 1] = mCursor.getPosition ();
                        }
                        // flush any pending whitespace run, then stop
                        whitespace (attributes, bookmarks);
                        done = true;
                    }
                    else if (!Character.isWhitespace (ch))
                    {
                        // first character of a name: flush the whitespace before it
                        whitespace (attributes, bookmarks);
                        state = 1;
                    }
                    break;
                case 1: // within attribute name
                    if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
                    {
                        if ('<' == ch)
                        {
                            // don't consume the opening angle
                            mPage.ungetCharacter (mCursor);
                            // refresh the bookmark after pushing the character back
                            bookmarks[state + 1] = mCursor.getPosition ();
                        }
                        // the tag ended right after a name: it has no value
                        standalone (attributes, bookmarks);
                        done = true;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        // whitespace might be followed by the next attribute or an equals sign
                        // see Bug #891058 Bug in lexer.
                        bookmarks[6] = bookmarks[2]; // remember where the whitespace began; bookmarks[0] is set in state 6 if applicable
                        state = 6;
                    }
                    else if ('=' == ch)
                        state = 2;
                    break;
                case 2: // equals hit
                    if ((Page.EOF == ch) || ('>' == ch))
                    {
                        // name followed by '=' but no value
                        empty (attributes, bookmarks);
                        done = true;
                    }
                    else if ('\'' == ch)
                    {
                        state = 4;
                        // bookmarks[3] holds the position of the opening quote just read
                        bookmarks[4] = bookmarks[3];
                    }
                    else if ('"' == ch)
                    {
                        state = 5;
                        // bookmarks[3] holds the position of the opening quote just read
                        bookmarks[5] = bookmarks[3];
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        // collect whitespace after "=" into the assignment string;
                        // do nothing
                        // see Bug #891058 Bug in lexer.
                    }
                    else
                        state = 3;
                    break;
                case 3: // within naked attribute value
                    if ((Page.EOF == ch) || ('>' == ch))
                    {
                        naked (attributes, bookmarks);
                        done = true;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        naked (attributes, bookmarks);
                        // the whitespace character just read starts the next whitespace run
                        bookmarks[0] = bookmarks[4];
                        state = 0;
                    }
                    break;
                case 4: // within single quoted attribute value
                    if (Page.EOF == ch)
                    {
                        // unterminated value: keep what was read so far
                        single_quote (attributes, bookmarks);
                        done = true; // complain?
                    }
                    else if ('\'' == ch)
                    {
                        single_quote (attributes, bookmarks);
                        // the next whitespace run starts one past the closing quote
                        bookmarks[0] = bookmarks[5] + 1;
                        state = 0;
                    }
                    break;
                case 5: // within double quoted attribute value
                    if (Page.EOF == ch)
                    {
                        // unterminated value: keep what was read so far
                        double_quote (attributes, bookmarks);
                        done = true; // complain?
                    }
                    else if ('"' == ch)
                    {
                        double_quote (attributes, bookmarks);
                        // the next whitespace run starts one past the closing quote
                        bookmarks[0] = bookmarks[6] + 1;
                        state = 0;
                    }
                    break;
                // patch for lexer state correction by
                // Gernot Fricke
                // See Bug # 891058 Bug in lexer.
                case 6: // undecided for state 0 or 2
                        // we have read whitespace after an attribute name
                    if (Page.EOF == ch)
                    {
                        // same as last else clause
                        standalone (attributes, bookmarks);
                        bookmarks[0]=bookmarks[6];
                        mPage.ungetCharacter (mCursor);
                        state=0;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        // proceed
                    }
                    else if ('=' == ch) // the whitespace belonged to the equals sign
                    {
                        bookmarks[2] = bookmarks[6];
                        // bookmarks[7] holds the position of the '=' just read
                        bookmarks[3] = bookmarks[7];
                        state=2;
                    }
                    else
                    {
                        // the whitespace was not ended by an equals sign,
                        // meaning the attribute was a standalone attribute.
                        // now: create the standalone attribute, make the
                        // whitespace run start where the whitespace began,
                        // push back the character just read
                        // and restart scanning in state 0.
                        standalone (attributes, bookmarks);
                        bookmarks[0]=bookmarks[6];
                        mPage.ungetCharacter (mCursor);
                        state=0;
                    }
                    break;
                default:
                    // state is only ever assigned 0..6 in this method, so this
                    // branch guards against future edits.
                    // NOTE(review): the message wording is unprofessional -- consider rewording
                    throw new IllegalStateException ("how the fuck did we get in state " + state);
            }
        }

        return (makeTag (start, mCursor.getPosition (), attributes));
    }

    /**
     * Create a tag node based on the current cursor and the one provided.
     * @param start The starting point of the node.
     * @param end The ending point of the node.
     * @param attributes The attributes parsed from the tag.
* @exception ParserException If the nodefactory creation of the tag node fails.     * @return The new Tag node.     */    protected Node makeTag (int start, int end, Vector attributes)        throws            ParserException    {        int length;        Node ret;        length = end - start;        if (0 != length)        {   // return tag based on second character, '/', '%', Letter (ch), '!'            if (2 > length)                // this is an error                return (makeString (start, end));            ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes);        }        else            ret = null;        return (ret);    }    /**     * Parse a comment.     * Parse a remark markup.<p>     * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4">     * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4<p>     * <cite>     * 3.2.4 Comments<p>     * HTML comments have the following syntax:<p>     * <code>     * &lt;!-- this is a comment --&gt;<p>     * &lt;!-- and so is this one,<p>     *     which occupies more than one line --&gt;<p>     * </code>     * White space is not permitted between the markup declaration     * open delimiter("&lt;!") and the comment open delimiter ("--"),     * but is permitted between the comment close delimiter ("--") and     * the markup declaration close delimiter ("&gt;").     * A common error is to include a string of hyphens ("---") within a comment.     * Authors should avoid putting two or more adjacent hyphens inside comments.     * Information that appears between comments has no special meaning     * (e.g., character references are not interpreted as such).     
* Note that comments are markup.<p>     * </cite>     * <p>     * This method uses a state machine with the following states:     * <ol>     * <li>state 0 - prior to the first open delimiter (first dash)</li>     * <li>state 1 - prior to the second open delimiter (second dash)</li>     * <li>state 2 - prior to the first closing delimiter (first dash)</li>     * <li>state 3 - prior to the second closing delimiter (second dash)</li>     * <li>state 4 - prior to the terminating &gt;</li>     * </ol>     * <p>     * All comment text (everything excluding the &lt; and &gt;), is included     * in the remark text.     * We allow terminators like --!&gt; even though this isn't part of the spec.     * @param start The position at which to start scanning.     * @param quotesmart If <code>true</code>, strings ignore quoted contents.     * @return The parsed node.     * @exception ParserException If a problem occurs reading from the source.     */    protected Node parseRemark (int start, boolean quotesmart)        throws            ParserException    {        boolean done;        char ch;        int state;        done = false;        state = 0;        while (!done)        {            ch = mPage.getCharacter (mCursor);            if (Page.EOF == ch)                done = true;            else                switch (state)                {                    case 0: // prior to the first open delimiter                        if ('>' == ch)                            done = true;                        if ('-' == ch)                            state = 1;                        else                            return (parseString (start, quotesmart));                        break;                    case 1: // prior to the second open delimiter                        if ('-' == ch)                        {                            // handle <!--> because netscape does                            ch = mPage.getCharacter (mCursor);                            if (Page.EOF == ch)    
                            done = true;                            else if ('>' == ch)                                done = true;                            else                            {                                mPage.ungetCharacter (mCursor);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -