📄 lexer.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
            ch = mPage.getCharacter (mCursor);            if (Page.EOF == ch)                done = true;            else if (0x1b == ch) // escape            {                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    done = true;                else if ('$' == ch)                {                    ch = mPage.getCharacter (mCursor);                    if (Page.EOF == ch)                        done = true;                    else if ('B' == ch)                        scanJIS (mCursor);                    else                    {                        mCursor.retreat ();                        mCursor.retreat ();                    }                }                else                    mCursor.retreat ();            }            else if (quotesmart && (0 == quote)                && (('\'' == ch) || ('"' == ch)))                quote = ch; // enter quoted state            // patch from Gernot Fricke to handle escaped closing quote            else if (quotesmart && (0 != quote) && ('\\' == ch))            {                ch = mPage.getCharacter (mCursor); // try to consume escape                if ((Page.EOF != ch)                    && ('\\' != ch) // escaped backslash                    && (ch != quote)) // escaped quote character                       // ( reflects ["] or [']  whichever opened the quotation)                    mCursor.retreat(); // unconsume char if char not an escape            }            else if (quotesmart && (ch == quote))                quote = 0; // exit quoted state            else if (quotesmart && (0 == quote) && (ch == '/'))            {                // handle multiline and double slash comments (with a quote)                // in script like:                // I can't handle single quotations.                
ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    done = true;                else if ('/' == ch)                {                    do                        ch = mPage.getCharacter (mCursor);                    while ((Page.EOF != ch) && ('\n' != ch));                }                else if ('*' == ch)                {                    do                    {                        do                            ch = mPage.getCharacter (mCursor);                        while ((Page.EOF != ch) && ('*' != ch));                        ch = mPage.getCharacter (mCursor);                        if (ch == '*')                            mCursor.retreat ();                    }                    while ((Page.EOF != ch) && ('/' != ch));                }                else                    mCursor.retreat ();            }            else if ((0 == quote) && ('<' == ch))            {                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    done = true;                // the order of these tests might be optimized for speed:                else if ('/' == ch || Character.isLetter (ch)                    || '!' == ch || '%' == ch || '?' == ch)                {                    done = true;                    mCursor.retreat ();                    mCursor.retreat ();                }                else                {                    // it's not a tag, so keep going, but check for quotes                    mCursor.retreat ();                }            }        }        return (makeString (start, mCursor.getPosition ()));    }    /**     * Create a string node based on the current cursor and the one provided.     * @param start The starting point of the node.     * @param end The ending point of the node.     * @exception ParserException If the nodefactory creation of the text     * node fails.     * @return The new Text node.     
*/    protected Node makeString (int start, int end)        throws            ParserException    {        int length;        Node ret;        length = end - start;        if (0 != length)            // got some characters            ret = getNodeFactory ().createStringNode (                this.getPage (), start, end);        else            ret = null;        return (ret);    }    /**     * Generate a whitespace 'attribute',     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void whitespace (Vector attributes, int[] bookmarks)    {        if (bookmarks[1] > bookmarks[0])            attributes.addElement (new PageAttribute (                mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0));    }    /**     * Generate a standalone attribute -- font.     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void standalone (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0));    }    /**     * Generate an empty attribute -- color=.     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void empty (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0));    }    /**     * Generate an unquoted attribute -- size=1.     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void naked (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[3],            bookmarks[4], (char)0));    }    /**     * Generate an single quoted attribute -- width='100%'.     * @param attributes The list so far.     * @param bookmarks The array of positions.     
*/    private void single_quote (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1,            bookmarks[5], '\''));    }    /**     * Generate an double quoted attribute -- CONTENT="Test Development".     * @param attributes The list so far.     * @param bookmarks The array of positions.     */    private void double_quote (Vector attributes, int[] bookmarks)    {        attributes.addElement (new PageAttribute (            mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1,            bookmarks[6], '"'));    }    /**     * Parse a tag.     * Parse the name and attributes from a start tag.<p>     * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">     * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p>     * <cite>     * 3.2.2 Attributes<p>     * Elements may have associated properties, called attributes, which may     * have values (by default, or set by authors or scripts). Attribute/value     * pairs appear before the final ">" of an element's start tag. Any number     * of (legal) attribute value pairs, separated by spaces, may appear in an     * element's start tag. They may appear in any order.<p>     * In this example, the id attribute is set for an H1 element:     * <code>     * &lt;H1 id="section1"&gt;     * </code>     * This is an identified heading thanks to the id attribute     * <code>     * &lt;/H1&gt;     * </code>     * By default, SGML requires that all attribute values be delimited using     * either double quotation marks (ASCII decimal 34) or single quotation     * marks (ASCII decimal 39). Single quote marks can be included within the     * attribute value when the value is delimited by double quote marks, and     * vice versa. 
Authors may also use numeric character references to
     * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
     * For double quotes authors can also use the character entity reference
     * &amp;quot;.<p>
     * In certain cases, authors may specify the value of an attribute without
     * any quotation marks. The attribute value may only contain letters
     * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
     * periods (ASCII decimal 46), underscores (ASCII decimal 95),
     * and colons (ASCII decimal 58). We recommend using quotation marks even
     * when it is possible to eliminate them.<p>
     * Attribute names are always case-insensitive.<p>
     * Attribute values are generally case-insensitive. The definition of each
     * attribute in the reference manual indicates whether its value is
     * case-insensitive.<p>
     * All the attributes defined by this specification are listed in the
     * attribute index.<p>
     * </cite>
     * <p>
     * This method uses a state machine with the following states:
     * <ol>
     * <li>state 0 - outside of any attribute</li>
     * <li>state 1 - within attribute name</li>
     * <li>state 2 - equals hit</li>
     * <li>state 3 - within naked attribute value.</li>
     * <li>state 4 - within single quoted attribute value</li>
     * <li>state 5 - within double quoted attribute value</li>
     * <li>state 6 - whitespace after attribute name could lead to state 2 (=) or state 0</li>
     * </ol>
     * <p>
     * The starting point for the various components is stored in an array
     * of integers that match the initiation point for the states one-for-one,
     * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1
     * began, etc.
     * Attributes are stored in a <code>Vector</code> having
     * one slot for each whitespace or attribute/value pair.
     * The first slot is for attribute name (kind of like a standalone attribute).
     * @param start The position at which to start scanning.
* @return The parsed tag.     * @exception ParserException If a problem occurs reading from the source.     */    protected Node parseTag (int start)        throws            ParserException    {        boolean done;        char ch;        int state;        int[] bookmarks;        Vector attributes;        done = false;        attributes = new Vector ();        state = 0;        bookmarks = new int[8];        bookmarks[0] = mCursor.getPosition ();        while (!done)        {            bookmarks[state + 1] = mCursor.getPosition ();            ch = mPage.getCharacter (mCursor);            switch (state)            {                case 0: // outside of any attribute                    if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))                    {                        if ('<' == ch)                        {                            // don't consume the opening angle                            mCursor.retreat ();                            bookmarks[state + 1] = mCursor.getPosition ();                        }                        whitespace (attributes, bookmarks);                        done = true;                    }                    else if (!Character.isWhitespace (ch))                    {                        whitespace (attributes, bookmarks);                        state = 1;                    }                    break;                case 1: // within attribute name                    if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))                    {                        if ('<' == ch)                        {                            // don't consume the opening angle                            mCursor.retreat ();                            bookmarks[state + 1] = mCursor.getPosition ();                        }                        standalone (attributes, bookmarks);                        done = true;                    }                    else if (Character.isWhitespace (ch))                    {                      
  // whitespaces might be followed by next attribute or an equal sign                        // see Bug #891058 Bug in lexer.                        bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable                        state = 6;                    }                    else if ('=' == ch)                        state = 2;                    break;                case 2: // equals hit                    if ((Page.EOF == ch) || ('>' == ch))                    {                        empty (attributes, bookmarks);                        done = true;                    }                    else if ('\'' == ch)                    {                        state = 4;                        bookmarks[4] = bookmarks[3];                    }                    else if ('"' == ch)                    {                        state = 5;                        bookmarks[5] = bookmarks[3];                    }                    else if (Character.isWhitespace (ch))                    {                         // collect white spaces after "=" into the assignment string;                        // do nothing                        // see Bug #891058 Bug in lexer.                    
}                    else                        state = 3;                    break;                case 3: // within naked attribute value                    if ((Page.EOF == ch) || ('>' == ch))                    {                        naked (attributes, bookmarks);                        done = true;                    }                    else if (Character.isWhitespace (ch))                    {                        naked (attributes, bookmarks);                        bookmarks[0] = bookmarks[4];                        state = 0;                    }                    break;                case 4: // within single quoted attribute value                    if (Page.EOF == ch)                    {                        single_quote (attributes, bookmarks);                        done = true; // complain?                    }                    else if ('\'' == ch)                    {                        single_quote (attributes, bookmarks);                        bookmarks[0] = bookmarks[5] + 1;                        state = 0;                    }                    break;                case 5: // within double quoted attribute value                    if (Page.EOF == ch)                    {                        double_quote (attributes, bookmarks);                        done = true; // complain?                    }                    else if ('"' == ch)                    {                        double_quote (attributes, bookmarks);                        bookmarks[0] = bookmarks[6] + 1;                        state = 0;                    }                    break;                // patch for lexer state correction by                // Gernot Fricke                // See Bug # 891058 Bug in lexer.                
case 6: // undecided for state 0 or 2                        // we have read white spaces after an attributte name                    if (Page.EOF == ch)                    {                        // same as last else clause                        standalone (attributes, bookmarks);                  	    bookmarks[0]=bookmarks[6];                  	    mCursor.retreat();                  	    state=0;                    }                    else if (Character.isWhitespace (ch))                    {                         // proceed                    }                     else if ('=' == ch) // yepp. the white spaces belonged to the equal.                    {                        bookmarks[2] = bookmarks[6];                        bookmarks[3] = bookmarks[7];
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -