📄 lexer.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
        throws            ParserException    {        return (parseCDATA (false));    }    /**     * Return CDATA as a text node.     * Slightly less rigid than {@link #parseCDATA()} this method provides for     * parsing CDATA that may contain quoted strings that have embedded     * ETAGO ("&lt;/") delimiters and skips single and multiline comments.     * @param quotesmart If <code>true</code> the strict definition of CDATA is     * extended to allow for single or double quoted ETAGO ("&lt;/") sequences.     * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.     * @see #parseCDATA()     * @exception ParserException If a problem occurs reading from the source.     */    public Node parseCDATA (boolean quotesmart)        throws            ParserException    {        int start;        int state;        boolean done;        char quote;        char ch;        int end;        boolean comment;        start = mCursor.getPosition ();        state = 0;        done = false;        quote = 0;        comment = false;        while (!done)        {            ch = mPage.getCharacter (mCursor);            switch (state)            {                case 0: // prior to ETAGO                    switch (ch)                    {                        case Page.EOF:                            done = true;                            break;                        case '\'':                            if (quotesmart && !comment)                                if (0 == quote)                                    quote = '\''; // enter quoted state                                else if ('\'' == quote)                                    quote = 0; // exit quoted state                            break;                        case '"':                            if (quotesmart && !comment)                                if (0 == quote)                                    quote = '"'; // enter quoted state                                else if ('"' == quote)                                    quote = 0; // exit quoted state                            break;                        case '\\':                            if (quotesmart)                                if (0 != quote)                                {                                    ch = mPage.getCharacter (mCursor); // try to consume escaped character                                    if (Page.EOF == ch)                                        done = true;                                    else if (  (ch != '\\') && (ch != quote))                                        // unconsume char if character was not an escapable char.                                        mPage.ungetCharacter (mCursor);                                }                            break;                        case '/':                            if (quotesmart)                                if (0 == quote)                                {                                    // handle multiline and double slash comments (with a quote)                                    ch = mPage.getCharacter (mCursor);                                    if (Page.EOF == ch)                                        done = true;                                    else if ('/' == ch)                                        comment = true;                                    else if ('*' == ch)                                    {                                        do                                        {                                            do                                                ch = mPage.getCharacter (mCursor);                                            while ((Page.EOF != ch) && ('*' != ch));                                            ch = mPage.getCharacter (mCursor);                                            if (ch == '*')                                                mPage.ungetCharacter (mCursor);                                        }                                        while ((Page.EOF != ch) && ('/' != ch));                                    }                                    else                                        mPage.ungetCharacter (mCursor);                                }                            break;                        case '\n':                            comment = false;                            break;                        case '<':                            if (quotesmart)                            {                                if (0 == quote)                                    state = 1;                            }                            else                                state = 1;                            break;                        default:                            break;                    }                    break;                case 1: // <                    switch (ch)                    {                        case Page.EOF:                            done = true;                            break;                        case '/':                            state = 2;                            break;                        case '!':                            ch = mPage.getCharacter (mCursor);                            if (Page.EOF == ch)                                done = true;                            else if ('-' == ch)                            {                                ch = mPage.getCharacter (mCursor);                                if (Page.EOF == ch)                                    done = true;                                else if ('-' == ch)                                    state = 3;                                else                                    state = 0;                            }                            else                                state = 0;                            break;                        default:                            state = 0;                            break;                    }                    break;                case 2: // </                    comment = false;                    if (Page.EOF == ch)                        done = true;                    else if (Character.isLetter (ch))                    {                        done = true;                        // back up to the start of ETAGO                        mPage.ungetCharacter (mCursor);                        mPage.ungetCharacter (mCursor);                        mPage.ungetCharacter (mCursor);                    }                    else                        state = 0;                    break;                case 3: // <!                    comment = false;                    if (Page.EOF == ch)                        done = true;                    else if ('-' == ch)                    {                        ch = mPage.getCharacter (mCursor);                        if (Page.EOF == ch)                            done = true;                        else if ('-' == ch)                        {                            ch = mPage.getCharacter (mCursor);                            if (Page.EOF == ch)                                done = true;                            else if ('>' == ch)                                state = 0;                            else                            {                                mPage.ungetCharacter (mCursor);                                mPage.ungetCharacter (mCursor);                            }                        }                        else                            mPage.ungetCharacter (mCursor);                    }                    break;                default:                    throw new IllegalStateException ("how the fuck did we get in state " + state);            }        }        end = mCursor.getPosition ();        return (makeString (start, end));    }    //    // NodeFactory interface    //    /**     * Create a new string node.     * @param page The page the node is on.     * @param start The beginning position of the string.     * @param end The ending positiong of the string.     * @return The created Text node.     */    public Text createStringNode (Page page,  int start, int end)    {        return (new TextNode (page, start, end));    }    /**     * Create a new remark node.     * @param page The page the node is on.     * @param start The beginning position of the remark.     * @param end The ending positiong of the remark.     * @return The created Remark node.     */    public Remark createRemarkNode (Page page,  int start, int end)    {        return (new RemarkNode (page, start, end));    }    /**     * Create a new tag node.     * Note that the attributes vector contains at least one element,     * which is the tag name (standalone attribute) at position zero.     * This can be used to decide which type of node to create, or     * gate other processing that may be appropriate.     * @param page The page the node is on.     * @param start The beginning position of the tag.     * @param end The ending positiong of the tag.     * @param attributes The attributes contained in this tag.     * @return The created Tag node.     */    public Tag createTagNode (Page page, int start, int end, Vector attributes)    {        return (new TagNode (page, start, end, attributes));    }    //    // Internal methods    //    /**     * Advance the cursor through a JIS escape sequence.     * @param cursor A cursor positioned within the escape sequence.     * @exception ParserException If a problem occurs reading from the source.     */    protected void scanJIS (Cursor cursor)        throws            ParserException    {        boolean done;        char ch;        int state;        done = false;        state = 0;        while (!done)        {            ch = mPage.getCharacter (cursor);            if (Page.EOF == ch)                done = true;            else                switch (state)                {                    case 0:                        if (0x1b == ch) // escape                            state = 1;                        break;                    case 1:                        if ('(' == ch)                            state = 2;                        else                            state = 0;                        break;                    case 2:                        if ('J' == ch)                            done = true;                        else                            state = 0;                        break;                    default:                        throw new IllegalStateException ("state " + state);                }        }    }    /**     * Parse a string node.     * Scan characters until "&lt;/", "&lt;%", "&lt;!" or &lt; followed by a     * letter is encountered, or the input stream is exhausted, in which     * case <code>null</code> is returned.     * @param start The position at which to start scanning.     * @param quotesmart If <code>true</code>, strings ignore quoted contents.     * @return The parsed node.     * @exception ParserException If a problem occurs reading from the source.     */    protected Node parseString (int start, boolean quotesmart)        throws            ParserException    {        boolean done;        char ch;        char quote;        done = false;        quote = 0;        while (!done)        {            ch = mPage.getCharacter (mCursor);            if (Page.EOF == ch)                done = true;            else if (0x1b == ch) // escape            {                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    done = true;                else if ('$' == ch)                {                    ch = mPage.getCharacter (mCursor);                    if (Page.EOF == ch)                        done = true;                    else if ('B' == ch)                        scanJIS (mCursor);                    else                    {                        mPage.ungetCharacter (mCursor);                        mPage.ungetCharacter (mCursor);                    }                }                else                    mPage.ungetCharacter (mCursor);            }            else if (quotesmart && (0 == quote)                && (('\'' == ch) || ('"' == ch)))                quote = ch; // enter quoted state            // patch from Gernot Fricke to handle escaped closing quote            else if (quotesmart && (0 != quote) && ('\\' == ch))            {                ch = mPage.getCharacter (mCursor); // try to consume escape                if ((Page.EOF != ch)                    && ('\\' != ch) // escaped backslash                    && (ch != quote)) // escaped quote character                       // ( reflects ["] or [']  whichever opened the quotation)                    mPage.ungetCharacter (mCursor); // unconsume char if char not an escape            }            else if (quotesmart && (ch == quote))                quote = 0; // exit quoted state            else if (quotesmart && (0 == quote) && (ch == '/'))            {                // handle multiline and double slash comments (with a quote)                // in script like:                // I can't handle single quotations.                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    done = true;                else if ('/' == ch)                {                    do                        ch = mPage.getCharacter (mCursor);                    while ((Page.EOF != ch) && ('\n' != ch));                }                else if ('*' == ch)                {                    do                    {                        do                            ch = mPage.getCharacter (mCursor);                        while ((Page.EOF != ch) && ('*' != ch));                        ch = mPage.getCharacter (mCursor);                        if (ch == '*')                            mPage.ungetCharacter (mCursor);                    }                    while ((Page.EOF != ch) && ('/' != ch));                }                else                    mPage.ungetCharacter (mCursor);            }            else if ((0 == quote) && ('<' == ch))            {                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    done = true;                // the order of these tests might be optimized for speed:                else if ('/' == ch || Character.isLetter (ch)                    || '!' == ch || '%' == ch || '?' == ch)                {                    done = true;                    mPage.ungetCharacter (mCursor);                    mPage.ungetCharacter (mCursor);                }                else                {                    // it's not a tag, so keep going, but check for quotes                    mPage.ungetCharacter (mCursor);                }            }        }        return (makeString (start, mCursor.getPosition ()));    }    /**     * Create a string node based on the current cursor and the one provided.     * @param start The starting point of the node.     * @param end The ending point of the node.     * @exception ParserException If the nodefactory creation of the text     * node fails.     * @return The new Text node.     */    protected Node makeString (int start, int end)        throws            ParserException    {        int length;        Node ret;        length = end - start;        if (0 != length)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -