📄 lexer.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $
// $Author: derrickoswald $
// $Date: 2006/03/19 21:26:32 $
// $Revision: 1.44 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.lexer;

import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URLConnection;
import java.util.Vector;

import org.htmlparser.Node;
import org.htmlparser.NodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Text;
import org.htmlparser.Tag;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.ParserException;

/**
 * This class parses the HTML stream into nodes.
 * There are three major types of nodes (lexemes):
 * <ul>
 * <li>Remark</li>
 * <li>Text</li>
 * <li>Tag</li>
 * </ul>
 * Each time <code>nextNode()</code> is called, another node is returned until
 * the stream is exhausted, and <code>null</code> is returned.
 */
public class Lexer
    implements
        Serializable,
        NodeFactory
{
    /**
     * The page lexemes are retrieved from.
   */    protected Page mPage;    /**     * The current position on the page.     */    protected Cursor mCursor;    /**     * The factory for new nodes.     */    protected NodeFactory mFactory;    /**     * Line number to trigger on.     * This is tested on each <code>nextNode()</code> call, as a debugging aid.     * Alter this value and set a breakpoint on the guarded statement.     * Remember, these line numbers are zero based, while most editors are     * one based.     * @see #nextNode     */    protected static int mDebugLineTrigger = -1;    /**     * Creates a new instance of a Lexer.     */    public Lexer ()    {        this (new Page (""));    }    /**     * Creates a new instance of a Lexer.     * @param page The page with HTML text.     */    public Lexer (Page page)    {        setPage (page);        setCursor (new Cursor (page, 0));        setNodeFactory (this);    }    /**     * Creates a new instance of a Lexer.     * @param text The text to parse.     */    public Lexer (String text)    {        this (new Page (text));    }    /**     * Creates a new instance of a Lexer.     * @param connection The url to parse.     * @exception ParserException If an error occurs opening the connection.     */    public Lexer (URLConnection connection)        throws            ParserException    {        this (new Page (connection));    }    /**     * Reset the lexer to start parsing from the beginning again.     * The underlying components are reset such that the next call to     * <code>nextNode()</code> will return the first lexeme on the page.     */    public void reset ()    {        getPage ().reset ();        setCursor (new Cursor (getPage (), 0));    }    /**     * Get the page this lexer is working on.     * @return The page that nodes are being read from.     */    public Page getPage ()    {        return (mPage);    }    /**     * Set the page this lexer is working on.     * @param page The page that nodes will be read from.     
*/    public void setPage (Page page)    {        if (null == page)            throw new IllegalArgumentException ("page cannot be null");        // todo: sanity checks        mPage = page;    }    /**     * Get the current scanning position.     * @return The lexer's cursor position.     */    public Cursor getCursor ()    {        return (mCursor);    }    /**     * Set the current scanning position.     * @param cursor The lexer's new cursor position.     */    public void setCursor (Cursor cursor)    {        if (null == cursor)            throw new IllegalArgumentException ("cursor cannot be null");        // todo: sanity checks        mCursor = cursor;    }    /**     * Get the current node factory.     * @return The lexer's node factory.     */    public NodeFactory getNodeFactory ()    {        return (mFactory);    }    /**     * Set the current node factory.     * @param factory The node factory to be used by the lexer.     */    public void setNodeFactory (NodeFactory factory)    {        if (null == factory)            throw new IllegalArgumentException ("node factory cannot be null");        mFactory = factory;    }    /**     * Get the current cursor position.     * @return The current character offset into the source.     */    public int getPosition ()    {        return (getCursor ().getPosition ());    }    /**     * Set the current cursor position.     * @param position The new character offset into the source.     */    public void setPosition (int position)    {        // todo: sanity checks        getCursor ().setPosition (position);    }    /**     * Get the current line number.     * @return The line number the lexer's working on.     */    public int getCurrentLineNumber ()    {        return (getPage ().row (getCursor ()));    }    /**     * Get the current line.     * @return The string the lexer's working on.     
*/    public String getCurrentLine ()    {        return (getPage ().getLine (getCursor ()));    }    /**     * Get the next node from the source.     * @return A Remark, Text or Tag, or <code>null</code> if no     * more lexemes are present.     * @exception ParserException If there is a problem with the     * underlying page.     */    public Node nextNode ()        throws            ParserException    {        return nextNode (false);    }    /**     * Get the next node from the source.     * @param quotesmart If <code>true</code>, strings ignore quoted contents.     * @return A Remark, Text or Tag, or <code>null</code> if no     * more lexemes are present.     * @exception ParserException If there is a problem with the     * underlying page.     */    public Node nextNode (boolean quotesmart)        throws            ParserException    {        int start;        char ch;        Node ret;        // debugging suppport        if (-1 != mDebugLineTrigger)        {            Page page = getPage ();            int lineno = page.row (mCursor);            if (mDebugLineTrigger < lineno)                mDebugLineTrigger = lineno + 1; // trigger on next line too        }        start = mCursor.getPosition ();        ch = mPage.getCharacter (mCursor);        switch (ch)        {            case Page.EOF:                ret = null;                break;            case '<':                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    ret = makeString (start, mCursor.getPosition ());                else if ('%' == ch)                {                    mCursor.retreat ();                    ret = parseJsp (start);                }                else if ('?' 
== ch)                {                    mCursor.retreat ();                    ret = parsePI (start);                }                else if ('/' == ch || '%' == ch || Character.isLetter (ch))                {                    mCursor.retreat ();                    ret = parseTag (start);                }                else if ('!' == ch)                {                    ch = mPage.getCharacter (mCursor);                    if (Page.EOF == ch)                        ret = makeString (start, mCursor.getPosition ());                    else                    {                        if ('>' == ch) // handle <!>                            ret = makeRemark (start, mCursor.getPosition ());                        else                        {                            mCursor.retreat (); // remark/tag need this char                            if ('-' == ch)                                ret = parseRemark (start, quotesmart);                            else                            {                                mCursor.retreat (); // tag needs prior one too                                ret = parseTag (start);                            }                        }                    }                }                else                    ret = parseString (start, quotesmart);                break;            default:                mCursor.retreat (); // string needs to see leading foreslash                ret = parseString (start, quotesmart);                break;        }        return (ret);    }    /**     * Advance the cursor through a JIS escape sequence.     * @param cursor A cursor positioned within the escape sequence.     * @exception ParserException If a problem occurs reading from the source.     
*/
    protected void scanJIS (Cursor cursor)
        throws
            ParserException
    {
        boolean done;
        char ch;
        int state;

        // States of the scan for the three character sequence ESC ( J :
        //   0 - looking for ESC (0x1b)
        //   1 - ESC seen, expecting '('
        //   2 - ESC ( seen, expecting 'J'
        // The scan stops after the 'J' or at end of input.
        // NOTE(review): any mismatch returns to state 0, even when the
        // mismatching character is itself ESC - confirm this is intended.
        done = false;
        state = 0;
        while (!done)
        {
            ch = mPage.getCharacter (cursor);
            if (Page.EOF == ch)
                done = true;
            else
                switch (state)
                {
                    case 0:
                        if (0x1b == ch) // escape
                            state = 1;
                        break;
                    case 1:
                        if ('(' == ch)
                            state = 2;
                        else
                            state = 0;
                        break;
                    case 2:
                        if ('J' == ch)
                            done = true;
                        else
                            state = 0;
                        break;
                    default:
                        throw new IllegalStateException ("state " + state);
                }
        }
    }

    /**
     * Parse a string node.
     * Scan characters until "&lt;/", "&lt;%", "&lt;!" or &lt; followed by a
     * letter is encountered, or the input stream is exhausted, in which
     * case <code>null</code> is returned.
     * @param start The position at which to start scanning.
     * @param quotesmart If <code>true</code>, strings ignore quoted contents.
     * @return The parsed node.
     * @exception ParserException If a problem occurs reading from the source.
     */
    protected Node parseString (int start, boolean quotesmart)
        throws
            ParserException
    {
        boolean done;
        char ch;
        char quote;

        done = false;
        quote = 0;
        while (!done)
        {
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -