⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lexer.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $// $Author: derrickoswald $// $Date: 2006/06/10 15:11:32 $// $Revision: 1.50 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.lexer;import java.io.Serializable;import java.net.MalformedURLException;import java.net.URLConnection;import java.util.Vector;import org.htmlparser.Node;import org.htmlparser.NodeFactory;import org.htmlparser.Remark;import org.htmlparser.Text;import org.htmlparser.Tag;import org.htmlparser.http.ConnectionManager;import org.htmlparser.nodes.RemarkNode;import org.htmlparser.nodes.TextNode;import org.htmlparser.nodes.TagNode;import org.htmlparser.util.ParserException;/** * This class parses the HTML stream into nodes. * There are three major types of nodes (lexemes): * <ul> * <li>Remark</li> * <li>Text</li> * <li>Tag</li> * </ul> * Each time <code>nextNode()</code> is called, another node is returned until * the stream is exhausted, and <code>null</code> is returned. 
*/
public class Lexer
    implements
        Serializable,
        NodeFactory
{
    // Please don't change the formatting of the version variables below.
    // This is done so as to facilitate ant script processing.

    /**
     * The floating point version number ({@value}).
     */
    public static final double
    VERSION_NUMBER = 1.6
    ;

    /**
     * The type of version ({@value}).
     */
    public static final String
    VERSION_TYPE = "Release Build"
    ;

    /**
     * The date of the version ({@value}).
     */
    public static final String
    VERSION_DATE = "Jun 10, 2006"
    ;

    // End of formatting

    /**
     * The display version ({@value}).
     * Composed from the version number, type and date constants of this class.
     */
    public static final String VERSION_STRING =
            "" + VERSION_NUMBER
            + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";

    /**
     * Process remarks strictly flag.
     * If <code>true</code>, remarks are not terminated by ---&gt;
     * or --!&gt;, i.e. more than two dashes. If <code>false</code>,
     * a more lax (and closer to typical browser handling) remark parsing
     * is used.
     * Default <code>true</code>.
     */
    public static boolean STRICT_REMARKS = true;

    /**
     * The page lexemes are retrieved from.
     */
    protected Page mPage;

    /**
     * The current position on the page.
     */
    protected Cursor mCursor;

    /**
     * The factory for new nodes.
     */
    protected NodeFactory mFactory;

    /**
     * Line number to trigger on.
     * This is tested on each <code>nextNode()</code> call, as a debugging aid.
     * Alter this value and set a breakpoint on the guarded statement.
     * Remember, these line numbers are zero based, while most editors are
     * one based.
     * The default of <code>-1</code> leaves the debugging check disabled.
     * @see #nextNode
     */
    protected static int mDebugLineTrigger = -1;

    //
    // Static methods
    //

    /**
     * Return the version string of this parser.
* @return A string of the form:     * <pre>     * "[floating point number] ([build-type] [build-date])"     * </pre>     */    public static String getVersion ()    {        return (VERSION_STRING);    }    //    // Constructors    //    /**     * Creates a new instance of a Lexer.     */    public Lexer ()    {        this (new Page (""));    }    /**     * Creates a new instance of a Lexer.     * @param page The page with HTML text.     */    public Lexer (Page page)    {        setPage (page);        setCursor (new Cursor (page, 0));        setNodeFactory (this);    }    /**     * Creates a new instance of a Lexer.     * @param text The text to parse.     */    public Lexer (String text)    {        this (new Page (text));    }    /**     * Creates a new instance of a Lexer.     * @param connection The url to parse.     * @exception ParserException If an error occurs opening the connection.     */    public Lexer (URLConnection connection)        throws            ParserException    {        this (new Page (connection));    }    //    // Bean patterns    //    /**     * Get the page this lexer is working on.     * @return The page that nodes are being read from.     */    public Page getPage ()    {        return (mPage);    }    /**     * Set the page this lexer is working on.     * @param page The page that nodes will be read from.     */    public void setPage (Page page)    {        if (null == page)            throw new IllegalArgumentException ("page cannot be null");        // todo: sanity checks        mPage = page;    }    /**     * Get the current scanning position.     * @return The lexer's cursor position.     */    public Cursor getCursor ()    {        return (mCursor);    }    /**     * Set the current scanning position.     * @param cursor The lexer's new cursor position.     
*/    public void setCursor (Cursor cursor)    {        if (null == cursor)            throw new IllegalArgumentException ("cursor cannot be null");        // todo: sanity checks        mCursor = cursor;    }    /**     * Get the current node factory.     * @return The lexer's node factory.     */    public NodeFactory getNodeFactory ()    {        return (mFactory);    }    /**     * Set the current node factory.     * @param factory The node factory to be used by the lexer.     */    public void setNodeFactory (NodeFactory factory)    {        if (null == factory)            throw new IllegalArgumentException ("node factory cannot be null");        mFactory = factory;    }    /**     * Get the current cursor position.     * @return The current character offset into the source.     */    public int getPosition ()    {        return (getCursor ().getPosition ());    }    /**     * Set the current cursor position.     * @param position The new character offset into the source.     */    public void setPosition (int position)    {        // todo: sanity checks        getCursor ().setPosition (position);    }    /**     * Get the current line number.     * @return The line number the lexer's working on.     */    public int getCurrentLineNumber ()    {        return (getPage ().row (getCursor ()));    }    /**     * Get the current line.     * @return The string the lexer's working on.     */    public String getCurrentLine ()    {        return (getPage ().getLine (getCursor ()));    }    //    // Public methods    //    /**     * Reset the lexer to start parsing from the beginning again.     * The underlying components are reset such that the next call to     * <code>nextNode()</code> will return the first lexeme on the page.     */    public void reset ()    {        getPage ().reset ();        setCursor (new Cursor (getPage (), 0));    }    /**     * Get the next node from the source.     
* @return A Remark, Text or Tag, or <code>null</code> if no     * more lexemes are present.     * @exception ParserException If there is a problem with the     * underlying page.     */    public Node nextNode ()        throws            ParserException    {        return nextNode (false);    }    /**     * Get the next node from the source.     * @param quotesmart If <code>true</code>, strings ignore quoted contents.     * @return A Remark, Text or Tag, or <code>null</code> if no     * more lexemes are present.     * @exception ParserException If there is a problem with the     * underlying page.     */    public Node nextNode (boolean quotesmart)        throws            ParserException    {        int start;        char ch;        Node ret;        // debugging suppport        if (-1 != mDebugLineTrigger)        {            Page page = getPage ();            int lineno = page.row (mCursor);            if (mDebugLineTrigger < lineno)                mDebugLineTrigger = lineno + 1; // trigger on next line too        }        start = mCursor.getPosition ();        ch = mPage.getCharacter (mCursor);        switch (ch)        {            case Page.EOF:                ret = null;                break;            case '<':                ch = mPage.getCharacter (mCursor);                if (Page.EOF == ch)                    ret = makeString (start, mCursor.getPosition ());                else if ('%' == ch)                {                    mPage.ungetCharacter (mCursor);                    ret = parseJsp (start);                }                else if ('?' == ch)                {                    mPage.ungetCharacter (mCursor);                    ret = parsePI (start);                }                else if ('/' == ch || '%' == ch || Character.isLetter (ch))                {                    mPage.ungetCharacter (mCursor);                    ret = parseTag (start);                }                else if ('!' 
== ch)                {                    ch = mPage.getCharacter (mCursor);                    if (Page.EOF == ch)                        ret = makeString (start, mCursor.getPosition ());                    else                    {                        if ('>' == ch) // handle <!>                            ret = makeRemark (start, mCursor.getPosition ());                        else                        {                            mPage.ungetCharacter (mCursor); // remark/tag need this char                            if ('-' == ch)                                ret = parseRemark (start, quotesmart);                            else                            {                                mPage.ungetCharacter (mCursor); // tag needs prior one too                                ret = parseTag (start);                            }                        }                    }                }                else                    ret = parseString (start, quotesmart);                break;            default:                mPage.ungetCharacter (mCursor); // string needs to see leading foreslash                ret = parseString (start, quotesmart);                break;        }        return (ret);    }    /**     * Return CDATA as a text node.     * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">     * B.3.2 Specifying non-HTML data</a> of the     * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br>     * <quote>     * <b>Element content</b><br>     * When script or style data is the content of an element (SCRIPT and STYLE),     * the data begins immediately after the element start tag and ends at the     * first ETAGO ("&lt;/") delimiter followed by a name start character ([a-zA-Z]);     * note that this may not be the element's end tag.     * Authors should therefore escape "&lt;/" within the content. 
Escape mechanisms     * are specific to each scripting or style sheet language.     * </quote>     * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.     * @exception ParserException If a problem occurs reading from the source.     */    public Node parseCDATA ()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -