parser.java

来自「html 解析处理代码」· Java 代码 · 共 857 行 · 第 1/2 页
JAVA
857 行
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $// $Author: derrickoswald $// $Date: 2006/06/10 15:11:32 $// $Revision: 1.120 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser;import java.io.Serializable;import java.net.HttpURLConnection;import java.net.URLConnection;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.http.ConnectionManager;import org.htmlparser.http.ConnectionMonitor;import org.htmlparser.http.HttpHeader;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.util.DefaultParserFeedback;import org.htmlparser.util.IteratorImpl;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.util.ParserFeedback;import org.htmlparser.visitors.NodeVisitor;/** * The main parser class. * This is the primary class of the HTML Parser library. It provides * constructors that take a {@link #Parser(String) String}, * a {@link #Parser(URLConnection) URLConnection}, or a * {@link #Parser(Lexer) Lexer}.  
In the case of a String,
 * a check is made to see if the first non-whitespace character is a &lt;, in
 * which case it is assumed to be HTML. Otherwise an
 * attempt is made to open it as a URL, and if that fails it assumes it is a
 * local disk file. If you want to parse a String after using the
 * {@link #Parser() no-args} constructor, use
 * {@link #setInputHTML setInputHTML()}, or you can use {@link #createParser}.
 * <p>The Parser provides access to the contents of the
 * page, via a {@link #elements() NodeIterator}, a
 * {@link #parse(NodeFilter) NodeList} or a
 * {@link #visitAllNodesWith NodeVisitor}.
 * <p>Typical usage of the parser is:
 * <code>
 * <pre>
 * Parser parser = new Parser ("http://whatever");
 * NodeList list = parser.parse (null);
 * // do something with your list of nodes.
 * </pre>
 * </code></p>
 * <p>What types of nodes exist and what can be done with them depends on the
 * setup, but in general a node can be converted back to HTML and its
 * children (enclosed nodes) and parent can be obtained, because nodes are
 * nested. See the {@link Node} interface.</p>
 * <p>For example, if the URL contains:<br>
 * <code>
 * {@.html
 * <html>
 * <head>
 * <title>Mondays -- What a bad idea.</title>
 * </head>
 * <body BGCOLOR="#FFFFFF">
 * Most people have a pathological hatred of Mondays...
 * </body>
 * </html>}
 * </code><br>
 * and the example code above is used, the list contains only one element, the
 * {@.html <html>} node.  This node is a {@link org.htmlparser.tags tag},
 * which is an object of class
 * {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory}
 * (a {@link PrototypicalNodeFactory}) is used.</p>
 * <p>To get at further content, the children of the top
 * level nodes must be examined. When digging through a node list one must be
 * conscious of the possibility of whitespace between nodes, e.g.
in the example
 * above:
 * <code>
 * <pre>
 * Node node = list.elementAt (0);
 * NodeList sublist = node.getChildren ();
 * System.out.println (sublist.size ());
 * </pre>
 * </code>
 * would print out 5, not 2, because there are newlines after {@.html <html>},
 * {@.html </head>} and {@.html </body>} that are children of the HTML node
 * besides the {@.html <head>} and {@.html <body>} nodes.</p>
 * <p>Because processing nodes is so common, two interfaces are provided to
 * ease this task, {@link org.htmlparser.filters filters}
 * and {@link org.htmlparser.visitors visitors}.
 */
public class Parser
    implements
        Serializable,
        ConnectionMonitor
{
    // Please don't change the formatting of the version variables below.
    // This is done so as to facilitate ant script processing.
    // (Each declaration is deliberately split over three lines: modifiers
    // and type, then name and value, then the terminating semicolon.)
    /**
     * The floating point version number ({@value}).
     * Returned unchanged by {@link #getVersionNumber}.
     */
    public static final double
    VERSION_NUMBER = 1.6
    ;

    /**
     * The type of version ({@value}).
     */
    public static final String
    VERSION_TYPE = "Release Build"
    ;

    /**
     * The date of the version ({@value}).
     */
    public static final String
    VERSION_DATE = "Jun 10, 2006"
    ;

    // End of formatting

    /**
     * The display version ({@value}).
     * Composed as <code>number (type date)</code> from
     * {@link #VERSION_NUMBER}, {@link #VERSION_TYPE} and
     * {@link #VERSION_DATE}. The leading empty string makes the first
     * <code>+</code> a string concatenation, converting the
     * <code>double</code> to text.
     */
    public static final String VERSION_STRING =
            "" + VERSION_NUMBER
            + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";

    /**
     * Feedback object.
     * NOTE(review): presumably assigned by <code>setFeedback</code>, which
     * the constructors call -- confirm against that setter.
     */
    protected ParserFeedback mFeedback;

    /**
     * The html lexer associated with this parser.
     * NOTE(review): presumably assigned by <code>setLexer</code> -- confirm.
     */
    protected Lexer mLexer;

    /**
     * A quiet message sink.
     * Use this for no feedback.
     * Created with {@link DefaultParserFeedback#QUIET}; the zero argument
     * constructor passes this sink as its feedback object.
     */
    public static final ParserFeedback DEVNULL =
        new DefaultParserFeedback (DefaultParserFeedback.QUIET);

    /**
     * A verbose message sink.
     * Use this for output on <code>System.out</code>.
*/    public static final ParserFeedback STDOUT = new DefaultParserFeedback ();    static    {        getConnectionManager ().getDefaultRequestProperties ().put (            "User-Agent", "HTMLParser/" + getVersionNumber ());        }    //    // Static methods    //    /**     * Return the version string of this parser.     * @return A string of the form:     * <pre>     * "[floating point number] ([build-type] [build-date])"     * </pre>     */    public static String getVersion ()    {        return (VERSION_STRING);    }    /**     * Return the version number of this parser.     * @return A floating point number, the whole number part is the major     * version, and the fractional part is the minor version.     */    public static double getVersionNumber ()    {        return (VERSION_NUMBER);    }    /**     * Get the connection manager all Parsers use.     * @return The connection manager.     * @see #setConnectionManager     */    public static ConnectionManager getConnectionManager ()    {        return (Page.getConnectionManager ());    }    /**     * Set the connection manager all Parsers use.     * @param manager The new connection manager.     * @see #getConnectionManager     */    public static void setConnectionManager (ConnectionManager manager)    {        Page.setConnectionManager (manager);    }    /**     * Creates the parser on an input string.     * @param html The string containing HTML.     * @param charset <em>Optional</em>. The character set encoding that will     * be reported by {@link #getEncoding}. If charset is <code>null</code>     * the default character set is used.     * @return A parser with the <code>html</code> string as input.     * @exception IllegalArgumentException if <code>html</code> is <code>null</code>.     
*/    public static Parser createParser (String html, String charset)    {        Parser ret;        if (null == html)            throw new IllegalArgumentException ("html cannot be null");        ret = new Parser (new Lexer (new Page (html, charset)));        return (ret);    }    //    // Constructors    //    /**     * Zero argument constructor.     * The parser is in a safe but useless state parsing an empty string.     * Set the lexer or connection using {@link #setLexer}     * or {@link #setConnection}.     * @see #setLexer(Lexer)     * @see #setConnection(URLConnection)     */    public Parser ()    {        this (new Lexer (new Page ("")), DEVNULL);    }    /**     * Construct a parser using the provided lexer and feedback object.     * This would be used to create a parser for special cases where the     * normal creation of a lexer on a URLConnection needs to be customized.     * @param lexer The lexer to draw characters from.     * @param fb The object to use when information,     * warning and error messages are produced. If <em>null</em> no feedback     * is provided.     */    public Parser (Lexer lexer, ParserFeedback fb)    {        setFeedback (fb);        setLexer (lexer);        setNodeFactory (new PrototypicalNodeFactory ());    }    /**     * Constructor for custom HTTP access.     * This would be used to create a parser for a URLConnection that needs     * a special setup or negotiation conditioning beyond what is available     * from the {@link #getConnectionManager ConnectionManager}.     * @param connection A fully conditioned connection. The connect()     * method will be called so it need not be connected yet.     * @param fb The object to use for message communication.     * @throws ParserException If the creation of the underlying Lexer     * cannot be performed.     
*/
    public Parser (URLConnection connection, ParserFeedback fb)
        throws
            ParserException
    {
        // wrap the connection in a lexer and delegate to the lexer constructor
        this (new Lexer (connection), fb);
    }

    /**
     * Creates a Parser object with the location of the resource (URL or file).
     * You would typically create a {@link DefaultParserFeedback} object and
     * pass it in.
     * A new {@link PrototypicalNodeFactory} is installed as the node factory.
     * @see #Parser(URLConnection,ParserFeedback)
     * @see #setResource(String)
     * @param resource Either a URL, a filename or a string of HTML.
     * The string is considered HTML if the first non-whitespace character
     * is a &lt;. The use of a url or file is autodetected by first attempting
     * to open the resource as a URL, if that fails it is assumed to be a file
     * name.
     * A standard HTTP GET is performed to read the content of the URL.
     * @param feedback The {@link ParserFeedback} object to use when information,
     * warning and error messages are produced. If <em>null</em> no feedback
     * is provided.
     * @throws IllegalArgumentException if <code>resource</code> is
     * <code>null</code> (thrown by {@link #setResource}).
     * @throws ParserException If the URL is invalid or a problem occurs
     * in connecting.
     */
    public Parser (String resource, ParserFeedback feedback)
        throws
            ParserException
    {
        setFeedback (feedback);
        // creates the lexer, from the HTML text or from an opened connection
        setResource (resource);
        // NOTE(review): presumably the factory is attached to the lexer that
        // setResource just created, so this call must stay last -- confirm.
        setNodeFactory (new PrototypicalNodeFactory ());
    }

    /**
     * Creates a Parser object with the location of the resource (URL or file).
     * The {@link #STDOUT} feedback object is used for feedback.
     * @param resource Either HTML, a URL or a filename (autodetects).
     * @throws IllegalArgumentException if <code>resource</code> is
     * <code>null</code>.
     * @throws ParserException If the <code>resource</code> argument does not
     * resolve to a valid page or file.
     * @see #Parser(String,ParserFeedback)
     */
    public Parser (String resource) throws ParserException
    {
        this (resource, STDOUT);
    }

    /**
     * Construct a parser using the provided lexer.
     * A feedback object printing to {@link #STDOUT System.out} is used.
     * This would be used to create a parser for special cases where the
     * normal creation of a lexer on a URLConnection needs to be customized.
* @param lexer The lexer to draw characters from.     */    public Parser (Lexer lexer)    {        this (lexer, STDOUT);    }    /**     * Construct a parser using the provided URLConnection.     * This would be used to create a parser for a URLConnection that needs     * a special setup or negotiation conditioning beyond what is available     * from the {@link #getConnectionManager ConnectionManager}.     * A feedback object printing to {@link #STDOUT System.out} is used.     * @see #Parser(URLConnection,ParserFeedback)     * @param connection A fully conditioned connection. The connect()     * method will be called so it need not be connected yet.     * @throws ParserException If the creation of the underlying Lexer     * cannot be performed.     */    public Parser (URLConnection connection) throws ParserException    {        this (connection, STDOUT);    }    //    // Bean patterns    //    /**     * Set the html, a url, or a file.     * @param resource The resource to use.     * @exception IllegalArgumentException if <code>resource</code> is <code>null</code>.     * @exception ParserException if a problem occurs in connecting.     */    public void setResource (String resource)        throws            ParserException    {        int length;        boolean html;        char ch;        if (null == resource)            throw new IllegalArgumentException ("resource cannot be null");        length = resource.length ();        html = false;        for (int i = 0; i < length; i++)        {            ch = resource.charAt (i);            if (!Character.isWhitespace (ch))            {                if ('<' == ch)                    html = true;                break;            }        }        if (html)            setLexer (new Lexer (new Page (resource)));        else            setLexer (new Lexer (getConnectionManager ().openConnection (resource)));    }    /**     * Set the connection for this parser.     
* This method creates a new <code>Lexer</code> reading from the connection.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @exception IllegalArgumentException if <code>connection</code> is <code>null</code>.
     * @exception ParserException if the character set specified in the
     * HTTP header is not supported, an i/o exception occurs creating the
     * lexer, or a problem occurs in connecting.
     * @see #setLexer
     * @see #getConnection
     */
    public void setConnection (URLConnection connection)
        throws
            ParserException
    {
        // reject null up front, before any lexer is constructed
        if (null == connection)
            throw new IllegalArgumentException ("connection cannot be null");
        setLexer (new Lexer (connection));
    }

    /**
     * Return the current connection.
     * @return The connection either created by the parser or passed into this
     * parser via {@link #setConnection}.
     * @see #setConnection(URLConnection)
     */
    public URLConnection getConnection ()
parser.java - 源码说明

本页面展示了「html 解析处理代码」中的 parser.java 源码文件，采用 Java 编程语言编写，共 857 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与html相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?