📄 parser.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $// $Author: derrickoswald $// $Date: 2006/03/20 00:26:01 $// $Revision: 1.111 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser;import java.io.Serializable;import java.net.HttpURLConnection;import java.net.URLConnection;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.http.ConnectionManager;import org.htmlparser.http.ConnectionMonitor;import org.htmlparser.http.HttpHeader;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.util.DefaultParserFeedback;import org.htmlparser.util.IteratorImpl;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.util.ParserFeedback;import org.htmlparser.visitors.NodeVisitor;/** * The main parser class. * This is the primary class of the HTML Parser library. 
It provides * constructors that take a {@link #Parser(String) String}, * a {@link #Parser(URLConnection) URLConnection}, or a * {@link #Parser(Lexer) Lexer}.  In the case of a String, an * attempt is made to open it as a URL, and if that fails it assumes it is a * local disk file. If you want to actually parse a String, use * {@link #setInputHTML setInputHTML()} after using the * {@link #Parser() no-args} constructor, or use {@link #createParser}. * <p>The Parser provides access to the contents of the * page, via a {@link #elements() NodeIterator}, a * {@link #parse(NodeFilter) NodeList} or a * {@link #visitAllNodesWith NodeVisitor}. * <p>Typical usage of the parser is: * <code> * <pre> * Parser parser = new Parser ("http://whatever"); * NodeList list = parser.parse (); * // do something with your list of nodes. * </pre> * </code></p> * <p>What types of nodes and what can be done with them is dependent on the * setup, but in general a node can be converted back to HTML and its * children (enclosed nodes) and parent can be obtained, because nodes are * nested. See the {@link Node} interface.</p> * <p>For example, if the URL contains:<br> * <code> * {@.html * <html> * <head> * <title>Mondays -- What a bad idea.</title> * </head> * <body BGCOLOR="#FFFFFF"> * Most people have a pathological hatred of Mondays... * </body> * </html>} * </code><br> * and the example code above is used, the list contains only one element, the * {@.html <html>} node.  This node is a {@link org.htmlparser.tags tag}, * which is an object of class * {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory} * (a {@link PrototypicalNodeFactory}) is used.</p> * <p>To get at further content, the children of the top * level nodes must be examined. When digging through a node list one must be * conscious of the possibility of whitespace between nodes, e.g. 
in the example
 * above:
 * <code>
 * <pre>
 * Node node = list.elementAt (0);
 * NodeList sublist = node.getChildren ();
 * System.out.println (sublist.size ());
 * </pre>
 * </code>
 * would print out 5, not 2, because there are newlines after {@.html <html>},
 * {@.html </head>} and {@.html </body>} that are children of the HTML node
 * besides the {@.html <head>} and {@.html <body>} nodes.</p>
 * <p>Because processing nodes is so common, two interfaces are provided to
 * ease this task, {@link org.htmlparser.filters filters}
 * and {@link org.htmlparser.visitors visitors}.
 */
public class Parser
    implements
        Serializable,
        ConnectionMonitor
{
    // Please don't change the formatting of the version variables below.
    // This is done so as to facilitate ant script processing.
    // (Each declaration keeps its name/value and its terminating semicolon
    // on separate lines.)

    /**
     * The floating point version number ({@value}).
     */
    public static final double
    VERSION_NUMBER = 1.6
    ;

    /**
     * The type of version ({@value}).
     */
    public static final String
    VERSION_TYPE = "Integration Build"
    ;

    /**
     * The date of the version ({@value}).
     */
    public static final String
    VERSION_DATE = "Mar 19, 2006"
    ;

    // End of formatting

    /**
     * The display version ({@value}).
     * Built by concatenating the other version constants in the form
     * <code>VERSION_NUMBER (VERSION_TYPE VERSION_DATE)</code>.
     */
    public static final String VERSION_STRING =
            "" + VERSION_NUMBER
            + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";

    /**
     * Feedback object.
     */
    protected ParserFeedback mFeedback;

    /**
     * The html lexer associated with this parser.
     */
    protected Lexer mLexer;

    /**
     * A quiet message sink.
     * Use this for no feedback.
     * Created with the {@link DefaultParserFeedback#QUIET} mode.
     */
    public static final ParserFeedback DEVNULL =
        new DefaultParserFeedback (DefaultParserFeedback.QUIET);

    /**
     * A verbose message sink.
     * Use this for output on <code>System.out</code>.
     * Created with the no-argument {@link DefaultParserFeedback} constructor.
     */
    public static final ParserFeedback STDOUT = new DefaultParserFeedback ();

    //
    // Static methods
    //

    /**
     * Return the version string of this parser.
* @return A string of the form:     * <pre>     * "[floating point number] ([build-type] [build-date])"     * </pre>     */    public static String getVersion ()    {        return (VERSION_STRING);    }    /**     * Return the version number of this parser.     * @return A floating point number, the whole number part is the major     * version, and the fractional part is the minor version.     */    public static double getVersionNumber ()    {        return (VERSION_NUMBER);    }    /**     * Get the connection manager all Parsers use.     * @return The connection manager.     * @see #setConnectionManager     */    public static ConnectionManager getConnectionManager ()    {        return (Page.getConnectionManager ());    }    /**     * Set the connection manager all Parsers use.     * @param manager The new connection manager.     * @see #getConnectionManager     */    public static void setConnectionManager (ConnectionManager manager)    {        Page.setConnectionManager (manager);    }    /**     * Creates the parser on an input string.     * @param html The string containing HTML.     * @param charset <em>Optional</em>. The character set encoding that will     * be reported by {@link #getEncoding}. If charset is <code>null</code>     * the default character set is used.     * @return A parser with the <code>html</code> string as input.     */    public static Parser createParser (String html, String charset)    {        Parser ret;        if (null == html)            throw new IllegalArgumentException ("html cannot be null");        ret = new Parser (new Lexer (new Page (html, charset)));        return (ret);    }    //    // Constructors    //    /**     * Zero argument constructor.     * The parser is in a safe but useless state parsing an empty string.     * Set the lexer or connection using {@link #setLexer}     * or {@link #setConnection}.     
* @see #setLexer(Lexer)     * @see #setConnection(URLConnection)     */    public Parser ()    {        this (new Lexer (new Page ("")), DEVNULL);    }    /**     * Construct a parser using the provided lexer and feedback object.     * This would be used to create a parser for special cases where the     * normal creation of a lexer on a URLConnection needs to be customized.     * @param lexer The lexer to draw characters from.     * @param fb The object to use when information,     * warning and error messages are produced. If <em>null</em> no feedback     * is provided.     */    public Parser (Lexer lexer, ParserFeedback fb)    {        setFeedback (fb);        if (null == lexer)            throw new IllegalArgumentException ("lexer cannot be null");        setLexer (lexer);        setNodeFactory (new PrototypicalNodeFactory ());    }    /**     * Constructor for custom HTTP access.     * This would be used to create a parser for a URLConnection that needs     * a special setup or negotiation conditioning beyond what is available     * from the {@link #getConnectionManager ConnectionManager}.     * @param connection A fully conditioned connection. The connect()     * method will be called so it need not be connected yet.     * @param fb The object to use for message communication.     * @throws ParserException If the creation of the underlying Lexer     * cannot be performed.     */    public Parser (URLConnection connection, ParserFeedback fb)        throws            ParserException    {        this (new Lexer (connection), fb);    }    /**     * Creates a Parser object with the location of the resource (URL or file)     * You would typically create a DefaultHTMLParserFeedback object and pass     * it in.     * @see #Parser(URLConnection,ParserFeedback)     * @param resourceLocn Either the URL or the filename (autodetects).     * A standard HTTP GET is performed to read the content of the URL.     
* @param feedback The HTMLParserFeedback object to use when information,     * warning and error messages are produced. If <em>null</em> no feedback     * is provided.     * @throws ParserException If the URL is invalid.     */    public Parser (String resourceLocn, ParserFeedback feedback)        throws            ParserException    {        this (getConnectionManager ().openConnection (resourceLocn), feedback);    }    /**     * Creates a Parser object with the location of the resource (URL or file).     * A DefaultHTMLParserFeedback object is used for feedback.     * @param resourceLocn Either the URL or the filename (autodetects).     * @throws ParserException If the resourceLocn argument does not resolve     * to a valid page or file.     */    public Parser (String resourceLocn) throws ParserException    {        this (resourceLocn, STDOUT);    }    /**     * Construct a parser using the provided lexer.     * A feedback object printing to {@link #STDOUT System.out} is used.     * This would be used to create a parser for special cases where the     * normal creation of a lexer on a URLConnection needs to be customized.     * @param lexer The lexer to draw characters from.     */    public Parser (Lexer lexer)    {        this (lexer, STDOUT);    }    /**     * Construct a parser using the provided URLConnection.     * This would be used to create a parser for a URLConnection that needs     * a special setup or negotiation conditioning beyond what is available     * from the {@link #getConnectionManager ConnectionManager}.     * A feedback object printing to {@link #STDOUT System.out} is used.     * @see #Parser(URLConnection,ParserFeedback)     * @param connection A fully conditioned connection. The connect()     * method will be called so it need not be connected yet.     * @throws ParserException If the creation of the underlying Lexer     * cannot be performed.     
*/    public Parser (URLConnection connection) throws ParserException    {        this (connection, STDOUT);    }    //    // Bean patterns    //    /**     * Set the connection for this parser.     * This method creates a new <code>Lexer</code> reading from the connection.     * @param connection A fully conditioned connection. The connect()     * method will be called so it need not be connected yet.     * @exception ParserException if the character set specified in the     * HTTP header is not supported, or an i/o exception occurs creating the     * lexer.     * @see #setLexer     * @see #getConnection     */    public void setConnection (URLConnection connection)        throws            ParserException    {        if (null == connection)            throw new IllegalArgumentException ("connection cannot be null");        setLexer (new Lexer (connection));    }    /**     * Return the current connection.     * @return The connection either created by the parser or passed into this     * parser via {@link #setConnection}.     * @see #setConnection(URLConnection)     */    public URLConnection getConnection ()    {        return (getLexer ().getPage ().getConnection ());    }    /**     * Set the URL for this parser.     * This method creates a new Lexer reading from the given URL.     * Trying to set the url to null or an empty string is a no-op.     * @param url The new URL for the parser.     * @throws ParserException If the url is invalid or creation of the     * underlying Lexer cannot be performed.     * @see #getURL     */    public void setURL (String url)        throws            ParserException    {        if ((null != url) && !"".equals (url))            setConnection (getConnectionManager ().openConnection (url));    }    /**     * Return the current URL being parsed.     * @return The current url. This is the URL for the current page.     
* A string passed into the constructor or set via setURL may be altered,
     * for example, a file name may be modified to be a URL.
     * The value is read from the page of the current lexer.
     * @see Page#getUrl
     * @see #setURL
     */
    public String getURL() {
        final Lexer current = getLexer();
        return current.getPage().getUrl();
    }
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -