📄 parser.java
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $// $Author: derrickoswald $// $Date: 2006/03/20 00:26:01 $// $Revision: 1.111 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser;import java.io.Serializable;import java.net.HttpURLConnection;import java.net.URLConnection;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.http.ConnectionManager;import org.htmlparser.http.ConnectionMonitor;import org.htmlparser.http.HttpHeader;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.util.DefaultParserFeedback;import org.htmlparser.util.IteratorImpl;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.util.ParserFeedback;import org.htmlparser.visitors.NodeVisitor;/** * The main parser class. * This is the primary class of the HTML Parser library. 
It provides * constructors that take a {@link #Parser(String) String}, * a {@link #Parser(URLConnection) URLConnection}, or a * {@link #Parser(Lexer) Lexer}. In the case of a String, an * attempt is made to open it as a URL, and if that fails it assumes it is a * local disk file. If you want to actually parse a String, use * {@link #setInputHTML setInputHTML()} after using the * {@link #Parser() no-args} constructor, or use {@link #createParser}. * <p>The Parser provides access to the contents of the * page, via a {@link #elements() NodeIterator}, a * {@link #parse(NodeFilter) NodeList} or a * {@link #visitAllNodesWith NodeVisitor}. * <p>Typical usage of the parser is: * <code> * <pre> * Parser parser = new Parser ("http://whatever"); * NodeList list = parser.parse (); * // do something with your list of nodes. * </pre> * </code></p> * <p>What types of nodes and what can be done with them is dependent on the * setup, but in general a node can be converted back to HTML and its * children (enclosed nodes) and parent can be obtained, because nodes are * nested. See the {@link Node} interface.</p> * <p>For example, if the URL contains:<br> * <code> * {@.html * <html> * <head> * <title>Mondays -- What a bad idea.</title> * </head> * <body BGCOLOR="#FFFFFF"> * Most people have a pathological hatred of Mondays... * </body> * </html>} * </code><br> * and the example code above is used, the list contains only one element, the * {@.html <html>} node. This node is a {@link org.htmlparser.tags tag}, * which is an object of class * {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory} * (a {@link PrototypicalNodeFactory}) is used.</p> * <p>To get at further content, the children of the top * level nodes must be examined. When digging through a node list one must be * conscious of the possibility of whitespace between nodes, e.g.
in the example
 * above:
 * <code>
 * <pre>
 * Node node = list.elementAt (0);
 * NodeList sublist = node.getChildren ();
 * System.out.println (sublist.size ());
 * </pre>
 * </code>
 * would print out 5, not 2, because there are newlines after {@.html <html>},
 * {@.html </head>} and {@.html </body>} that are children of the HTML node
 * besides the {@.html <head>} and {@.html <body>} nodes.</p>
 * <p>Because processing nodes is so common, two interfaces are provided to
 * ease this task, {@link org.htmlparser.filters filters}
 * and {@link org.htmlparser.visitors visitors}.
 */
public class Parser
    implements
        Serializable,
        ConnectionMonitor
{
    // Please don't change the formatting of the version variables below.
    // This is done so as to facilitate ant script processing.
    // NOTE(review): the line breaks in the three declarations below were
    // reconstructed from a whitespace-collapsed copy of this file -- confirm
    // them against what the ant script expects before relying on them.

    /**
     * The floating point version number ({@value}).
     */
    public static final double
    VERSION_NUMBER = 1.6
    ;

    /**
     * The type of version ({@value}).
     */
    public static final String
    VERSION_TYPE = "Integration Build"
    ;

    /**
     * The date of the version ({@value}).
     */
    public static final String
    VERSION_DATE = "Mar 19, 2006"
    ;

    // End of formatting

    /**
     * The display version ({@value}).
     * Assembled from the three constants above in the form
     * <code>number (type date)</code>.
     */
    public static final String VERSION_STRING =
        "" + VERSION_NUMBER
        + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";

    /**
     * Feedback object.
     * This is the object used when information, warning and error
     * messages are produced.
     */
    protected ParserFeedback mFeedback;

    /**
     * The html lexer associated with this parser.
     */
    protected Lexer mLexer;

    /**
     * A quiet message sink.
     * Use this for no feedback.
     * It is a {@link DefaultParserFeedback} created in
     * {@link DefaultParserFeedback#QUIET QUIET} mode.
     */
    public static final ParserFeedback DEVNULL =
        new DefaultParserFeedback (DefaultParserFeedback.QUIET);

    /**
     * A verbose message sink.
     * Use this for output on <code>System.out</code>.
     * It is a {@link DefaultParserFeedback} created with its no-argument
     * constructor.
     */
    public static final ParserFeedback STDOUT = new DefaultParserFeedback ();

    //
    // Static methods
    //

    /**
     * Return the version string of this parser.
* @return A string of the form: * <pre> * "[floating point number] ([build-type] [build-date])" * </pre> */ public static String getVersion () { return (VERSION_STRING); } /** * Return the version number of this parser. * @return A floating point number, the whole number part is the major * version, and the fractional part is the minor version. */ public static double getVersionNumber () { return (VERSION_NUMBER); } /** * Get the connection manager all Parsers use. * @return The connection manager. * @see #setConnectionManager */ public static ConnectionManager getConnectionManager () { return (Page.getConnectionManager ()); } /** * Set the connection manager all Parsers use. * @param manager The new connection manager. * @see #getConnectionManager */ public static void setConnectionManager (ConnectionManager manager) { Page.setConnectionManager (manager); } /** * Creates the parser on an input string. * @param html The string containing HTML. * @param charset <em>Optional</em>. The character set encoding that will * be reported by {@link #getEncoding}. If charset is <code>null</code> * the default character set is used. * @return A parser with the <code>html</code> string as input. */ public static Parser createParser (String html, String charset) { Parser ret; if (null == html) throw new IllegalArgumentException ("html cannot be null"); ret = new Parser (new Lexer (new Page (html, charset))); return (ret); } // // Constructors // /** * Zero argument constructor. * The parser is in a safe but useless state parsing an empty string. * Set the lexer or connection using {@link #setLexer} * or {@link #setConnection}. * @see #setLexer(Lexer) * @see #setConnection(URLConnection) */ public Parser () { this (new Lexer (new Page ("")), DEVNULL); } /** * Construct a parser using the provided lexer and feedback object. * This would be used to create a parser for special cases where the * normal creation of a lexer on a URLConnection needs to be customized. 
* @param lexer The lexer to draw characters from. * @param fb The object to use when information, * warning and error messages are produced. If <em>null</em> no feedback * is provided. */ public Parser (Lexer lexer, ParserFeedback fb) { setFeedback (fb); if (null == lexer) throw new IllegalArgumentException ("lexer cannot be null"); setLexer (lexer); setNodeFactory (new PrototypicalNodeFactory ()); } /** * Constructor for custom HTTP access. * This would be used to create a parser for a URLConnection that needs * a special setup or negotiation conditioning beyond what is available * from the {@link #getConnectionManager ConnectionManager}. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @param fb The object to use for message communication. * @throws ParserException If the creation of the underlying Lexer * cannot be performed. */ public Parser (URLConnection connection, ParserFeedback fb) throws ParserException { this (new Lexer (connection), fb); } /** * Creates a Parser object with the location of the resource (URL or file) * You would typically create a DefaultHTMLParserFeedback object and pass * it in. * @see #Parser(URLConnection,ParserFeedback) * @param resourceLocn Either the URL or the filename (autodetects). * A standard HTTP GET is performed to read the content of the URL. * @param feedback The HTMLParserFeedback object to use when information, * warning and error messages are produced. If <em>null</em> no feedback * is provided. * @throws ParserException If the URL is invalid. */ public Parser (String resourceLocn, ParserFeedback feedback) throws ParserException { this (getConnectionManager ().openConnection (resourceLocn), feedback); } /** * Creates a Parser object with the location of the resource (URL or file). * A DefaultHTMLParserFeedback object is used for feedback. * @param resourceLocn Either the URL or the filename (autodetects). 
* @throws ParserException If the resourceLocn argument does not resolve * to a valid page or file. */ public Parser (String resourceLocn) throws ParserException { this (resourceLocn, STDOUT); } /** * Construct a parser using the provided lexer. * A feedback object printing to {@link #STDOUT System.out} is used. * This would be used to create a parser for special cases where the * normal creation of a lexer on a URLConnection needs to be customized. * @param lexer The lexer to draw characters from. */ public Parser (Lexer lexer) { this (lexer, STDOUT); } /** * Construct a parser using the provided URLConnection. * This would be used to create a parser for a URLConnection that needs * a special setup or negotiation conditioning beyond what is available * from the {@link #getConnectionManager ConnectionManager}. * A feedback object printing to {@link #STDOUT System.out} is used. * @see #Parser(URLConnection,ParserFeedback) * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @throws ParserException If the creation of the underlying Lexer * cannot be performed. */ public Parser (URLConnection connection) throws ParserException { this (connection, STDOUT); } // // Bean patterns // /** * Set the connection for this parser. * This method creates a new <code>Lexer</code> reading from the connection. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @exception ParserException if the character set specified in the * HTTP header is not supported, or an i/o exception occurs creating the * lexer. * @see #setLexer * @see #getConnection */ public void setConnection (URLConnection connection) throws ParserException { if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); setLexer (new Lexer (connection)); } /** * Return the current connection. 
* @return The connection either created by the parser or passed into this
     * parser via {@link #setConnection}.
     * @see #setConnection(URLConnection)
     */
    public URLConnection getConnection() {
        return getLexer().getPage().getConnection();
    }

    /**
     * Point this parser at a new URL.
     * The URL is opened through the shared
     * {@link #getConnectionManager ConnectionManager} and the resulting
     * connection is passed to {@link #setConnection}, which creates a new
     * Lexer reading from it.
     * Trying to set the url to null or an empty string is a no-op.
     * @param url The new URL for the parser.
     * @throws ParserException If the url is invalid or creation of the
     * underlying Lexer cannot be performed.
     * @see #getURL
     */
    public void setURL(String url) throws ParserException {
        if (url == null || "".equals(url)) {
            // Nothing to open: leave the current lexer in place.
            return;
        }
        setConnection(getConnectionManager().openConnection(url));
    }

    /**
     * Return the current URL being parsed.
     * The value is read from the page of the current lexer.
     * @return The current url. This is the URL for the current page.
     * A string passed into the constructor or set via setURL may be altered,
     * for example, a file name may be modified to be a URL.
     * @see Page#getUrl
     * @see #setURL
     */
    public String getURL() {
        return getLexer().getPage().getUrl();
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -