📄 parser.java
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $// $Author: derrickoswald $// $Date: 2006/06/10 15:11:32 $// $Revision: 1.120 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser;import java.io.Serializable;import java.net.HttpURLConnection;import java.net.URLConnection;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.http.ConnectionManager;import org.htmlparser.http.ConnectionMonitor;import org.htmlparser.http.HttpHeader;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.util.DefaultParserFeedback;import org.htmlparser.util.IteratorImpl;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.util.ParserFeedback;import org.htmlparser.visitors.NodeVisitor;/** * The main parser class. * This is the primary class of the HTML Parser library. It provides * constructors that take a {@link #Parser(String) String}, * a {@link #Parser(URLConnection) URLConnection}, or a * {@link #Parser(Lexer) Lexer}. 
In the case of a String, * a check is made to see if the first non-whitespace character is a &lt;, in * which case it is assumed to be HTML. Otherwise an * attempt is made to open it as a URL, and if that fails it assumes it is a * local disk file. If you want to parse a String after using the * {@link #Parser() no-args} constructor, use * {@link #setInputHTML setInputHTML()}, or you can use {@link #createParser}. * <p>The Parser provides access to the contents of the * page, via a {@link #elements() NodeIterator}, a * {@link #parse(NodeFilter) NodeList} or a * {@link #visitAllNodesWith NodeVisitor}. * <p>Typical usage of the parser is: * <code> * <pre> * Parser parser = new Parser ("http://whatever"); * NodeList list = parser.parse (null); * // do something with your list of nodes. * </pre> * </code></p> * <p>What types of nodes and what can be done with them is dependent on the * setup, but in general a node can be converted back to HTML and its * children (enclosed nodes) and parent can be obtained, because nodes are * nested. See the {@link Node} interface.</p> * <p>For example, if the URL contains:<br> * <code> * {@.html * <html> * <head> * <title>Mondays -- What a bad idea.</title> * </head> * <body BGCOLOR="#FFFFFF"> * Most people have a pathological hatred of Mondays... * </body> * </html>} * </code><br> * and the example code above is used, the list contains only one element, the * {@.html <html>} node. This node is a {@link org.htmlparser.tags tag}, * which is an object of class * {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory} * (a {@link PrototypicalNodeFactory}) is used.</p> * <p>To get at further content, the children of the top * level nodes must be examined. When digging through a node list one must be * conscious of the possibility of whitespace between nodes, e.g. 
in the example
 * above:
 * <code>
 * <pre>
 * Node node = list.elementAt (0);
 * NodeList sublist = node.getChildren ();
 * System.out.println (sublist.size ());
 * </pre>
 * </code>
 * would print out 5, not 2, because there are newlines after {@.html <html>},
 * {@.html </head>} and {@.html </body>} that are children of the HTML node
 * besides the {@.html <head>} and {@.html <body>} nodes.</p>
 * <p>Because processing nodes is so common, two interfaces are provided to
 * ease this task, {@link org.htmlparser.filters filters}
 * and {@link org.htmlparser.visitors visitors}.
 */
public class Parser implements Serializable, ConnectionMonitor
{
    // Please don't change the formatting of the version variables below.
    // This is done so as to facilitate ant script processing.
    // NOTE(review): the one-token-group-per-line layout of the three version
    // constants is reconstructed from a whitespace-collapsed copy of this
    // file -- confirm it matches what the ant script expects.

    /**
     * The floating point version number ({@value}).
     */
    public static final double
    VERSION_NUMBER = 1.6
    ;

    /**
     * The type of version ({@value}).
     */
    public static final String
    VERSION_TYPE = "Release Build"
    ;

    /**
     * The date of the version ({@value}).
     */
    public static final String
    VERSION_DATE = "Jun 10, 2006"
    ;

    // End of formatting

    /**
     * The display version ({@value}).
     * Composed from {@link #VERSION_NUMBER}, {@link #VERSION_TYPE} and
     * {@link #VERSION_DATE} in the form
     * <code>number (type date)</code>.
     */
    public static final String VERSION_STRING =
        "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";

    /**
     * Feedback object.
     */
    protected ParserFeedback mFeedback;

    /**
     * The html lexer associated with this parser.
     */
    protected Lexer mLexer;

    /**
     * A quiet message sink.
     * Use this for no feedback.
     */
    public static final ParserFeedback DEVNULL =
        new DefaultParserFeedback (DefaultParserFeedback.QUIET);

    /**
     * A verbose message sink.
     * Use this for output on <code>System.out</code>.
     */
    public static final ParserFeedback STDOUT = new DefaultParserFeedback ();

    // On class load, register a default "User-Agent" request property of the
    // form "HTMLParser/<version number>" with the shared connection manager.
    static
    {
        getConnectionManager ().getDefaultRequestProperties ().put (
            "User-Agent", "HTMLParser/" + getVersionNumber ());
    }

    //
    // Static methods
    //

    /**
     * Return the version string of this parser.
* @return A string of the form: * <pre> * "[floating point number] ([build-type] [build-date])" * </pre> */ public static String getVersion () { return (VERSION_STRING); } /** * Return the version number of this parser. * @return A floating point number, the whole number part is the major * version, and the fractional part is the minor version. */ public static double getVersionNumber () { return (VERSION_NUMBER); } /** * Get the connection manager all Parsers use. * @return The connection manager. * @see #setConnectionManager */ public static ConnectionManager getConnectionManager () { return (Page.getConnectionManager ()); } /** * Set the connection manager all Parsers use. * @param manager The new connection manager. * @see #getConnectionManager */ public static void setConnectionManager (ConnectionManager manager) { Page.setConnectionManager (manager); } /** * Creates the parser on an input string. * @param html The string containing HTML. * @param charset <em>Optional</em>. The character set encoding that will * be reported by {@link #getEncoding}. If charset is <code>null</code> * the default character set is used. * @return A parser with the <code>html</code> string as input. * @exception IllegalArgumentException if <code>html</code> is <code>null</code>. */ public static Parser createParser (String html, String charset) { Parser ret; if (null == html) throw new IllegalArgumentException ("html cannot be null"); ret = new Parser (new Lexer (new Page (html, charset))); return (ret); } // // Constructors // /** * Zero argument constructor. * The parser is in a safe but useless state parsing an empty string. * Set the lexer or connection using {@link #setLexer} * or {@link #setConnection}. * @see #setLexer(Lexer) * @see #setConnection(URLConnection) */ public Parser () { this (new Lexer (new Page ("")), DEVNULL); } /** * Construct a parser using the provided lexer and feedback object. 
* This would be used to create a parser for special cases where the
     * normal creation of a lexer on a URLConnection needs to be customized.
     * @param lexer The lexer to draw characters from.
     * @param fb The object to use when information,
     * warning and error messages are produced. If <em>null</em> no feedback
     * is provided.
     */
    public Parser (Lexer lexer, ParserFeedback fb)
    {
        setFeedback (fb);
        setLexer (lexer);
        // NOTE(review): the node factory is installed after the lexer --
        // presumably the setter depends on the lexer already being in place;
        // confirm before reordering these calls.
        setNodeFactory (new PrototypicalNodeFactory ());
    }

    /**
     * Constructor for custom HTTP access.
     * This would be used to create a parser for a URLConnection that needs
     * a special setup or negotiation conditioning beyond what is available
     * from the {@link #getConnectionManager ConnectionManager}.
     * A new {@link Lexer} is created on the connection and passed to
     * {@link #Parser(Lexer,ParserFeedback)}.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @param fb The object to use for message communication.
     * @throws ParserException If the creation of the underlying Lexer
     * cannot be performed.
     */
    public Parser (URLConnection connection, ParserFeedback fb)
        throws ParserException
    {
        this (new Lexer (connection), fb);
    }

    /**
     * Creates a Parser object with the location of the resource (URL or file).
     * You would typically create a {@link DefaultParserFeedback} object and
     * pass it in.
     * @see #Parser(URLConnection,ParserFeedback)
     * @param resource Either a URL, a filename or a string of HTML.
     * The string is considered HTML if the first non-whitespace character
     * is a &lt;. The use of a url or file is autodetected by first attempting
     * to open the resource as a URL, if that fails it is assumed to be a file
     * name.
     * A standard HTTP GET is performed to read the content of the URL.
     * @param feedback The {@link ParserFeedback} object to use when information,
     * warning and error messages are produced. If <em>null</em> no feedback
     * is provided.
     * @throws ParserException If the URL is invalid.
*/
    public Parser (String resource, ParserFeedback feedback)
        throws ParserException
    {
        setFeedback (feedback);
        // setResource() creates and installs the lexer for this resource.
        setResource (resource);
        // NOTE(review): the node factory is installed after the lexer exists --
        // presumably the setter depends on it; confirm before reordering.
        setNodeFactory (new PrototypicalNodeFactory ());
    }

    /**
     * Creates a Parser object with the location of the resource (URL or file).
     * The {@link #STDOUT} feedback object is used for feedback.
     * @param resource Either HTML, a URL or a filename (autodetects).
     * @throws ParserException If the <code>resource</code> argument does not
     * resolve to a valid page or file.
     * @see #Parser(String,ParserFeedback)
     */
    public Parser (String resource)
        throws ParserException
    {
        this (resource, STDOUT);
    }

    /**
     * Construct a parser using the provided lexer.
     * A feedback object printing to {@link #STDOUT System.out} is used.
     * This would be used to create a parser for special cases where the
     * normal creation of a lexer on a URLConnection needs to be customized.
     * @param lexer The lexer to draw characters from.
     */
    public Parser (Lexer lexer)
    {
        this (lexer, STDOUT);
    }

    /**
     * Construct a parser using the provided URLConnection.
     * This would be used to create a parser for a URLConnection that needs
     * a special setup or negotiation conditioning beyond what is available
     * from the {@link #getConnectionManager ConnectionManager}.
     * A feedback object printing to {@link #STDOUT System.out} is used.
     * @see #Parser(URLConnection,ParserFeedback)
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @throws ParserException If the creation of the underlying Lexer
     * cannot be performed.
     */
    public Parser (URLConnection connection)
        throws ParserException
    {
        this (connection, STDOUT);
    }

    //
    // Bean patterns
    //

    /**
     * Set the html, a url, or a file.
     * The resource is treated as HTML when its first non-whitespace character
     * is a &lt;; otherwise it is opened through the shared
     * {@link #getConnectionManager connection manager}. In both cases a new
     * lexer is created and installed with {@link #setLexer}.
     * @param resource The resource to use.
     * @exception IllegalArgumentException if <code>resource</code> is <code>null</code>.
     * @exception ParserException if a problem occurs in connecting.
*/
    public void setResource (String resource)
        throws ParserException
    {
        if (null == resource)
        {
            throw new IllegalArgumentException ("resource cannot be null");
        }

        // Skip leading whitespace to find the first significant character.
        final int end = resource.length ();
        int index = 0;
        while ((index < end) && Character.isWhitespace (resource.charAt (index)))
        {
            index++;
        }

        // Only a leading '<' marks the string as literal HTML; an empty or
        // all-whitespace string falls through to the connection manager.
        final boolean markup = (index < end) && ('<' == resource.charAt (index));
        if (markup)
        {
            setLexer (new Lexer (new Page (resource)));
        }
        else
        {
            setLexer (new Lexer (getConnectionManager ().openConnection (resource)));
        }
    }

    /**
     * Set the connection for this parser.
     * A new <code>Lexer</code> reading from the connection is created and
     * installed via {@link #setLexer}.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @exception IllegalArgumentException if <code>connection</code> is <code>null</code>.
     * @exception ParserException if the character set specified in the
     * HTTP header is not supported, an i/o exception occurs creating the
     * lexer, or a problem occurs in connecting.
     * @see #setLexer
     * @see #getConnection
     */
    public void setConnection (URLConnection connection)
        throws ParserException
    {
        if (null == connection)
        {
            throw new IllegalArgumentException ("connection cannot be null");
        }

        setLexer (new Lexer (connection));
    }

    /**
     * Return the current connection.
     * @return The connection either created by the parser or passed into this
     * parser via {@link #setConnection}.
     * @see #setConnection(URLConnection)
     */
    public URLConnection getConnection ()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -