📄 page.java
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v $// $Author: derrickoswald $// $Date: 2006/06/02 02:43:25 $// $Revision: 1.57 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.lexer;import java.io.InputStream;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.Serializable;import java.io.UnsupportedEncodingException;import java.lang.reflect.InvocationTargetException;import java.lang.reflect.Method;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.net.UnknownHostException;import java.util.zip.GZIPInputStream;import java.util.zip.Inflater;import java.util.zip.InflaterInputStream;import org.htmlparser.http.ConnectionManager;import org.htmlparser.util.ParserException;/** * Represents the contents of an HTML page. * Contains the source of characters and an index of positions of line * separators (actually the first character position on the next line). */public class Page implements Serializable{ /** * The default charset. * This should be <code>{@value}</code>, * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) * section 3.7.1 * <p>Another alias is "8859_1". */ public static final String DEFAULT_CHARSET = "ISO-8859-1"; /** * The default content type. * In the absence of alternate information, assume html content ({@value}). */ public static final String DEFAULT_CONTENT_TYPE = "text/html"; /** * Character value when the page is exhausted. * Has a value of {@value}. */ public static final char EOF = (char)Source.EOF; /** * The URL this page is coming from. * Cached value of <code>getConnection().toExternalForm()</code> or * <code>setUrl()</code>. */ protected String mUrl; /** * The base URL for this page. */ protected String mBaseUrl; /** * The source of characters. */ protected Source mSource; /** * Character positions of the first character in each line. */ protected PageIndex mIndex; /** * The connection this page is coming from or <code>null</code>. */ protected transient URLConnection mConnection; /** * Connection control (proxy, cookies, authorization). */ protected static ConnectionManager mConnectionManager = new ConnectionManager (); /** * Construct an empty page. */ public Page () { this (""); } /** * Construct a page reading from a URL connection. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @exception ParserException An exception object wrapping a number of * possible error conditions, some of which are outlined below. * <li>IOException If an i/o exception occurs creating the * source.</li> * <li>UnsupportedEncodingException if the character set specified in the * HTTP header is not supported.</li> */ public Page (URLConnection connection) throws ParserException { if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); setConnection (connection); mBaseUrl = null; } /** * Construct a page from a stream encoded with the given charset. * @param stream The source of bytes. * @param charset The encoding used. * If null, defaults to the <code>DEFAULT_CHARSET</code>. * @exception UnsupportedEncodingException If the given charset * is not supported. */ public Page (InputStream stream, String charset) throws UnsupportedEncodingException { if (null == stream) throw new IllegalArgumentException ("stream cannot be null"); if (null == charset) charset = DEFAULT_CHARSET; mSource = new InputStreamSource (stream, charset); mIndex = new PageIndex (this); mConnection = null; mUrl = null; mBaseUrl = null; } /** * Construct a page from the given string. * @param text The HTML text. * @param charset <em>Optional</em>. The character set encoding that will * be reported by {@link #getEncoding}. If charset is <code>null</code> * the default character set is used. */ public Page (String text, String charset) { if (null == text) throw new IllegalArgumentException ("text cannot be null"); if (null == charset) charset = DEFAULT_CHARSET; mSource = new StringSource (text, charset); mIndex = new PageIndex (this); mConnection = null; mUrl = null; mBaseUrl = null; } /** * Construct a page from the given string. * The page will report that it is using an encoding of * {@link #DEFAULT_CHARSET}. * @param text The HTML text. */ public Page (String text) { this (text, null); } /** * Construct a page from a source. * @param source The source of characters. */ public Page (Source source) { if (null == source) throw new IllegalArgumentException ("source cannot be null"); mSource = source; mIndex = new PageIndex (this); mConnection = null; mUrl = null; mBaseUrl = null; } // // static methods // /** * Get the connection manager all Parsers use. * @return The connection manager. */ public static ConnectionManager getConnectionManager () { return (mConnectionManager); } /** * Set the connection manager to use. * @param manager The new connection manager. */ public static void setConnectionManager (ConnectionManager manager) { mConnectionManager = manager; } /** * Get a CharacterSet name corresponding to a charset parameter. * @param content A text line of the form: * <pre> * text/html; charset=Shift_JIS * </pre> * which is applicable both to the HTTP header field Content-Type and * the meta tag http-equiv="Content-Type". * Note this method also handles non-compliant quoted charset directives * such as: * <pre> * text/html; charset="UTF-8" * </pre> * and * <pre> * text/html; charset='UTF-8' * </pre> * @return The character set name to use when reading the input stream. * For JDKs that have the Charset class this is qualified by passing * the name to findCharset() to render it into canonical form. * If the charset parameter is not found in the given string, the default * character set is returned. * @see #findCharset * @see #DEFAULT_CHARSET */ public String getCharset (String content) { final String CHARSET_STRING = "charset"; int index; String ret; if (null == mSource) ret = DEFAULT_CHARSET; else // use existing (possibly supplied) character set: // bug #1322686 when illegal charset specified ret = mSource.getEncoding (); if (null != content) { index = content.indexOf (CHARSET_STRING); if (index != -1) { content = content.substring (index + CHARSET_STRING.length ()).trim (); if (content.startsWith ("=")) { content = content.substring (1).trim (); index = content.indexOf (";"); if (index != -1) content = content.substring (0, index); //remove any double quotes from around charset string if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ())) content = content.substring (1, content.length () - 1); //remove any single quote from around charset string if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ())) content = content.substring (1, content.length () - 1); ret = findCharset (content, ret); // Charset names are not case-sensitive; // that is, case is always ignored when comparing // charset names.// if (!ret.equalsIgnoreCase (content))// {// System.out.println (// "detected charset \""// + content// + "\", using \""// + ret// + "\"");// } } } } return (ret); } /** * Lookup a character set name. * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em> * This uses reflection so the code will still run under prior JDK's but * in that case the default is always returned. * @param name The name to look up. One of the aliases for a character set. * @param fallback The name to return if the lookup fails. * @return The character set name. */ public static String findCharset (String name, String fallback) { String ret; try { Class cls; Method method; Object object; cls = Class.forName ("java.nio.charset.Charset"); method = cls.getMethod ("forName", new Class[] { String.class }); object = method.invoke (null, new Object[] { name }); method = cls.getMethod ("name", new Class[] { }); object = method.invoke (object, new Object[] { }); ret = (String)object; } catch (ClassNotFoundException cnfe) { // for reflection exceptions, assume the name is correct ret = name; } catch (NoSuchMethodException nsme) { // for reflection exceptions, assume the name is correct ret = name; } catch (IllegalAccessException ia) { // for reflection exceptions, assume the name is correct ret = name; } catch (InvocationTargetException ita) { // java.nio.charset.IllegalCharsetNameException // and java.nio.charset.UnsupportedCharsetException // return the default ret = fallback; System.out.println ( "unable to determine cannonical charset name for " + name + " - using " + fallback); } return (ret); } // // Serialization support // /** * Serialize the page. * There are two modes to serializing a page based on the connected state. * If connected, the URL and the current offset is saved, while if * disconnected, the underling source is saved. * @param out The object stream to store this object in. * @exception IOException If there is a serialization problem. */ private void writeObject (ObjectOutputStream out) throws IOException { String href; Source source; PageIndex index; // two cases, reading from a URL and not if (null != getConnection ()) { out.writeBoolean (true); out.writeInt (mSource.offset ()); // need to preread this much href = getUrl (); out.writeObject (href); setUrl (getConnection ().getURL ().toExternalForm ()); source = getSource (); mSource = null; // don't serialize the source if we can avoid it index = mIndex; mIndex = null; // will get recreated; valid for the new page anyway? out.defaultWriteObject (); mSource = source; mIndex = index; } else { out.writeBoolean (false); href = getUrl (); out.writeObject (href); setUrl (null); // don't try and read a bogus URL out.defaultWriteObject (); setUrl (href); } } /** * Deserialize the page.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -