📄 source.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Represents a source HTML document.
 * <p>
 * The first step in parsing an HTML document is always to construct a <code>Source</code> object from the source data, which can be a
 * <code>String</code>, <code>Reader</code>, <code>InputStream</code>, <code>URLConnection</code> or <code>URL</code>.
 * Each constructor uses all the evidence available to determine the original {@linkplain #getEncoding() character encoding} of the data.
 * <p>
 * Once the <code>Source</code> object has been created, you can immediately start searching for {@linkplain Tag tags} or {@linkplain Element elements} within the document
 * using the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
 * <p>
 * In certain circumstances you may be able to improve performance by calling the {@link #fullSequentialParse()} method before calling any
 * <a href="Tag.html#TagSearchMethods">tag search methods</a>.  See the documentation of the {@link #fullSequentialParse()} method for details.
 * <p>
 * Any issues encountered while parsing are logged to a {@link Logger} object.
 * The {@link #setLogger(Logger)} method can be used to explicitly set a <code>Logger</code> implementation for a particular <code>Source</code> instance,
 * otherwise the static {@link Config#LoggerProvider} property determines how the logger is set by default for all <code>Source</code> instances.
 * See the documentation of the {@link Config#LoggerProvider} property for information about how the default logging provider is determined.
 * <p>
 * Note that many of the useful functions which can be performed on the source document are
 * defined in its superclass, {@link Segment}.
 * The source object is itself a segment which spans the entire document.
 * <p>
 * Most of the methods defined in this class are useful for determining the elements and tags
 * surrounding or neighbouring a particular character position in the document.
 * <p>
 * For information on how to create a modified version of this source document, see the {@link OutputDocument} class.
 * <p>
 * <code>Source</code> objects are not thread safe, and should therefore not be shared between multiple threads unless all access is synchronized using
 * some mechanism external to the library.
 *
 * @see Segment
 */
public class Source extends Segment implements Iterable<Segment> {
	final String string;
	private String documentSpecifiedEncoding=UNINITIALISED;
	private String encoding=UNINITIALISED; // null value means no encoding specified.
	private String encodingSpecificationInfo;
	private String preliminaryEncodingInfo=null;
	private String newLine=UNINITIALISED;
	private ParseText parseText=null;
	private OutputDocument parseTextOutputDocument=null;
	Logger logger; // never null
	private RowColumnVector[] rowColumnVectorCacheArray=null;
	final Cache cache=new Cache(this);
	boolean useAllTypesCache=true;
	boolean useSpecialTypesCache=true;
	int[] fullSequentialParseData=null; // non-null iff a fullSequentialParse is underway. In version 2.5 this was passed around as a parameter during full sequential parse, but this approach was found to be error-prone and abandoned in 2.6
	// cached result lists:
	Tag[] allTagsArray=null; // non-null iff fullSequentialParse was called
	List<Tag> allTags=null; // non-null iff fullSequentialParse was called
	List<StartTag> allStartTags=null;
	private List<Element> allElements=null;
	private List<Element> childElements=null;

	private static String lastNewLine=null;

	private static final String UNINITIALISED="";
	private static final String CR="\r";
	private static final String LF="\n";
	private static final String CRLF="\r\n";

	static final String PACKAGE_NAME="net.htmlparser.jericho"; //Source.class.getPackage().getName();

	/**
	 * Constructs a new <code>Source</code> object from the specified text.
	 * @param text  the source text.
	 */
	public Source(final CharSequence text) {
		super(text.length());
		string=text.toString();
		setLogger(newLogger());
	}

	private Source(final EncodingDetector encodingDetector) throws IOException {
		this(getString(encodingDetector));
		encoding=encodingDetector.getEncoding();
		encodingSpecificationInfo=encodingDetector.getEncodingSpecificationInfo();
		preliminaryEncodingInfo=encodingDetector.getPreliminaryEncoding()+": "+encodingDetector.getPreliminaryEncodingSpecificationInfo();
	}

	Source(final Reader reader, final String encoding) throws IOException {
		this(Util.getString(reader));
		if (encoding!=null) {
			this.encoding=encoding;
			encodingSpecificationInfo="InputStreamReader.getEncoding() of constructor argument";
		}
	}

	/**
	 * Constructs a new <code>Source</code> object by loading the content from the specified <code>Reader</code>.
	 * <p>
	 * If the specified reader is an instance of <code>InputStreamReader</code>, the {@link #getEncoding()} method of the
	 * created source object returns the encoding from <code>InputStreamReader.getEncoding()</code>.
	 *
	 * @param reader  the <code>java.io.Reader</code> from which to load the source text.
	 * @throws java.io.IOException if an I/O error occurs.
	 */
	public Source(final Reader reader) throws IOException {
		this(reader,(reader instanceof InputStreamReader) ? ((InputStreamReader)reader).getEncoding() : null);
	}

	/**
	 * Constructs a new <code>Source</code> object by loading the content from the specified <code>InputStream</code>.
	 * <p>
	 * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document from the raw bytes
	 * of the specified input stream is the same as that for the {@link #Source(URL)} constructor,
	 * except that the first step is not possible as there is no
	 * <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header to check.
	 *
	 * @param inputStream  the <code>java.io.InputStream</code> from which to load the source text.
	 * @throws java.io.IOException if an I/O error occurs.
	 * @see #getEncoding()
	 */
	public Source(final InputStream inputStream) throws IOException {
		this(new EncodingDetector(inputStream));
	}

	/**
	 * Constructs a new <code>Source</code> object by loading the content from the specified URL.
	 * <p>
	 * This is equivalent to {@link #Source(URLConnection) Source(url.openConnection())}.
	 *
	 * @param url  the URL from which to load the source text.
	 * @throws java.io.IOException if an I/O error occurs.
	 * @see #getEncoding()
	 */
	public Source(final URL url) throws IOException {
		this(new EncodingDetector(url.openConnection()));
	}

	/**
	 * Constructs a new <code>Source</code> object by loading the content from the specified <code>URLConnection</code>.
	 * <p>
	 * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document is as follows:
	 * <br />(process termination is marked by &diams;)
	 * <ol class="HalfSeparated">
	 *  <li>If the HTTP headers received from the URL connection include a 
	 *   <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header
	 *   specifying a <code>charset</code> parameter, then use the encoding specified in the value of the <code>charset</code> parameter. &diams;
	 *  <li>Read the first four bytes of the input stream.
	 *  <li>If the input stream is empty, the created source document has zero length and its {@link #getEncoding()} method 
	 *   returns <code>null</code>. &diams;
	 *  <li>If the input stream starts with a unicode <a target="_blank" href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a> (BOM),
	 *   then use the encoding signified by the BOM. &diams;
	 *   <table class="bordered" cellspacing="0" style="margin: 15px">
	 *    <tr><th>BOM Bytes</th><th>Encoding</th></tr>
	 *    <tr><td><code>EF BB FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a></tr>
	 *    <tr><td><code>FF FE 00 00</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-32</a> (little-endian)</tr>
	 *    <tr><td><code>00 00 FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-32</a> (big-endian)</tr>
	 *    <tr><td><code>FF FE</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> (little-endian)</tr>
	 *    <tr><td><code>FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> (big-endian)</tr>
	 *    <tr><td><code>0E FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode">SCSU</a></tr>
	 *    <tr><td><code>2B 2F 76</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-7">UTF-7</a></tr>
	 *    <tr><td><code>DD 73 66 73</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-EBCDIC">UTF-EBCDIC</a></tr>
	 *    <tr><td><code>FB EE 28</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/BOCU-1">BOCU-1</a></tr>
	 *   </table>
	 *  <li>If the stream contains less than four bytes, then:
	 *   <ol class="Unseparated">
	 *    <li>If the stream contains either one or three bytes, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>. &diams;
	 *    <li>If the stream starts with a zero byte, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a>. &diams;
	 *    <li>If the second byte of the stream is zero, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a>. &diams;
	 *    <li>Otherwise use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>. &diams;
	 *   </ol>
	 *  <li>Determine a {@linkplain #getPreliminaryEncodingInfo() preliminary encoding} by examining the first four bytes of the input stream.
	 *   See the {@link #getPreliminaryEncodingInfo()} method for details.
	 *  <li>Read the first 2048 bytes of the input stream and decode it using the preliminary encoding to create a "preview segment".
	 *   If the detected preliminary encoding is not supported on this platform, create the preview segment using
	 *   <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> instead (this incident is logged at {@linkplain Logger#warn(String) warn} level).
	 *  <li>Search the preview segment for an <a href="#EncodingSpecification">encoding specification</a>, which should always appear at or near the top of the document.
	 *  <li>If an encoding specification is found:
	 *   <ol class="Unseparated">
	 *    <li>If the specified encoding is supported on this platform, use it. &diams;
	 *    <li>If the specified encoding is not supported on this platform, use the encoding that was used to create the preview segment,
	 *     which is normally the detected {@linkplain #getPreliminaryEncodingInfo() preliminary encoding}. &diams;
	 *   </ol>
	 *  <li>If the document {@linkplain #isXML() looks like XML}, then use <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. &diams;
	 *   <br/>Section <a target="_blank" href="http://www.w3.org/TR/REC-xml/#charencoding">4.3.3</a> of the XML 1.0 specification states that
	 *   an XML file that is not encoded in UTF-8 must contain either a UTF-16 <a target="_blank" href="http://en.wikipedia.org/wiki/Byte_Order_Mark">BOM</a>
	 *   or an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#IDAS4MS">encoding declaration</a> in its {@linkplain StartTagType#XML_DECLARATION XML declaration}.
	 *   Since neither of these was detected, we can assume the encoding is <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
	 *  <li>Use the encoding that was used to create the preview segment, which is normally the detected {@linkplain #getPreliminaryEncodingInfo() preliminary encoding}. &diams;
	 *   <br />This is the best guess, in the absence of any explicit information about the encoding, based on the first four bytes of the stream.
	 *   The <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1">HTTP protocol section 3.7.1</a>
	 *   states that an encoding of <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> can be assumed
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -