📄 source.java
字号:
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
import java.net.*;
/**
* Represents a source HTML document.
* <p>
* The first step in parsing an HTML document is always to construct a <code>Source</code> object from the source data, which can be a
* <code>String</code>, <code>Reader</code>, <code>InputStream</code>, <code>URLConnection</code> or <code>URL</code>.
* Each constructor uses all the evidence available to determine the original {@linkplain #getEncoding() character encoding} of the data.
* <p>
* Once the <code>Source</code> object has been created, you can immediately start searching for {@linkplain Tag tags} or {@linkplain Element elements} within the document
* using the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
* <p>
* In certain circumstances you may be able to improve performance by calling the {@link #fullSequentialParse()} method before calling any
* <a href="Tag.html#TagSearchMethods">tag search methods</a>. See the documentation of the {@link #fullSequentialParse()} method for details.
* <p>
* Any issues encountered while parsing are logged to a {@link Logger} object.
* The {@link #setLogger(Logger)} method can be used to explicitly set a <code>Logger</code> implementation for a particular <code>Source</code> instance,
* otherwise the static {@link Config#LoggerProvider} property determines how the logger is set by default for all <code>Source</code> instances.
* See the documentation of the {@link Config#LoggerProvider} property for information about how the default logging provider is determined.
* <p>
* Note that many of the useful functions which can be performed on the source document are
* defined in its superclass, {@link Segment}.
* The source object is itself a segment which spans the entire document.
* <p>
* Most of the methods defined in this class are useful for determining the elements and tags
* surrounding or neighbouring a particular character position in the document.
* <p>
* For information on how to create a modified version of this source document, see the {@link OutputDocument} class.
* <p>
* <code>Source</code> objects are not thread safe, and should therefore not be shared between multiple threads unless all access is synchronized using
* some mechanism external to the library.
*
* @see Segment
*/
public class Source extends Segment implements Iterable<Segment> {
// The text of the source document; assigned by the CharSequence constructor from text.toString().
final String string;
// Starts out as the UNINITIALISED sentinel; the code that resolves it lies outside this part of the file.
private String documentSpecifiedEncoding=UNINITIALISED;
// Character encoding of the source data; overwritten by the EncodingDetector-based and Reader-based constructors.
private String encoding=UNINITIALISED; // null value means no encoding specified.
// Human-readable note on where the value of 'encoding' came from; set alongside 'encoding' in the constructors.
private String encodingSpecificationInfo;
// "<preliminary encoding>: <specification info>" as reported by the EncodingDetector; stays null for sources built any other way.
private String preliminaryEncodingInfo=null;
// Starts out as the UNINITIALISED sentinel; presumably determined on demand elsewhere in the class - TODO confirm.
private String newLine=UNINITIALISED;
// Both null until created by code outside this part of the file.
private ParseText parseText=null;
private OutputDocument parseTextOutputDocument=null;
// Installed by the CharSequence constructor via setLogger(newLogger()).
Logger logger; // never null
// Null until populated by code outside this part of the file.
private RowColumnVector[] rowColumnVectorCacheArray=null;
// Created eagerly by this field initializer; note that 'this' is handed to Cache before the constructor body has run.
final Cache cache=new Cache(this);
// Cache switches, both enabled by default.
boolean useAllTypesCache=true;
boolean useSpecialTypesCache=true;
int[] fullSequentialParseData=null; // non-null iff a fullSequentialParse is underway. In version 2.5 this was passed around as a parameter during full sequential parse, but this approach was found to be error-prone and abandoned in 2.6
// cached result lists:
Tag[] allTagsArray=null; // non-null iff fullSequentialParse was called
List<Tag> allTags=null; // non-null iff fullSequentialParse was called
List<StartTag> allStartTags=null;
private List<Element> allElements=null;
private List<Element> childElements=null;
// Static, so the value is shared by every Source instance.
private static String lastNewLine=null;
// Sentinel meaning "not yet determined", as distinct from null.
// NOTE(review): an empty string is also a legitimate String value, so users of this sentinel presumably compare by reference - confirm.
private static final String UNINITIALISED="";
// The three line terminator sequences.
private static final String CR="\r";
private static final String LF="\n";
private static final String CRLF="\r\n";
// Hard-coded package name; the trailing comment records the reflective expression it stands in for.
static final String PACKAGE_NAME="net.htmlparser.jericho"; //Source.class.getPackage().getName();
/**
 * Creates a <code>Source</code> whose content is the supplied text.
 * <p>
 * The segment is sized from <code>text.length()</code>, the text itself is stored as
 * the result of <code>text.toString()</code>, and the logger returned by
 * <code>newLogger()</code> is installed through <code>setLogger</code>.
 *
 * @param text the source text.
 */
public Source(final CharSequence text) {
    super(text.length());
    this.string = text.toString();
    setLogger(newLogger());
}
/*
 * Builds the source from the text returned by getString(encodingDetector), then copies
 * the encoding details reported by the detector into this object's fields.
 * The preliminary encoding info is stored as "<preliminary encoding>: <specification info>".
 */
private Source(final EncodingDetector encodingDetector) throws IOException {
    this(getString(encodingDetector));
    this.encoding = encodingDetector.getEncoding();
    this.encodingSpecificationInfo = encodingDetector.getEncodingSpecificationInfo();
    this.preliminaryEncodingInfo =
            encodingDetector.getPreliminaryEncoding()
            + ": "
            + encodingDetector.getPreliminaryEncodingSpecificationInfo();
}
/*
 * Package-private constructor: loads the text via Util.getString(reader) and, when an
 * encoding name is supplied, records it together with a note of where it came from.
 * A null encoding leaves both encoding fields at their initial values.
 */
Source(final Reader reader, final String encoding) throws IOException {
    this(Util.getString(reader));
    if (encoding == null) {
        return; // nothing to record
    }
    this.encoding = encoding;
    this.encodingSpecificationInfo = "InputStreamReader.getEncoding() of constructor argument";
}
/**
* Constructs a new <code>Source</code> object by loading the content from the specified <code>Reader</code>.
* <p>
* If the specified reader is an instance of <code>InputStreamReader</code>, the {@link #getEncoding()} method of the
* created source object returns the encoding from <code>InputStreamReader.getEncoding()</code>.
* For any other kind of reader, <code>null</code> is passed as the encoding to the two-argument constructor,
* which then leaves the encoding fields at their initial values.
*
* @param reader the <code>java.io.Reader</code> from which to load the source text.
* @throws java.io.IOException if an I/O error occurs.
*/
public Source(final Reader reader) throws IOException {
// The encoding is worked out inline because the this(...) call has to be the first statement of the constructor.
this(reader,(reader instanceof InputStreamReader) ? ((InputStreamReader)reader).getEncoding() : null);
}
/**
* Constructs a new <code>Source</code> object by loading the content from the specified <code>InputStream</code>.
* <p>
* The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document from the raw bytes
* of the specified input stream is the same as that for the {@link #Source(URL)} constructor,
* except that the first step is not possible as there is no
* <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header to check.
*
* @param inputStream the <code>java.io.InputStream</code> from which to load the source text.
* @throws java.io.IOException if an I/O error occurs.
* @see #getEncoding()
*/
public Source(final InputStream inputStream) throws IOException {
// Wraps the stream in an EncodingDetector and delegates to the private EncodingDetector constructor.
// NOTE(review): whether the stream gets closed is decided inside EncodingDetector/getString, which are not visible here - confirm.
this(new EncodingDetector(inputStream));
}
/**
* Constructs a new <code>Source</code> object by loading the content from the specified URL.
* <p>
* This is equivalent to {@link #Source(URLConnection) Source(url.openConnection())}.
*
* @param url the URL from which to load the source text.
* @throws java.io.IOException if an I/O error occurs, including a failure to open the connection to the URL.
* @see #getEncoding()
*/
public Source(final URL url) throws IOException {
// Opens a connection to the URL, wraps it in an EncodingDetector and delegates to the private EncodingDetector constructor.
this(new EncodingDetector(url.openConnection()));
}
/**
* Constructs a new <code>Source</code> object by loading the content from the specified <code>URLConnection</code>.
* <p>
* The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document is as follows:
* <br />(process termination is marked by ♦)
* <ol class="HalfSeparated">
* <li>If the HTTP headers received from the URL connection include a
* <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header
* specifying a <code>charset</code> parameter, then use the encoding specified in the value of the <code>charset</code> parameter. ♦
* <li>Read the first four bytes of the input stream.
* <li>If the input stream is empty, the created source document has zero length and its {@link #getEncoding()} method
* returns <code>null</code>. ♦
* <li>If the input stream starts with a unicode <a target="_blank" href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a> (BOM),
* then use the encoding signified by the BOM. ♦
* <table class="bordered" cellspacing="0" style="margin: 15px">
* <tr><th>BOM Bytes</th><th>Encoding</th></tr>
* <tr><td><code>EF BB BF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a></tr>
* <tr><td><code>FF FE 00 00</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32</a> (little-endian)</tr>
* <tr><td><code>00 00 FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32</a> (big-endian)</tr>
* <tr><td><code>FF FE</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> (little-endian)</tr>
* <tr><td><code>FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> (big-endian)</tr>
* <tr><td><code>0E FE FF</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode">SCSU</a></tr>
* <tr><td><code>2B 2F 76</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-7">UTF-7</a></tr>
* <tr><td><code>DD 73 66 73</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-EBCDIC">UTF-EBCDIC</a></tr>
* <tr><td><code>FB EE 28</code><td><a target="_blank" href="http://en.wikipedia.org/wiki/BOCU-1">BOCU-1</a></tr>
* </table>
* <li>If the stream contains less than four bytes, then:
* <ol class="Unseparated">
* <li>If the stream contains either one or three bytes, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>. ♦
* <li>If the stream starts with a zero byte, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a>. ♦
* <li>If the second byte of the stream is zero, then use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a>. ♦
* <li>Otherwise use the encoding <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>. ♦
* </ol>
* <li>Determine a {@linkplain #getPreliminaryEncodingInfo() preliminary encoding} by examining the first four bytes of the input stream.
* See the {@link #getPreliminaryEncodingInfo()} method for details.
* <li>Read the first 2048 bytes of the input stream and decode it using the preliminary encoding to create a "preview segment".
* If the detected preliminary encoding is not supported on this platform, create the preview segment using
* <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> instead (this incident is logged at {@linkplain Logger#warn(String) warn} level).
* <li>Search the preview segment for an <a href="#EncodingSpecification">encoding specification</a>, which should always appear at or near the top of the document.
* <li>If an encoding specification is found:
* <ol class="Unseparated">
* <li>If the specified encoding is supported on this platform, use it. ♦
* <li>If the specified encoding is not supported on this platform, use the encoding that was used to create the preview segment,
* which is normally the detected {@linkplain #getPreliminaryEncodingInfo() preliminary encoding}. ♦
* </ol>
* <li>If the document {@linkplain #isXML() looks like XML}, then use <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. ♦
* <br/>Section <a target="_blank" href="http://www.w3.org/TR/REC-xml/#charencoding">4.3.3</a> of the XML 1.0 specification states that
* an XML file that is not encoded in UTF-8 must contain either a UTF-16 <a target="_blank" href="http://en.wikipedia.org/wiki/Byte_Order_Mark">BOM</a>
* or an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#IDAS4MS">encoding declaration</a> in its {@linkplain StartTagType#XML_DECLARATION XML declaration}.
* Since neither of these was detected, we can assume the encoding is <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
* <li>Use the encoding that was used to create the preview segment, which is normally the detected {@linkplain #getPreliminaryEncodingInfo() preliminary encoding}. ♦
* <br />This is the best guess, in the absence of any explicit information about the encoding, based on the first four bytes of the stream.
* The <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1">HTTP protocol section 3.7.1</a>
* states that an encoding of <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> can be assumed
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -