📄 source.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
	 *   if no <code>charset</code> parameter was included in the HTTP
	 *   <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header.
	 *   This is consistent with the preliminary encoding detected in this scenario.
	 * </ol>
	 *
	 * @param urlConnection  the URL connection from which to load the source text.
	 * @throws java.io.IOException if an I/O error occurs.
	 * @see #getEncoding()
	 */
	public Source(final URLConnection urlConnection) throws IOException {
		this(new EncodingDetector(urlConnection));
	}

	private String setEncoding(final String encoding, final String encodingSpecificationInfo) {
		if (this.encoding==UNINITIALISED) {
			this.encoding=encoding;
			this.encodingSpecificationInfo=encodingSpecificationInfo;
		}
		return encoding;
	}

	/**
	 * Returns the document {@linkplain #getEncoding() encoding} specified within the text of the document.
	 * <p>
	 * The document encoding can be specified within the document text in two ways.
	 * They are referred to generically in this library as an <i><a name="EncodingSpecification">encoding specification</a></i>,
	 * and are listed below in order of precedence:
	 * <ol class="HalfSeparated">
	 *  <li>
	 *   An <a target="_blank" href="http://www.w3.org/TR/REC-xml/#IDAS4MS">encoding declaration</a> within the
	 *   {@linkplain StartTagType#XML_DECLARATION XML declaration} of an XML document,
	 *   which must be present if it has an encoding other than <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>
	 *   or <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>.
	 *   <pre>&lt;?xml version="1.0" encoding="ISO-8859-1" ?&gt;</pre>
	 *  <li>
	 *   A <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#spec-char-encoding">META declaration</a>,
	 *   which is in the form of a {@link HTMLElementName#META META} tag with attribute <code>http-equiv="Content-Type"</code>.
	 *   The encoding is specified in the <code>charset</code> parameter of a
	 *   <code><a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a></code>
	 *   HTTP header value, which is placed in the value of the meta tag's <code>content</code> attribute.
	 *   This META declaration should appear as early as possible in the {@link HTMLElementName#HEAD HEAD} element.
	 *   <pre>&lt;META http-equiv=Content-Type content="text/html; charset=iso-8859-1"&gt;</pre>
	 * </ol>
	 * <p>
	 * Both of these tags must only use characters in the range U+0000 to U+007F, and in the case of the META declaration
	 * must use ASCII encoding.  This, along with the fact that they must occur at or near the beginning of the document,
	 * assists in their detection and decoding without the need to know the exact encoding of the full text.
	 *
	 * @return the document {@linkplain #getEncoding() encoding} specified within the text of the document, or <code>null</code> if no encoding is specified.
	 * @see #getEncoding()
	 */
	public String getDocumentSpecifiedEncoding() {
		if (documentSpecifiedEncoding!=UNINITIALISED) return documentSpecifiedEncoding;
		final Tag xmlDeclarationTag=getTagAt(0);
		if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) {
			documentSpecifiedEncoding=((StartTag)xmlDeclarationTag).getAttributeValue("encoding");
			if (documentSpecifiedEncoding!=null) return setEncoding(documentSpecifiedEncoding,xmlDeclarationTag.toString());
		}
		// Check for Content-Type http-equiv meta tag:
		final StartTag contentTypeMetaTag=getNextStartTag(0,"http-equiv","Content-Type",false);
		if (contentTypeMetaTag!=null) {
			final String contentValue=contentTypeMetaTag.getAttributeValue("content");
			if (contentValue!=null) {
				documentSpecifiedEncoding=getCharsetParameterFromHttpHeaderValue(contentValue);
				if (documentSpecifiedEncoding!=null) return setEncoding(documentSpecifiedEncoding,contentTypeMetaTag.toString());
			}
		}
		return setEncoding(null,"No encoding specified in document");
	}

	/**
	 * Returns the character encoding scheme of the source byte stream used to create this object.
	 * <p>
	 * The encoding of a document defines how the original byte stream was encoded into characters.
	 * The <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.4">HTTP specification section 3.4</a>
	 * uses the term "character set" to refer to the encoding, and the term "charset" is similarly used in Java
	 * (see the class <code>java.nio.charset.Charset</code>).
	 * This often causes confusion, as a modern "coded character set" such as <a target="_blank" href="http://www.unicode.org/">Unicode</a>
	 * can have several encodings, such as <a target="_blank" href="http://www.unicode.org/faq/utf_bom.html">UTF-8, UTF-16, and UTF-32</a>.
	 * See the Wikipedia <a target="_blank" href="http://en.wikipedia.org/wiki/Character_encoding">character encoding</a> article 
	 * for an explanation of the terminology.
	 * <p>
	 * This method makes the best possible effort to return the name of the encoding used to decode the original source byte stream
	 * into character data.  This decoding takes place in the constructor when a parameter based on a byte stream such as an
	 * <code>InputStream</code> or <code>URL</code> is used to specify the source text.
	 * The documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors describe how the return value of this
	 * method is determined in these cases.
	 * It is also possible in some circumstances for the encoding to be determined in the {@link #Source(Reader)} constructor.
	 * <p>
	 * If a constructor was used that specifies the source text directly in character form (not requiring the decoding of a byte sequence)
	 * then the document itself is searched for an <a href="#EncodingSpecification">encoding specification</a>.  In this case, this
	 * method returns the same value as the {@link #getDocumentSpecifiedEncoding()} method.
	 * <p>
	 * The {@link #getEncodingSpecificationInfo()} method returns a simple description of how the value of this method was determined.
	 *
	 * @return the character encoding scheme of the source byte stream used to create this object, or <code>null</code> if the encoding is not known.
	 * @see #getEncodingSpecificationInfo()
	 */
	public String getEncoding() {
		if (encoding==UNINITIALISED) getDocumentSpecifiedEncoding();
		return encoding;
	}

	/**
	 * Returns a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
	 * <p>
	 * The description is intended for informational purposes only.
	 * It is not guaranteed to have any particular format and can not be reliably parsed.
	 *
	 * @return a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
	 * @see #getEncoding()
	 */
	public String getEncodingSpecificationInfo() {
		if (encoding==UNINITIALISED) getDocumentSpecifiedEncoding();
		return encodingSpecificationInfo;
	}

	/**
	 * Returns the preliminary encoding of the source document together with a concise description of how it was determined.
	 * <p>
	 * It is sometimes necessary for the {@link #Source(InputStream)} and {@link #Source(URL)} constructors to search the document for an 
	 * <a href="#EncodingSpecification">encoding specification</a> in order to determine the exact {@linkplain #getEncoding() encoding}
	 * of the source byte stream.
	 * <p>
	 * In order to search for the {@linkplain #getDocumentSpecifiedEncoding() document specified encoding} before the exact encoding is known,
	 * a <i>preliminary encoding</i> is determined using the first four bytes of the input stream.
	 * <p>
	 * Because the encoding specification must only use characters in the range U+0000 to U+007F, the preliminary encoding need only have the following
	 * basic properties determined:
	 * <ul>
	 *  <li>Code unit size (8-bit, 16-bit or 32-bit)
	 *  <li>Byte order (big-endian or little-endian) if the code unit size is 16-bit or 32-bit
	 *  <li>Basic encoding of characters in the range U+0000 to U+007F (current implementation only distinguishes between ASCII and EBCDIC)
	 * </ul>
	 * <p>
	 * The encodings used to represent the most commonly encountered combinations of these basic properties are:
	 * <ul>
	 *  <li><a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>: 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible encoding
	 *  <li><a target="_blank" href="http://en.wikipedia.org/wiki/EBCDIC_037">Cp037</a>: 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a>-compatible encoding
	 *  <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a>: 16-bit big-endian encoding
	 *  <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a>: 16-bit little-endian encoding
	 *  <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32BE</a>: 32-bit big-endian encoding (not supported on most java platforms)
	 *  <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32LE</a>: 32-bit little-endian encoding (not supported on most java platforms)
	 * </ul>
	 * Note: all encodings with a code unit size greater than 8 bits are assumed to use an
	 * <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible low-order byte.
	 * <p>
	 * In some descriptions returned by this method, and the documentation below, a pattern is used to help demonstrate the contents of the first four bytes of the stream.
	 * The patterns use the characters "<code>00</code>" to signify a zero byte, "<code>XX</code>" to signify a non-zero byte, and "<code>??</code>" to signify
	 * a byte than can be either zero or non-zero.
	 * <p>
	 * The algorithm for determining the preliminary encoding is as follows:
	 * <ol class="HalfSeparated">
	 *  <li>Byte pattern "<code>00 00</code>..." : If the stream starts with two zero bytes, the default 32-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32BE</a> is used.
	 *  <li>Byte pattern "<code>00 XX</code>..." : If the stream starts with a single zero byte, the default 16-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a> is used.
	 *  <li>Byte pattern "<code>XX ?? 00 00</code>..." : If the third and fourth bytes of the stream are zero, the default 32-bit little-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32LE</a> is used.
	 *  <li>Byte pattern "<code>XX 00</code>..." or "<code>XX ?? XX 00</code>..." : If the second or fourth byte of the stream is zero, the default 16-bit little-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a> is used.
	 *  <li>Byte pattern "<code>XX XX 00 XX</code>..." : If the third byte of the stream is zero, the default 16-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a> is used (assumes the first character is > U+00FF).
	 *  <li>Byte pattern "<code>4C XX XX XX</code>..." : If the first four bytes are consistent with the <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a> encoding of
	 *   an {@linkplain StartTagType#XML_DECLARATION XML declaration} ("<code>&lt;?xm</code>") or
	 *   a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} ("<code>&lt;!DO</code>"),
	 *   or any other string starting with the EBCDIC character '&lt;' followed by three non-ASCII characters (8th bit set),
	 *   which is consistent with EBCDIC alphanumeric characters,
	 *   the default <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a>-compatible encoding
	 *   <a target="_blank" href="http://en.wikipedia.org/wiki/EBCDIC_037">Cp037</a> is used.
	 *  <li>Byte pattern "<code>XX XX XX XX</code>..." : Otherwise, if all of the first four bytes of the stream are non-zero,
	 *   the default 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible encoding
	 *   <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> is used.
	 * </ol>
	 * <p>
	 * If it was not necessary to search for a {@linkplain #getDocumentSpecifiedEncoding() document specified encoding} when determining the
	 * {@linkplain #getEncoding() encoding} of this source document from a byte stream, this method returns <code>null</code>.
	 * <p>
	 * See the documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors for more detailed information about when the detection of a
	 * preliminary encoding is required.
	 * <p>
	 * The description returned by this method is intended for informational purposes only.
	 * It is not guaranteed to have any particular format and can not be reliably parsed.
	 *
	 * @return the preliminary encoding of the source document together with a concise description of how it was determined, or <code>null</code> if no preliminary encoding was required.
	 * @see #getEncoding()
	 */
	public String getPreliminaryEncodingInfo() {
		return preliminaryEncodingInfo;
	}

	/**
	 * Indicates whether the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>.
	 * <p>
	 * The algorithm used to determine this is designed to be relatively inexpensive and to provide an accurate result in
	 * most normal situations.
	 * An exact determination of whether the source document is XML would require a much more complex analysis of the text.
	 * <p>
	 * The algorithm is as follows:
	 * <ol class="HalfSeparated">
	 *  <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document.
	 *  <li>If the document contains a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text
	 *   "<code>xhtml</code>", it is an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence
	 *   also an XML document.
	 *  <li>If none of the above conditions are met, assume the document is normal HTML, and therefore not an XML document.
	 * </ol>
	 *
	 * @return <code>true</code> if the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>.
	 */
	public boolean isXML() {
		final Tag xmlDeclarationTag=getTagAt(0);
		if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) return true;
		final Tag doctypeTag=getNextTag(0,StartTagType.DOCTYPE_DECLARATION);
		// if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document:
		if (doctypeTag!=null && getParseText().indexOf("xhtml",doctypeTag.begin,doctypeTag.end)!=-1) return true;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -