📄 source.java
字号:
* if no <code>charset</code> parameter was included in the HTTP
* <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header.
* This is consistent with the preliminary encoding detected in this scenario.
* </ol>
*
* @param urlConnection the URL connection from which to load the source text.
* @throws java.io.IOException if an I/O error occurs.
* @see #getEncoding()
*/
public Source(final URLConnection urlConnection) throws IOException {
// Wraps the connection in a new EncodingDetector and delegates all remaining work
// to the constructor overload that accepts that detector.
this(new EncodingDetector(urlConnection));
}
/**
 * Records the encoding and the description of how it was determined, but only if no
 * encoding has been recorded on this object yet.
 *
 * @param encoding the encoding name to record (may be <code>null</code>).
 * @param encodingSpecificationInfo the description to record alongside the encoding.
 * @return the <code>encoding</code> argument, regardless of whether it was stored.
 */
private String setEncoding(final String encoding, final String encodingSpecificationInfo) {
    // Reference comparison on purpose: UNINITIALISED appears to be a marker object
    // rather than a value to be compared with equals().
    if (this.encoding!=UNINITIALISED) {
        // An encoding has already been recorded; leave both fields untouched.
        return encoding;
    }
    this.encoding=encoding;
    this.encodingSpecificationInfo=encodingSpecificationInfo;
    return encoding;
}
/**
* Returns the document {@linkplain #getEncoding() encoding} specified within the text of the document.
* <p>
* The document encoding can be specified within the document text in two ways.
* They are referred to generically in this library as an <i><a name="EncodingSpecification">encoding specification</a></i>,
* and are listed below in order of precedence:
* <ol class="HalfSeparated">
* <li>
* An <a target="_blank" href="http://www.w3.org/TR/REC-xml/#IDAS4MS">encoding declaration</a> within the
* {@linkplain StartTagType#XML_DECLARATION XML declaration} of an XML document,
* which must be present if it has an encoding other than <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>
* or <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>.
* <pre>&lt;?xml version="1.0" encoding="ISO-8859-1" ?&gt;</pre>
* <li>
* A <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#spec-char-encoding">META declaration</a>,
* which is in the form of a {@link HTMLElementName#META META} tag with attribute <code>http-equiv="Content-Type"</code>.
* The encoding is specified in the <code>charset</code> parameter of a
* <code><a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a></code>
* HTTP header value, which is placed in the value of the meta tag's <code>content</code> attribute.
* This META declaration should appear as early as possible in the {@link HTMLElementName#HEAD HEAD} element.
* <pre>&lt;META http-equiv=Content-Type content="text/html; charset=iso-8859-1"&gt;</pre>
* </ol>
* <p>
* Both of these tags must only use characters in the range U+0000 to U+007F, and in the case of the META declaration
* must use ASCII encoding. This, along with the fact that they must occur at or near the beginning of the document,
* assists in their detection and decoding without the need to know the exact encoding of the full text.
*
* @return the document {@linkplain #getEncoding() encoding} specified within the text of the document, or <code>null</code> if no encoding is specified.
* @see #getEncoding()
*/
public String getDocumentSpecifiedEncoding() {
    // A value other than the UNINITIALISED marker means an earlier call already stored a result.
    if (documentSpecifiedEncoding!=UNINITIALISED) {
        return documentSpecifiedEncoding;
    }

    // First preference: the "encoding" attribute of an XML declaration found at position 0.
    final Tag firstTag=getTagAt(0);
    final boolean startsWithXmlDeclaration=firstTag!=null && firstTag.getTagType()==StartTagType.XML_DECLARATION;
    if (startsWithXmlDeclaration) {
        final StartTag xmlDeclaration=(StartTag)firstTag;
        documentSpecifiedEncoding=xmlDeclaration.getAttributeValue("encoding");
        if (documentSpecifiedEncoding!=null) {
            return setEncoding(documentSpecifiedEncoding,firstTag.toString());
        }
    }

    // Second preference: the charset parameter in the "content" attribute of a
    // start tag carrying http-equiv="Content-Type".
    final StartTag metaTag=getNextStartTag(0,"http-equiv","Content-Type",false);
    final String contentValue=(metaTag==null) ? null : metaTag.getAttributeValue("content");
    if (contentValue!=null) {
        documentSpecifiedEncoding=getCharsetParameterFromHttpHeaderValue(contentValue);
        if (documentSpecifiedEncoding!=null) {
            return setEncoding(documentSpecifiedEncoding,metaTag.toString());
        }
    }

    // Nothing usable was found.
    // NOTE(review): when neither branch above assigned documentSpecifiedEncoding it is not
    // assigned here either, so a later call may repeat the search - confirm whether intended.
    return setEncoding(null,"No encoding specified in document");
}
/**
* Returns the character encoding scheme of the source byte stream used to create this object.
* <p>
* The encoding of a document defines how the original byte stream was encoded into characters.
* The <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.4">HTTP specification section 3.4</a>
* uses the term "character set" to refer to the encoding, and the term "charset" is similarly used in Java
* (see the class <code>java.nio.charset.Charset</code>).
* This often causes confusion, as a modern "coded character set" such as <a target="_blank" href="http://www.unicode.org/">Unicode</a>
* can have several encodings, such as <a target="_blank" href="http://www.unicode.org/faq/utf_bom.html">UTF-8, UTF-16, and UTF-32</a>.
* See the Wikipedia <a target="_blank" href="http://en.wikipedia.org/wiki/Character_encoding">character encoding</a> article
* for an explanation of the terminology.
* <p>
* This method makes the best possible effort to return the name of the encoding used to decode the original source byte stream
* into character data. This decoding takes place in the constructor when a parameter based on a byte stream such as an
* <code>InputStream</code> or <code>URL</code> is used to specify the source text.
* The documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors describes how the return value of this
* method is determined in these cases.
* It is also possible in some circumstances for the encoding to be determined in the {@link #Source(Reader)} constructor.
* <p>
* If a constructor was used that specifies the source text directly in character form (not requiring the decoding of a byte sequence)
* then the document itself is searched for an <a href="#EncodingSpecification">encoding specification</a>. In this case, this
* method returns the same value as the {@link #getDocumentSpecifiedEncoding()} method.
* <p>
* The {@link #getEncodingSpecificationInfo()} method returns a simple description of how the value of this method was determined.
*
* @return the character encoding scheme of the source byte stream used to create this object, or <code>null</code> if the encoding is not known.
* @see #getEncodingSpecificationInfo()
*/
public String getEncoding() {
    final boolean encodingNotYetRecorded=(encoding==UNINITIALISED);
    if (encodingNotYetRecorded) {
        // Called for its side effects only; the return value is deliberately ignored
        // and the encoding field is read afterwards instead.
        getDocumentSpecifiedEncoding();
    }
    return encoding;
}
/**
* Returns a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
* <p>
* The description is intended for informational purposes only.
* It is not guaranteed to have any particular format and can not be reliably parsed.
*
* @return a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
* @see #getEncoding()
*/
public String getEncodingSpecificationInfo() {
    // Once an encoding has been recorded, the stored description is returned as-is.
    if (encoding!=UNINITIALISED) {
        return encodingSpecificationInfo;
    }
    // Otherwise run the document search first (return value ignored), then read the
    // description field.
    getDocumentSpecifiedEncoding();
    return encodingSpecificationInfo;
}
/**
* Returns the preliminary encoding of the source document together with a concise description of how it was determined.
* <p>
* It is sometimes necessary for the {@link #Source(InputStream)} and {@link #Source(URL)} constructors to search the document for an
* <a href="#EncodingSpecification">encoding specification</a> in order to determine the exact {@linkplain #getEncoding() encoding}
* of the source byte stream.
* <p>
* In order to search for the {@linkplain #getDocumentSpecifiedEncoding() document specified encoding} before the exact encoding is known,
* a <i>preliminary encoding</i> is determined using the first four bytes of the input stream.
* <p>
* Because the encoding specification must only use characters in the range U+0000 to U+007F, the preliminary encoding need only have the following
* basic properties determined:
* <ul>
* <li>Code unit size (8-bit, 16-bit or 32-bit)
* <li>Byte order (big-endian or little-endian) if the code unit size is 16-bit or 32-bit
* <li>Basic encoding of characters in the range U+0000 to U+007F (current implementation only distinguishes between ASCII and EBCDIC)
* </ul>
* <p>
* The encodings used to represent the most commonly encountered combinations of these basic properties are:
* <ul>
* <li><a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a>: 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible encoding
* <li><a target="_blank" href="http://en.wikipedia.org/wiki/EBCDIC_037">Cp037</a>: 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a>-compatible encoding
* <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a>: 16-bit big-endian encoding
* <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a>: 16-bit little-endian encoding
* <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32BE</a>: 32-bit big-endian encoding (not supported on most java platforms)
* <li><a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32LE</a>: 32-bit little-endian encoding (not supported on most java platforms)
* </ul>
* Note: all encodings with a code unit size greater than 8 bits are assumed to use an
* <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible low-order byte.
* <p>
* In some descriptions returned by this method, and the documentation below, a pattern is used to help demonstrate the contents of the first four bytes of the stream.
* The patterns use the characters "<code>00</code>" to signify a zero byte, "<code>XX</code>" to signify a non-zero byte, and "<code>??</code>" to signify
* a byte that can be either zero or non-zero.
* <p>
* The algorithm for determining the preliminary encoding is as follows:
* <ol class="HalfSeparated">
* <li>Byte pattern "<code>00 00</code>..." : If the stream starts with two zero bytes, the default 32-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32BE</a> is used.
* <li>Byte pattern "<code>00 XX</code>..." : If the stream starts with a single zero byte, the default 16-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a> is used.
* <li>Byte pattern "<code>XX ?? 00 00</code>..." : If the third and fourth bytes of the stream are zero, the default 32-bit little-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-32">UTF-32LE</a> is used.
* <li>Byte pattern "<code>XX 00</code>..." or "<code>XX ?? XX 00</code>..." : If the second or fourth byte of the stream is zero, the default 16-bit little-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16LE</a> is used.
* <li>Byte pattern "<code>XX XX 00 XX</code>..." : If the third byte of the stream is zero, the default 16-bit big-endian encoding <a target="_blank" href="http://en.wikipedia.org/wiki/UTF-16">UTF-16BE</a> is used (assumes the first character is &gt; U+00FF).
* <li>Byte pattern "<code>4C XX XX XX</code>..." : If the first four bytes are consistent with the <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a> encoding of
* an {@linkplain StartTagType#XML_DECLARATION XML declaration} ("<code>&lt;?xm</code>") or
* a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} ("<code>&lt;!DO</code>"),
* or any other string starting with the EBCDIC character '&lt;' followed by three non-ASCII characters (8th bit set),
* which is consistent with EBCDIC alphanumeric characters,
* the default <a target="_blank" href="http://en.wikipedia.org/wiki/Ebcdic">EBCDIC</a>-compatible encoding
* <a target="_blank" href="http://en.wikipedia.org/wiki/EBCDIC_037">Cp037</a> is used.
* <li>Byte pattern "<code>XX XX XX XX</code>..." : Otherwise, if all of the first four bytes of the stream are non-zero,
* the default 8-bit <a target="_blank" href="http://en.wikipedia.org/wiki/Ascii">ASCII</a>-compatible encoding
* <a target="_blank" href="http://en.wikipedia.org/wiki/ISO-8859-1#ISO-8859-1">ISO-8859-1</a> is used.
* </ol>
* <p>
* If it was not necessary to search for a {@linkplain #getDocumentSpecifiedEncoding() document specified encoding} when determining the
* {@linkplain #getEncoding() encoding} of this source document from a byte stream, this method returns <code>null</code>.
* <p>
* See the documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors for more detailed information about when the detection of a
* preliminary encoding is required.
* <p>
* The description returned by this method is intended for informational purposes only.
* It is not guaranteed to have any particular format and can not be reliably parsed.
*
* @return the preliminary encoding of the source document together with a concise description of how it was determined, or <code>null</code> if no preliminary encoding was required.
* @see #getEncoding()
*/
public String getPreliminaryEncodingInfo() {
    // Plain accessor: hands back the stored field without computing anything here.
    return this.preliminaryEncodingInfo;
}
/**
* Indicates whether the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>.
* <p>
* The algorithm used to determine this is designed to be relatively inexpensive and to provide an accurate result in
* most normal situations.
* An exact determination of whether the source document is XML would require a much more complex analysis of the text.
* <p>
* The algorithm is as follows:
* <ol class="HalfSeparated">
* <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document.
* <li>If the document contains a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text
* "<code>xhtml</code>", it is an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence
* also an XML document.
* <li>If none of the above conditions are met, assume the document is normal HTML, and therefore not an XML document.
* </ol>
*
* @return <code>true</code> if the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>.
*/
public boolean isXML() {
final Tag xmlDeclarationTag=getTagAt(0);
if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) return true;
final Tag doctypeTag=getNextTag(0,StartTagType.DOCTYPE_DECLARATION);
// if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document:
if (doctypeTag!=null && getParseText().indexOf("xhtml",doctypeTag.begin,doctypeTag.end)!=-1) return true;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -