📄 characterreference.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:

	/**
	 * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
	 * <p>
	 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
	 * <p>
	 * The result is how the text would normally be rendered by a
	 * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
	 * assuming it does not contain any tags.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
	 * <p>
	 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
	 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
	 * method for a more detailed explanation of this topic.
	 *
	 * @param text  the source text
	 * @return the decoded text with collapsed white space.
	 * @see FormControl#getPredefinedValues()
	 */
	public static String decodeCollapseWhiteSpace(final CharSequence text) {
		// Read the global setting once, then hand off to the two-argument overload which does the work.
		final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
		return decodeCollapseWhiteSpace(text,convertNonBreakingSpaces);
	}

	static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
		// Working buffer, pre-sized to the length of the source text.
		final StringBuilder collapseBuffer=new StringBuilder(text.length());
		// White space is collapsed first; the collapsed text is then decoded.
		// The literal false is the second argument of decode (presumably insideAttributeValue, i.e. the text
		// is treated as lying outside an attribute value - confirm against the three-argument decode method).
		return decode(
			appendCollapseWhiteSpace(collapseBuffer,text),
			false,
			convertNonBreakingSpaces
		);
	}

	/**
	 * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
	 * <p>
	 * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
	 * <p>
	 * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method
	 * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
	 * may be used in future.
	 *
	 * @param encodedText  the text to re-encode.
	 * @return the re-encoded string.
	 */
	public static String reencode(final CharSequence encodedText) {
		// Step 1: decode the supplied text, passing true as the second argument of decode(CharSequence, boolean).
		final String decodedText=decode(encodedText,true);
		// Step 2: encode the decoded text again.
		return encode(decodedText);
	}

	/**
	 * Returns the encoded form of this character reference.
	 * <p>
	 * The exact behaviour of this method depends on the class of this object.
	 * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
	 * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
	 * <p>
	 * <dl>
	 *  <dt>Examples:</dt>
	 *   <dd><code>CharacterReference.parse("&amp;GT;").getCharacterReferenceString()</code> returns "<code>&amp;gt;</code>"</dd>
	 *   <dd><code>CharacterReference.parse("&amp;#x3E;").getCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
	 * </dl>
	 *
	 * @return the encoded form of this character reference.
	 * @see #getCharacterReferenceString(int codePoint)
	 * @see #getDecimalCharacterReferenceString()
	 */
	public abstract String getCharacterReferenceString();

	/**
	 * Returns the encoded form of the specified unicode code point.
	 * <p>
	 * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
	 * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
	 * <p>
	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
	 * which is encoded as the numeric character reference "<code>&amp;#39;</code>" instead of its character entity reference
	 * "<code>&amp;apos;</code>".
	 * <p>
	 * <dl>
	 *  <dt>Examples:</dt>
	 *   <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;gt;</code>"</dd>
	 *   <dd><code>CharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;gt;</code>"</dd>
	 *   <dd><code>CharacterReference.getCharacterReferenceString('&#9786;')</code> returns "<code>&amp;#9786;</code>"</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the encoded form of the specified unicode code point.
	 * @see #getHexadecimalCharacterReferenceString(int codePoint)
	 */
	public static String getCharacterReferenceString(final int codePoint) {
		// The apostrophe never uses its character entity reference; every other code point tries that form first.
		if (codePoint!=CharacterEntityReference._apos) {
			final String entityForm=CharacterEntityReference.getCharacterReferenceString(codePoint);
			if (entityForm!=null) return entityForm;
		}
		// Either the apostrophe, or the entity lookup returned null: use the numeric character reference form.
		return NumericCharacterReference.getCharacterReferenceString(codePoint);
	}

	/**
	 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
	 * <p>
	 * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.parse("&amp;gt;").getDecimalCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
	 * </dl>
	 *
	 * @return the decimal encoded form of this character reference.
	 * @see #getCharacterReferenceString()
	 * @see #getHexadecimalCharacterReferenceString()
	 */
	public String getDecimalCharacterReferenceString() {
		// Instance convenience form: forwards this object's code point to the static overload.
		final int ownCodePoint=this.codePoint;
		return getDecimalCharacterReferenceString(ownCodePoint);
	}

	/**
	 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the decimal encoded form of the specified unicode code point.
	 * @see #getCharacterReferenceString(int codePoint)
	 * @see #getHexadecimalCharacterReferenceString(int codePoint)
	 */
	public static String getDecimalCharacterReferenceString(final int codePoint) {
		final StringBuilder buffer=new StringBuilder();
		final Appendable filled;
		try {
			filled=appendDecimalCharacterReferenceString(buffer,codePoint);
		} catch (IOException ioException) {
			// Not expected when appending to a StringBuilder; rethrown unchecked so callers need no throws clause.
			throw new RuntimeException(ioException);
		}
		// The text is taken from whatever the append helper returned, exactly as before.
		return filled.toString();
	}

	/**
	 * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
	 * <p>
	 * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.parse("&amp;gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
	 * </dl>
	 *
	 * @return the hexadecimal encoded form of this character reference.
	 * @see #getCharacterReferenceString()
	 * @see #getDecimalCharacterReferenceString()
	 */
	public String getHexadecimalCharacterReferenceString() {
		// Instance convenience form: forwards this object's code point to the static overload.
		final int ownCodePoint=this.codePoint;
		return getHexadecimalCharacterReferenceString(ownCodePoint);
	}

	/**
	 * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#x3e;</code>"</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the hexadecimal encoded form of the specified unicode code point.
	 * @see #getCharacterReferenceString(int codePoint)
	 * @see #getDecimalCharacterReferenceString(int codePoint)
	 */
	public static String getHexadecimalCharacterReferenceString(final int codePoint) {
		final StringBuilder buffer=new StringBuilder();
		final Appendable filled;
		try {
			filled=appendHexadecimalCharacterReferenceString(buffer,codePoint);
		} catch (IOException ioException) {
			// Not expected when appending to a StringBuilder; rethrown unchecked so callers need no throws clause.
			throw new RuntimeException(ioException);
		}
		// The text is taken from whatever the append helper returned, exactly as before.
		return filled.toString();
	}

	/**
	 * Returns the unicode code point of this character reference in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
	 * <p>
	 * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.parse("&amp;gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
	 * </dl>
	 *
	 * @return the unicode code point of this character reference in U+ notation.
	 * @see #getUnicodeText(int codePoint)
	 */
	public String getUnicodeText() {
		// Instance convenience form: forwards this object's code point to the static overload.
		final int ownCodePoint=this.codePoint;
		return getUnicodeText(ownCodePoint);
	}

	/**
	 * Returns the specified unicode code point in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.getUnicodeText('&gt;')</code> returns "<code>U+003E</code>"</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point.
	 * @return the specified unicode code point in U+ notation.
	 */
	public static String getUnicodeText(final int codePoint) {
		final StringBuilder buffer=new StringBuilder();
		try {
			// appendUnicodeText writes into and returns the appendable it is given, so the buffer holds the result.
			appendUnicodeText(buffer,codePoint);
		} catch (IOException ioException) {
			// Not expected when appending to a StringBuilder; rethrown unchecked so callers need no throws clause.
			throw new RuntimeException(ioException);
		}
		return buffer.toString();
	}

	static final Appendable appendUnicodeText(final Appendable appendable, final int codePoint) throws IOException {
		// Writes "U+" followed by the code point in upper-case hexadecimal, left-padded with zeros
		// to at least four digits, and returns the same appendable that was passed in.
		appendable.append("U+");
		final String hexDigits=Integer.toString(codePoint,16).toUpperCase();
		int zerosNeeded=4-hexDigits.length();
		while (zerosNeeded>0) {
			appendable.append('0');
			zerosNeeded--;
		}
		appendable.append(hexDigits);
		return appendable;
	}

	/**
	 * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
	 * <p>
	 * The character reference must be at the start of the given text, but may contain other characters at the end.
	 * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
	 * <p>
	 * If the text does not represent a valid character reference, this method returns <code>null</code>.
	 * <p>
	 * <a href="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
	 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * <p>
	 * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.parse("&amp;gt;").getChar()</code> returns '<code>&gt;</code>'</dd>
	 * </dl>
	 *
	 * @param characterReferenceText  the text containing a single encoded character reference.
	 * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
	 * @see #decode(CharSequence)
	 */
	public static CharacterReference parse(final CharSequence characterReferenceText) {
		// Wrap the text in a Source so the reference can be constructed from position 0 (the start of the text).
		final Source source=new Source(characterReferenceText);
		// ACCEPT_ALL is always used here, independent of any other unterminated character reference settings.
		return construct(
			source,
			0,
			Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL
		);
	}

	/**
	 * Parses a single encoded character reference text into a unicode code point.
	 * <p>
	 * The character reference must be at the start of the given text, but may contain other characters at the end.
	 * <p>
	 * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
	 * <p>
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -