📄 characterreference.java
字号:
/**
* {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
* <p>
* All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
* <p>
* The result is how the text would normally be rendered by a
* <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
* assuming it does not contain any tags.
* <p>
* If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
* then all non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to normal spaces.
* <p>
* <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
* text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
* See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
* method for a more detailed explanation of this topic.
*
* @param text the source text
* @return the decoded text with collapsed white space.
* @see FormControl#getPredefinedValues()
*/
public static String decodeCollapseWhiteSpace(final CharSequence text) {
    // Read the static configuration value once, then hand off to the two-argument overload.
    final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
    return decodeCollapseWhiteSpace(text,convertNonBreakingSpaces);
}
/**
 * Collapses the white space of the specified text and then decodes the result.
 * <p>
 * The text is first passed through <code>appendCollapseWhiteSpace</code> into a buffer pre-sized to the
 * length of the input, and the buffer content is then decoded with the second argument of
 * <code>decode</code> set to <code>false</code>.
 *
 * @param text the source text
 * @param convertNonBreakingSpaces passed through unchanged as the last argument of <code>decode</code>.
 * @return the decoded text with collapsed white space.
 */
static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
    // Collapsing never lengthens the text per the public Javadoc, so the input length is used as the initial capacity.
    final StringBuilder collapsedBuffer=new StringBuilder(text.length());
    return decode(
        appendCollapseWhiteSpace(collapsedBuffer,text),
        false,
        convertNonBreakingSpaces
    );
}
/**
* Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
* <p>
* This process ensures that the specified encoded text does not contain any remaining unencoded characters.
* <p>
* IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method
* followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
* may be used in future.
*
* @param encodedText the text to re-encode.
* @return the re-encoded string.
*/
public static String reencode(final CharSequence encodedText) {
// Decode first, then encode the decoded result again.
// NOTE(review): decode is called here with its boolean argument set to true; elsewhere in this file that
// parameter is referred to as insideAttributeValue - confirm that attribute-value rules are intended here.
return encode(decode(encodedText,true));
}
/**
* Returns the encoded form of this character reference.
* <p>
* The exact behaviour of this method depends on the class of this object.
* See the {@link CharacterEntityReference#getCharacterReferenceString()} and
* {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
* <p>
* <dl>
* <dt>Examples:</dt>
* <dd><code>CharacterReference.parse("&GT;").getCharacterReferenceString()</code> returns "<code>&gt;</code>"</dd>
* <dd><code>CharacterReference.parse("&#x3E;").getCharacterReferenceString()</code> returns "<code>&#x3e;</code>"</dd>
* </dl>
*
* @return the encoded form of this character reference.
* @see #getCharacterReferenceString(int codePoint)
* @see #getDecimalCharacterReferenceString()
*/
public abstract String getCharacterReferenceString(); // abstract: the encoding is supplied by the concrete class of this object
/**
* Returns the encoded form of the specified unicode code point.
* <p>
* This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
* if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
* <p>
* The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
* which is encoded as the numeric character reference "<code>&#39;</code>" instead of its character entity reference
* "<code>&apos;</code>".
* <p>
* <dl>
* <dt>Examples:</dt>
* <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&gt;</code>"</dd>
* <dd><code>CharacterReference.getCharacterReferenceString('>')</code> returns "<code>&gt;</code>"</dd>
* <dd><code>CharacterReference.getCharacterReferenceString('☺')</code> returns "<code>&#9786;</code>"</dd>
* </dl>
*
* @param codePoint the unicode code point to encode.
* @return the encoded form of the specified unicode code point.
* @see #getHexadecimalCharacterReferenceString(int codePoint)
*/
public static String getCharacterReferenceString(final int codePoint) {
    // The apostrophe is deliberately excluded from the entity lookup so that it always
    // falls through to the numeric form below.
    if (codePoint!=CharacterEntityReference._apos) {
        final String entityForm=CharacterEntityReference.getCharacterReferenceString(codePoint);
        if (entityForm!=null) {
            return entityForm;
        }
    }
    // No entity form was used: encode numerically instead.
    return NumericCharacterReference.getCharacterReferenceString(codePoint);
}
/**
* Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
* <p>
* This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&gt;").getDecimalCharacterReferenceString()</code> returns "<code>&#62;</code>"</dd>
* </dl>
*
* @return the decimal encoded form of this character reference.
* @see #getCharacterReferenceString()
* @see #getHexadecimalCharacterReferenceString()
*/
public String getDecimalCharacterReferenceString() {
    // Instance convenience form: forward this object's code point to the static overload.
    final String decimalForm=getDecimalCharacterReferenceString(codePoint);
    return decimalForm;
}
/**
* Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.getDecimalCharacterReferenceString('>')</code> returns "<code>&#62;</code>"</dd>
* </dl>
*
* @param codePoint the unicode code point to encode.
* @return the decimal encoded form of the specified unicode code point.
* @see #getCharacterReferenceString(int codePoint)
* @see #getHexadecimalCharacterReferenceString(int codePoint)
*/
public static String getDecimalCharacterReferenceString(final int codePoint) {
    final StringBuilder buffer=new StringBuilder();
    final String decimalForm;
    try {
        decimalForm=appendDecimalCharacterReferenceString(buffer,codePoint).toString();
    } catch (IOException ex) {
        // never happens: the target is an in-memory StringBuilder
        throw new RuntimeException(ex);
    }
    return decimalForm;
}
/**
* Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
* <p>
* This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&#x3e;</code>"</dd>
* </dl>
*
* @return the hexadecimal encoded form of this character reference.
* @see #getCharacterReferenceString()
* @see #getDecimalCharacterReferenceString()
*/
public String getHexadecimalCharacterReferenceString() {
    // Instance convenience form: forward this object's code point to the static overload.
    final String hexadecimalForm=getHexadecimalCharacterReferenceString(codePoint);
    return hexadecimalForm;
}
/**
* Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('>')</code> returns "<code>&#x3e;</code>"</dd>
* </dl>
*
* @param codePoint the unicode code point to encode.
* @return the hexadecimal encoded form of the specified unicode code point.
* @see #getCharacterReferenceString(int codePoint)
* @see #getDecimalCharacterReferenceString(int codePoint)
*/
public static String getHexadecimalCharacterReferenceString(final int codePoint) {
    final StringBuilder buffer=new StringBuilder();
    final String hexadecimalForm;
    try {
        hexadecimalForm=appendHexadecimalCharacterReferenceString(buffer,codePoint).toString();
    } catch (IOException ex) {
        // never happens: the target is an in-memory StringBuilder
        throw new RuntimeException(ex);
    }
    return hexadecimalForm;
}
/**
* Returns the unicode code point of this character reference in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
* <p>
* This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
* </dl>
*
* @return the unicode code point of this character reference in U+ notation.
* @see #getUnicodeText(int codePoint)
*/
public String getUnicodeText() {
    // Instance convenience form: forward this object's code point to the static overload.
    final String unicodeText=getUnicodeText(codePoint);
    return unicodeText;
}
/**
* Returns the specified unicode code point in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.getUnicodeText('>')</code> returns "<code>U+003E</code>"</dd>
* </dl>
*
* @param codePoint the unicode code point.
* @return the specified unicode code point in U+ notation.
*/
public static String getUnicodeText(final int codePoint) {
    final StringBuilder buffer=new StringBuilder();
    final String unicodeText;
    try {
        unicodeText=appendUnicodeText(buffer,codePoint).toString();
    } catch (IOException ex) {
        // never happens: the target is an in-memory StringBuilder
        throw new RuntimeException(ex);
    }
    return unicodeText;
}
/**
 * Appends the specified unicode code point in U+ notation to the given <code>Appendable</code>.
 * <p>
 * The output is the prefix "<code>U+</code>" followed by the upper-case hexadecimal value of the code point,
 * left-padded with zeros to a minimum of four digits. Values needing more than four digits are written in full.
 * <p>
 * The hexadecimal digits are upper-cased with an explicit locale so that the result never depends on the
 * default locale of the JVM.
 * <p>
 * NOTE: a negative <code>codePoint</code> is not treated specially; <code>Integer.toString(int,int)</code>
 * renders it with a leading minus sign.
 *
 * @param appendable the destination to append to.
 * @param codePoint the unicode code point.
 * @return the <code>appendable</code> argument, to allow call chaining.
 * @throws IOException if the destination throws it while appending.
 */
static final Appendable appendUnicodeText(final Appendable appendable, final int codePoint) throws IOException {
    appendable.append("U+");
    final String hexDigits=Integer.toString(codePoint,16).toUpperCase(java.util.Locale.ENGLISH);
    // Pad up to four digits; the loop does not run when there are already four or more.
    for (int padding=4-hexDigits.length(); padding>0; padding--) {
        appendable.append('0');
    }
    appendable.append(hexDigits);
    return appendable;
}
/**
* Parses a single encoded character reference text into a <code>CharacterReference</code> object.
* <p>
* The character reference must be at the start of the given text, but may contain other characters at the end.
* The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
* <p>
* If the text does not represent a valid character reference, this method returns <code>null</code>.
* <p>
* <a href="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
* {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
* <p>
* To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd><code>CharacterReference.parse("&gt;").getChar()</code> returns '<code>></code>'</dd>
* </dl>
*
* @param characterReferenceText the text containing a single encoded character reference.
* @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
* @see #decode(CharSequence)
*/
public static CharacterReference parse(final CharSequence characterReferenceText) {
    // Wrap the text in a Source so the reference can be constructed from position 0, i.e. the start of the text.
    final Source source=new Source(characterReferenceText);
    // ACCEPT_ALL: unterminated references are accepted regardless of the current compatibility mode.
    return construct(
        source,
        0,
        Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL
    );
}
/**
* Parses a single encoded character reference text into a unicode code point.
* <p>
* The character reference must be at the start of the given text, but may contain other characters at the end.
* <p>
* If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
* <p>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -