📄 characterreference.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;

/**
 * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>,
 * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
 * <p>
 * This class, together with its subclasses, contains static methods to perform most required operations
 * without having to instantiate an object.
 * <p>
 * Instances of this class are useful when the positions of character references in a source document are required,
 * or to replace the found character references with customised text.
 * <p>
 * <code>CharacterReference</code> instances are obtained using one of the following methods:
 * <ul>
 *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
 *  <li>{@link Source#getNextCharacterReference(int pos)}
 *  <li>{@link Source#getPreviousCharacterReference(int pos)}
 *  <li>{@link Segment#getAllCharacterReferences()}
 * </ul>
 */
public abstract class CharacterReference extends Segment {
	int codePoint;

	/**
	 * Represents an invalid unicode code point.
	 * <p>
	 * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference.
	 */
	public static final int INVALID_CODE_POINT=-1;

	static int MAX_ENTITY_REFERENCE_LENGTH; // set in CharacterEntityReference static class initialisation

	/** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
	private static final int TAB_LENGTH=4;

	CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
		super(source,begin,end);
		this.codePoint=codePoint;
	}

	/**
	 * Returns the <a target="_blank" href="http://www.unicode.org">unicode</a> code point represented by this character reference.
	 * @return the unicode code point represented by this character reference.
	 */
	public int getCodePoint() {
		return codePoint;
	}

	/**
	 * Returns the character represented by this character reference.
	 * <p>
	 * If this character reference represents a unicode
	 * <a target="_blank" href="http://www.unicode.org/glossary/#supplementary_code_point">supplimentary code point</a>,
	 * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
	 *
	 * @return the character represented by this character reference.
	 */
	public char getChar() {
		return (char)codePoint;
	}

	/**
	 * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
	 * <p>
	 * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does
	 * not end with a semicolon.
	 * <p>
	 * The SGML specification allows unterminated character references in some circumstances, and because the
	 * HTML 4.01 specification states simply that
	 * "<a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>",
	 * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
	 * <p>
	 * Unterminated character references are not allowed in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents.
	 *
	 * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
	 * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
	 */
	public boolean isTerminated() {
		return source.charAt(end-1)==';';
	}

	/**
	 * Encodes the specified text, escaping special characters into character references.
	 * <p>
	 * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
	 * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
	 * code point is greater than U+007F.
	 * <p>
	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
	 * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
	 * is either left unencoded (default setting), or encoded as the numeric character reference "<code>&amp;#39;</code>".
	 * <p>
	 * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
	 * as this entity is not defined for use in HTML.  See the comments in the {@link CharacterEntityReference} class for more information.
	 * <p>
	 * To encode text using only numeric character references, use the<br />
	 * {@link NumericCharacterReference#encode(CharSequence)} method instead.
	 *
	 * @param unencodedText  the text to encode.
	 * @return the encoded string.
	 * @see #decode(CharSequence)
	 */
	public static String encode(final CharSequence unencodedText) {
		if (unencodedText==null) return null;
		try {
			return appendEncode(new StringBuilder(unencodedText.length()*2),unencodedText,false).toString();
		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
	}

	/**
	 * Encodes the specified character into a character reference if {@linkplain #requiresEncoding(char) required}.
	 * <p>
	 * The encoding of the character follows the same rules as for each character in the {@link #encode(CharSequence unencodedText)} method.
	 *
	 * @param ch  the character to encode.
	 * @return a character reference if appropriate, otherwise a string containing the original character.
	 */
	public static String encode(final char ch) {
		try {
			return appendEncode(new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH),ch).toString();
		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
	}

	/**
	 * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
	 * <p>
	 * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
	 * <ul>
	 *  <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
	 *   are converted to "<code>&lt;br /&gt;</code>".  CR/LF pairs are treated as a single line break.
	 *  <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
	 *   while ensuring the last is always a normal space.
	 *  <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
	 * </ul>
	 * <p>
	 * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
	 * spaces to be rendered, but also allows the line to wrap in the middle of it.
	 * <p>
	 * Note that zero-width spaces (U+200B) are converted to the numeric character reference
	 * "<code>&amp;#x200B;</code>" through the normal encoding process, but IE6 does not render them properly
	 * either encoded or unencoded.
	 * <p>
	 * There is no method provided to reverse this encoding.
	 *
	 * @param unencodedText  the text to encode.
	 * @return the encoded string with white space formatting converted to markup.
	 * @see #encode(CharSequence)
	 */
	public static String encodeWithWhiteSpaceFormatting(final CharSequence unencodedText) {
		if (unencodedText==null) return null;
		try {
			return appendEncode(new StringBuilder(unencodedText.length()*2),unencodedText,true).toString();
		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
	}

	/**
	 * Decodes the specified HTML encoded text into normal text.
	 * <p>
	 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
	 * are converted to their respective characters.
	 * <p>
	 * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
	 * <p>
	 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
	 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
	 * <p>
	 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
	 * some browsers also recognise them in a case-insensitive way.
	 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
	 *
	 * @param encodedText  the text to decode.
	 * @return the decoded string.
	 * @see #encode(CharSequence)
	 */
	public static String decode(final CharSequence encodedText) {
		return decode(encodedText,false,Config.ConvertNonBreakingSpaces);
	}

	/**
	 * Decodes the specified HTML encoded text into normal text.
	 * <p>
	 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
	 * are converted to their respective characters.
	 * <p>
	 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the
	 * value of the <code>insideAttributeValue</code> parameter and the
	 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
	 * <p>
	 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
	 * some browsers also recognise them in a case-insensitive way.
	 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
	 *
	 * @param encodedText  the text to decode.
	 * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
	 * @return the decoded string.
	 * @see #decode(CharSequence)
	 * @see #encode(CharSequence)
	 */
	public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) {
		return decode(encodedText,insideAttributeValue,Config.ConvertNonBreakingSpaces);
	}

	private static String decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) {
		if (encodedText==null) return null;
		for (int i=0; i<encodedText.length(); i++) {
			if (encodedText.charAt(i)=='&') {
				try {
					return appendDecode(new StringBuilder(encodedText.length()),encodedText,i,insideAttributeValue,convertNonBreakingSpaces).toString();
				} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
			}
		}
		return encodedText.toString();
	}
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -