📄 characterreference.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	 * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
	 * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
	 * <code>NullPointerException</code>.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&amp;gt;")</code> returns <code>62</code></dd>
	 * </dl>
	 *
	 * @param characterReferenceText  the text containing a single encoded character reference.
	 * @return the unicode code point representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
	 */
	public static int getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText) {
		final CharacterReference parsed=parse(characterReferenceText);
		// A null result from parse(...) is reported as INVALID_CODE_POINT rather than being dereferenced.
		if (parsed==null) {
			return INVALID_CODE_POINT;
		}
		return parsed.getCodePoint();
	}

	/**
	 * Tests whether the specified character has to be encoded when it appears in HTML text.
	 * <p>
	 * A character requires encoding if its unicode code point is greater than U+007F,
	 * or if a {@linkplain CharacterEntityReference character entity reference} exists for it.
	 * <p>
	 * The {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027) is the single exception:
	 * it only requires encoding while the static {@link Config#IsApostropheEncoded} property is <code>true</code>.
	 *
	 * @param ch  the character to test.
	 * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
	 */
	public static final boolean requiresEncoding(final char ch) {
		// everything outside 7-bit ASCII is always encoded
		if (ch>127) {
			return true;
		}
		// ASCII characters without a named entity are passed through as they are
		if (CharacterEntityReference.getName(ch)==null) {
			return false;
		}
		// a named entity exists; only the apostrophe is subject to the configuration switch
		return ch!='\'' || Config.IsApostropheEncoded;
	}

	/**
	 * Wraps the specified <code>Writer</code> in a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes}
	 * all text written to it before handing the result on to the specified <code>Writer</code>.
	 *
	 * @param writer  the destination for the encoded text
	 * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
	 * @see #encode(CharSequence unencodedText)
	 */
	public static Writer getEncodingFilterWriter(final Writer writer) {
		final Writer encodingWriter=new EncodingFilterWriter(writer);
		return encodingWriter;
	}

	/**
	 * A <code>FilterWriter</code> that encodes every character written to it (without white space formatting)
	 * and passes the encoded text on to the wrapped <code>Writer</code>.
	 * <p>
	 * A high surrogate that is immediately followed by its low surrogate within the same bulk
	 * <code>write</code> call is written as a single decimal character reference for the combined code point.
	 * Surrogates that arrive through separate <code>write</code> calls are still encoded one <code>char</code> at a time.
	 */
	private static final class EncodingFilterWriter extends FilterWriter {
		// scratch buffer reused for every encoded character so that no new buffer is allocated per write
		private final StringBuilder sb=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);

		public EncodingFilterWriter(final Writer writer) {
			super(writer);
		}

		/**
		 * Encodes a single character and writes the result to the wrapped <code>Writer</code>.
		 *
		 * @param ch  the character to encode and write.
		 * @throws IOException if the wrapped <code>Writer</code> throws it.
		 */
		public void write(final char ch) throws IOException {
			sb.setLength(0);
			appendEncode(sb,ch);
			if (sb.length()==1)
				out.write(sb.charAt(0)); // character did not expand, write it directly
			else
				out.append(sb);
		}

		/**
		 * Encodes and writes the character held in the low-order 16 bits of the argument.
		 */
		@Override
		public void write(final int chInt) throws IOException {
			write((char)chInt);
		}

		/**
		 * Encodes and writes <code>len</code> characters of <code>cbuf</code> starting at <code>off</code>.
		 */
		@Override
		public void write(final char[] cbuf, final int off, final int len) throws IOException {
			final int end=off+len;
			for (int i=off; i<end; i++) {
				final char ch=cbuf[i];
				if (Character.isHighSurrogate(ch) && i+1<end && Character.isLowSurrogate(cbuf[i+1])) {
					// consume both halves of the pair and emit one reference for the whole code point
					writeCodePointReference(Character.toCodePoint(ch,cbuf[++i]));
				} else {
					write(ch);
				}
			}
		}

		/**
		 * Encodes and writes <code>len</code> characters of <code>str</code> starting at <code>off</code>.
		 */
		@Override
		public void write(final String str, final int off, final int len) throws IOException {
			final int end=off+len;
			for (int i=off; i<end; i++) {
				final char ch=str.charAt(i);
				if (Character.isHighSurrogate(ch) && i+1<end && Character.isLowSurrogate(str.charAt(i+1))) {
					// consume both halves of the pair and emit one reference for the whole code point
					writeCodePointReference(Character.toCodePoint(ch,str.charAt(++i)));
				} else {
					write(ch);
				}
			}
		}

		// Writes the decimal character reference of a supplementary code point to the wrapped Writer.
		private void writeCodePointReference(final int codePoint) throws IOException {
			sb.setLength(0);
			appendDecimalCharacterReferenceString(sb,codePoint);
			out.append(sb);
		}
	}

	/**
	 * Appends the encoded form of a single character to the specified <code>Appendable</code>, without white space formatting.
	 *
	 * @param appendable  the destination of the encoded character.
	 * @param ch  the character to encode.
	 * @return the specified <code>Appendable</code>, or the result of its <code>append</code> call if the character had to be appended here.
	 * @throws IOException if the <code>Appendable</code> throws it.
	 */
	private static Appendable appendEncode(final Appendable appendable, char ch) throws IOException {
		// with whiteSpaceFormatting==false the helper appends the character itself in every branch,
		// so the fallback append below only acts as a safety net
		final boolean alreadyAppended=appendEncodeCheckForWhiteSpaceFormatting(appendable,ch,false);
		return alreadyAppended ? appendable : appendable.append(ch);
	}

	/**
	 * Appends the encoded form of the specified text to the specified <code>Appendable</code>.
	 * <p>
	 * Each character is encoded by {@link #appendEncodeCheckForWhiteSpaceFormatting(Appendable,char,boolean)}.
	 * A high surrogate immediately followed by a low surrogate is written as one decimal character reference
	 * for the combined supplementary code point, instead of one reference per surrogate.
	 * <p>
	 * If <code>whiteSpaceFormatting</code> is <code>true</code>, white space is converted to markup that simulates it:
	 * line breaks become <code>&lt;br /&gt;</code>, a tab counts as <code>TAB_LENGTH</code> spaces,
	 * and runs of more than one space are written as alternating <code>&amp;nbsp;</code> and normal spaces.
	 *
	 * @param appendable  the destination of the encoded text.
	 * @param unencodedText  the text to encode, may be <code>null</code> in which case nothing is appended.
	 * @param whiteSpaceFormatting  whether white space is converted to markup.
	 * @return the specified <code>Appendable</code>.
	 * @throws IOException if the <code>Appendable</code> throws it.
	 */
	static Appendable appendEncode(final Appendable appendable, CharSequence unencodedText, final boolean whiteSpaceFormatting) throws IOException {
		if (unencodedText==null) return appendable;
		int beginPos=0;
		int endPos=unencodedText.length();
		if (unencodedText instanceof Segment) {
			// read directly from the underlying source string using the segment's offsets; this might improve performance slightly
			final Segment segment=(Segment)unencodedText;
			final int segmentOffset=segment.getBegin();
			beginPos=segmentOffset;
			endPos+=segmentOffset;
			unencodedText=segment.source.string;
		}
		for (int i=beginPos; i<endPos; i++) {
			char ch=unencodedText.charAt(i);
			if (Character.isHighSurrogate(ch) && i+1<endPos) {
				final char low=unencodedText.charAt(i+1);
				if (Character.isLowSurrogate(low)) {
					// a surrogate pair represents a single supplementary code point, so emit a single reference for it
					appendDecimalCharacterReferenceString(appendable,Character.toCodePoint(ch,low));
					i++; // skip the low surrogate
					continue;
				}
			}
			if (appendEncodeCheckForWhiteSpaceFormatting(appendable,ch,whiteSpaceFormatting)) continue;
			// need to process white space
			// whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup
			int spaceCount;
			int nexti=i+1;
			if (ch!=' ') {
				if (ch!='\t') {
					// must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string
					if (ch=='\r' && nexti<endPos && unencodedText.charAt(nexti)=='\n') i++; // process cr/lf pair as one line break
					appendable.append("<br />"); // add line break
					continue;
				} else {
					spaceCount=TAB_LENGTH;
				}
			} else {
				spaceCount=1;
			}
			// extend the run over all directly following spaces and tabs
			while (nexti<endPos) {
				ch=unencodedText.charAt(nexti);
				if (ch==' ')
					spaceCount+=1;
				else if (ch=='\t')
					spaceCount+=TAB_LENGTH;
				else
					break;
				nexti++;
			}
			if (spaceCount==1) {
				// handle the very common case of a single character to improve efficiency slightly
				// (nexti is i+1 here, so the loop counter needs no adjustment)
				appendable.append(' ');
				continue;
			}
			if (spaceCount%2==1) appendable.append(' '); // first character is a space if we have an odd number of spaces
			while (spaceCount>=2) {
				appendable.append("&nbsp; "); // use alternating &nbsp; and spaces to keep original number of spaces
				spaceCount-=2;
			}
			// note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
			i=nexti-1; // minus 1 because top level for loop will add it again
		}
		return appendable;
	}

	/**
	 * Appends the encoded form of a single character, unless the character is white space that the caller
	 * has to convert to markup itself.
	 *
	 * @param appendable  the destination of the encoded character.
	 * @param ch  the character to encode.
	 * @param whiteSpaceFormatting  whether white space characters are left for the caller to process.
	 * @return <code>true</code> if the character was appended, or <code>false</code> if nothing was appended because
	 *  <code>whiteSpaceFormatting</code> is <code>true</code> and the character is white space.
	 * @throws IOException if the <code>Appendable</code> throws it.
	 */
	private static final boolean appendEncodeCheckForWhiteSpaceFormatting(final Appendable appendable, char ch, final boolean whiteSpaceFormatting) throws IOException {
		final String entityName=CharacterEntityReference.getName(ch);
		if (entityName!=null) {
			if (ch!='\'') {
				CharacterEntityReference.appendCharacterReferenceString(appendable,entityName);
			} else if (Config.IsApostropheEncoded) {
				// the apostrophe is written as a numeric reference rather than by its entity name
				appendable.append("&#39;");
			} else {
				appendable.append(ch);
			}
			return true;
		}
		if (ch>127) {
			// no named entity available outside 7-bit ASCII, fall back to a decimal reference
			appendDecimalCharacterReferenceString(appendable,ch);
			return true;
		}
		if (whiteSpaceFormatting && isWhiteSpace(ch)) {
			// leave the white space for the caller to convert
			return false;
		}
		appendable.append(ch);
		return true;
	}

	// Finds the closest character reference beginning at or before pos, using the ACCEPT_ALL settings for unterminated references.
	static CharacterReference getPrevious(final Source source, final int pos) {
		final Config.UnterminatedCharacterReferenceSettings acceptAll=Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL;
		return getPrevious(source,pos,acceptAll);
	}

	// Finds the closest character reference beginning at or after pos, using the ACCEPT_ALL settings for unterminated references.
	static CharacterReference getNext(final Source source, final int pos) {
		final Config.UnterminatedCharacterReferenceSettings acceptAll=Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL;
		return getNext(source,pos,acceptAll);
	}

	// Walks backwards over every '&' at or before pos and returns the first one from which a character reference can be constructed, or null if there is none.
	private static CharacterReference getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
		final ParseText parseText=source.getParseText();
		for (pos=parseText.lastIndexOf('&',pos); pos!=-1; pos=parseText.lastIndexOf('&',pos-1)) {
			final CharacterReference candidate=construct(source,pos,unterminatedCharacterReferenceSettings);
			if (candidate!=null) {
				return candidate;
			}
		}
		return null;
	}

	// Walks forwards over every '&' at or after pos and returns the first one from which a character reference can be constructed, or null if there is none.
	private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
		final ParseText parseText=source.getParseText();
		for (pos=parseText.indexOf('&',pos); pos!=-1; pos=parseText.indexOf('&',pos+1)) {
			final CharacterReference candidate=construct(source,pos,unterminatedCharacterReferenceSettings);
			if (candidate!=null) {
				return candidate;
			}
		}
		return null;
	}

	// Appends "&#x", the code point in lower case base 16 digits, and ';' to the Appendable, returning the result of the last append.
	static final Appendable appendHexadecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException {
		final String hexDigits=Integer.toString(codePoint,16);
		return appendable.append("&#x").append(hexDigits).append(';');
	}

	// Appends "&#", the code point in base 10 digits, and ';' to the Appendable, returning the result of the last append.
	static final Appendable appendDecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException {
		final String decimalDigits=Integer.toString(codePoint);
		return appendable.append("&#").append(decimalDigits).append(';');
	}

	// Attempts to construct the character reference that begins at the specified position.
	// Returns null if the character at that position is not '&', or if an IndexOutOfBoundsException
	// is thrown while inspecting the text or constructing the reference.
	private static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
		try {
			if (source.getParseText().charAt(begin)!='&') {
				return null;
			}
			// a '#' directly after the ampersand marks a numeric reference, anything else is treated as an entity name
			if (source.getParseText().charAt(begin+1)=='#') {
				return NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings);
			} else {
				return CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
			}
		} catch (IndexOutOfBoundsException ex) {
			return null;
		}
	}

	// Appends the decoded form of encodedText to the Appendable.
	// The search for character references starts at pos, but the text is always copied from index 0.
	// If convertNonBreakingSpaces is true, a reference whose character is _nbsp is written as a normal space.
	private static Appendable appendDecode(final Appendable appendable, final CharSequence encodedText, int pos, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
		final Config.UnterminatedCharacterReferenceSettings settings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
		final Source source=new Source(encodedText);
		int copiedUpTo=0; // end of the text that has already been written to the Appendable
		for (CharacterReference reference=getNext(source,pos,settings); reference!=null; reference=getNext(source,pos,settings)) {
			// copy the literal text between the previous reference and this one
			if (copiedUpTo!=reference.getBegin()) {
				appendable.append(encodedText,copiedUpTo,reference.getBegin());
			}
			if (reference.getChar()==CharacterEntityReference._nbsp && convertNonBreakingSpaces) {
				appendable.append(' ');
			} else {
				reference.appendTo(appendable);
			}
			copiedUpTo=reference.getEnd();
			pos=copiedUpTo; // continue searching directly after this reference
		}
		// copy whatever literal text follows the last reference
		if (copiedUpTo!=encodedText.length()) {
			appendable.append(encodedText,copiedUpTo,encodedText.length());
		}
		return appendable;
	}

	// Appends the character(s) of this reference's code point to the Appendable:
	// a single char normally, or a high/low surrogate pair for a supplementary code point.
	private void appendTo(Appendable appendable) throws IOException {
		if (!Character.isSupplementaryCodePoint(codePoint)) {
			appendable.append(getChar());
			return;
		}
		appendable.append(getHighSurrogate(codePoint));
		appendable.append(getLowSurrogate(codePoint));
	}
	
	// pinched from http://svn.apache.org/repos/asf/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/text/CharUtils.java
	// Computes the high (leading) surrogate for the specified code point.
	private static char getHighSurrogate(int codePoint) {
		// 0xD7C0 is 0xD800 - (0x10000 >> 10), i.e. the surrogate base with the supplementary plane offset already removed
		final int leading=0xD7C0 + (codePoint >> 10);
		return (char)leading;
	}
	// Computes the low (trailing) surrogate for the specified code point from its lowest 10 bits.
	private static char getLowSurrogate(int codePoint) {
		final int lowTenBits=codePoint & 0x3FF;
		// the low 10 bits of 0xDC00 are all zero, so combining with OR gives the same value as adding
		return (char)(0xDC00 | lowTenBits);
	}
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -