📄 tag.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
	/**
	 * Returns an XML representation of this tag.
	 * <p>
	 * This method is abstract in this class; the {@link StartTag} and {@link EndTag} subclasses provide the implementations.
	 * See the documentation of the {@link StartTag#tidy()} and {@link EndTag#tidy()} methods for details of the output produced for each kind of tag.
	 *
	 * @return an XML representation of this tag.
	 */
	public abstract String tidy();

	/**
	 * Tests whether the specified text qualifies as an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
	 * <p>
	 * The text qualifies if it is non-<code>null</code>, non-empty, its first character is accepted by
	 * {@link #isXMLNameStartChar(char)}, and every subsequent character is accepted by {@link #isXMLNameChar(char)}.
	 * <p>
	 * This check is an approximation of the
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">formal definition of an XML Name</a>;
	 * the differences are unlikely to be significant in real-world XML or HTML documents.
	 *
	 * @param text  the text to test, may be <code>null</code>.
	 * @return <code>true</code> if the specified text is a valid <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code> (including for <code>null</code> or empty text).
	 * @see Source#getNameEnd(int pos)
	 */
	public static final boolean isXMLName(final CharSequence text) {
		if (text==null) return false;
		if (text.length()==0) return false;
		// The first character is subject to the stricter "name start" rule.
		if (!isXMLNameStartChar(text.charAt(0))) return false;
		int index=1;
		while (index<text.length()) {
			if (!isXMLNameChar(text.charAt(index))) return false;
			index++;
		}
		return true;
	}

	/**
	 * Tests whether the specified character may appear as the first character of an
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
	 * <p>
	 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> defines a
	 * <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">Name</a></code> as starting with one of the characters
	 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a> | '_' | ':')</code>.
	 * <p>
	 * This method accepts a character if it is <code>'_'</code> or <code>':'</code>, or if <code>Character.isLetter(ch)</code> returns <code>true</code>.
	 * <p>
	 * The <code>Character.isLetter()</code> definition of a letter differs in many ways from the
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">XML definition of a Letter</a>,
	 * but these differences are unlikely to be significant in real-world XML or HTML documents.
	 *
	 * @param ch  the character to test.
	 * @return <code>true</code> if the specified character is valid at the start of an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
	 * @see Source#getNameEnd(int pos)
	 */
	public static final boolean isXMLNameStartChar(final char ch) {
		// The two punctuation characters permitted at the start of a name.
		if (ch=='_' || ch==':') return true;
		return Character.isLetter(ch);
	}

	/**
	 * Tests whether the specified character may appear at any position in an
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
	 * <p>
	 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> uses the
	 * entity <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-NameChar">NameChar</a></code> to represent this set of
	 * characters, which is defined as
	 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a>
	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Digit">Digit</a> | '.' | '-' | '_' | ':'
	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-CombiningChar">CombiningChar</a>
	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Extender">Extender</a>)</code>.
	 * <p>
	 * This method accepts a character if it is one of <code>'.'</code>, <code>'-'</code>, <code>'_'</code> or <code>':'</code>,
	 * or if <code>Character.isLetterOrDigit(ch)</code> returns <code>true</code>.
	 * <p>
	 * There are many differences between these two definitions,
	 * but they are unlikely to be significant in real-world XML or HTML documents.
	 *
	 * @param ch  the character to test.
	 * @return <code>true</code> if the specified character is valid anywhere in an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
	 * @see Source#getNameEnd(int pos)
	 */
	public static final boolean isXMLNameChar(final char ch) {
		switch (ch) {
			// Punctuation characters explicitly permitted inside a name.
			case '.':
			case '-':
			case '_':
			case ':':
				return true;
			default:
				return Character.isLetterOrDigit(ch);
		}
	}

	/**
	 * Returns the result of calling {@link #getNextTag()}.
	 * <p>
	 * This method has been deprecated as of version 3.0 in order to apply a consistent naming convention across all <a href="Tag.html#TagSearchMethods">tag search methods</a>.
	 *
	 * @return the value returned by {@link #getNextTag()}.
	 * @deprecated  Replaced by {@link #getNextTag()}.
	 */
	@Deprecated
	public Tag findNextTag() {
		return getNextTag();
	}

	/**
	 * Returns the result of calling {@link #getPreviousTag()}.
	 * <p>
	 * This method has been deprecated as of version 3.0 in order to apply a consistent naming convention across all <a href="Tag.html#TagSearchMethods">tag search methods</a>.
	 *
	 * @return the value returned by {@link #getPreviousTag()}.
	 * @deprecated  Replaced by {@link #getPreviousTag()}.
	 */
	@Deprecated
	public Tag findPreviousTag() {
		return getPreviousTag();
	}

	// *** consider making public
	// Walks forward through successive getNextTag() results, starting after this tag,
	// and returns the first one that is a StartTag, or null if the chain ends first.
	StartTag getNextStartTag() {
		for (Tag candidate=getNextTag(); candidate!=null; candidate=candidate.getNextTag()) {
			if (candidate instanceof StartTag) return (StartTag)candidate;
		}
		return null;
	}

	// *** consider making public
	// Walks backward through successive getPreviousTag() results, starting before this tag,
	// and returns the first one that is a StartTag, or null if the chain ends first.
	StartTag getPreviousStartTag() {
		for (Tag candidate=getPreviousTag(); candidate!=null; candidate=candidate.getPreviousTag()) {
			if (candidate instanceof StartTag) return (StartTag)candidate;
		}
		return null;
	}

	// *** consider making public
	// Returns the next tag of the specified type after this one, or null if there is none.
	// A null tagType delegates to getNextTag().
	Tag getNextTag(final TagType tagType) {
		if (tagType==null) return getNextTag();
		final boolean unregisteredType=(tagType==StartTagType.UNREGISTERED || tagType==EndTagType.UNREGISTERED);
		// Unregistered types are never looked up via the nextTag links; search the source directly.
		if (unregisteredType) return getNextTag(source,begin+1,tagType);
		Tag current=this;
		for (;;) {
			final Tag following=current.nextTag;
			// Link not yet populated: fall back to searching the source from just after the current tag.
			if (following==NOT_CACHED) return getNextTag(source,current.begin+1,tagType);
			if (following==null) return null;
			if (following.getTagType()==tagType) return following;
			current=following;
		}
	}

	// *** consider making public
	// Returns the closest tag of the specified type before this one, or null if there is none.
	// A null tagType delegates to getPreviousTag().
	Tag getPreviousTag(final TagType tagType) {
		if (tagType==null) return getPreviousTag();
		final boolean unregisteredType=(tagType==StartTagType.UNREGISTERED || tagType==EndTagType.UNREGISTERED);
		// Unregistered types are never looked up via the previousTag links; search the source directly.
		if (unregisteredType) return getPreviousTag(source,begin-1,tagType);
		Tag current=this;
		for (;;) {
			final Tag preceding=current.previousTag;
			// Link not yet populated: fall back to searching the source from just before the current tag.
			if (preceding==NOT_CACHED) return getPreviousTag(source,current.begin-1,tagType);
			if (preceding==null) return null;
			if (preceding.getTagType()==tagType) return preceding;
			current=preceding;
		}
	}

	// Indicates whether this tag may be returned by the untyped tag searches:
	// always when INCLUDE_UNREGISTERED_IN_SEARCH is set, otherwise only if the tag is not unregistered.
	final boolean includeInSearch() {
		if (INCLUDE_UNREGISTERED_IN_SEARCH) return true;
		return !isUnregistered();
	}

	// Finds the closest tag at or before pos, using the source's tag cache when
	// source.useAllTypesCache is set and an uncached search otherwise.
	static final Tag getPreviousTag(final Source source, final int pos) {
		// returns null if pos is out of range.
		if (source.useAllTypesCache) return source.cache.getPreviousTag(pos);
		return getPreviousTagUncached(source,pos,ParseText.NO_BREAK);
	}

	// Finds the closest tag at or after pos, using the source's tag cache when
	// source.useAllTypesCache is set and an uncached search otherwise.
	static final Tag getNextTag(final Source source, final int pos) {
		// returns null if pos is out of range.
		if (source.useAllTypesCache) return source.cache.getNextTag(pos);
		return getNextTagUncached(source,pos,ParseText.NO_BREAK);
	}
		
	// Scans the parse text backwards from pos for '<' characters and returns the first tag found
	// (via getTagAt) at such a position for which includeInSearch() is true, or null if none is found.
	// breakAtPos is passed straight through to ParseText.lastIndexOf; callers in this class pass
	// ParseText.NO_BREAK (presumably meaning "no early stop" - confirm against ParseText).
	// An IndexOutOfBoundsException is not expected here and is reported as an AssertionError
	// with the original exception attached as its cause.
	static final Tag getPreviousTagUncached(final Source source, final int pos, final int breakAtPos) {
		// returns null if pos is out of range.
		try {
			final ParseText parseText=source.getParseText();
			int begin=pos;
			do {
				begin=parseText.lastIndexOf('<',begin,breakAtPos); // this assumes that all tags start with '<'
				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
				if (begin==-1) return null;
				final Tag tag=getTagAt(source,begin,false);
				if (tag!=null && tag.includeInSearch()) return tag;
			} while ((begin-=1)>=0);
		} catch (IndexOutOfBoundsException ex) {
			// Keep the original exception as the cause so the real failure location is not lost.
			// initCause is used rather than the (String,Throwable) constructor so this compiles on older Java versions.
			final AssertionError error=new AssertionError("Unexpected internal exception");
			error.initCause(ex);
			throw error;
		}
		return null;
	}

	// Scans the parse text forwards from pos for '<' characters and returns the first tag found
	// (via getTagAt) at such a position for which includeInSearch() is true, or null if none is found.
	// breakAtPos is passed straight through to ParseText.indexOf.
	static final Tag getNextTagUncached(final Source source, final int pos, final int breakAtPos) {
		// returns null if pos is out of range.
		try {
			final ParseText parseText=source.getParseText();
			int searchFrom=pos;
			while (true) {
				// this assumes that all tags start with '<'
				final int candidatePos=parseText.indexOf('<',searchFrom,breakAtPos);
				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
				if (candidatePos==-1) return null;
				final Tag tag=getTagAt(source,candidatePos,false);
				if (tag!=null && tag.includeInSearch()) return tag;
				// No acceptable tag here: resume one character further on, unless that is past the end of the source.
				searchFrom=candidatePos+1;
				if (searchFrom>=source.end) break;
			}
		} catch (IndexOutOfBoundsException ignored) {
			// this should only happen when the end of file is reached in the middle of a tag.
			// we don't have to do anything to handle it as there are no more tags anyway.
		}
		return null;
	}

	// Finds the closest tag of the given type at or before pos, using the source's tag cache when
	// source.useSpecialTypesCache is set and an uncached search otherwise.
	static final Tag getPreviousTag(final Source source, final int pos, final TagType tagType) {
		// returns null if pos is out of range.
		return source.useSpecialTypesCache
			? source.cache.getPreviousTag(pos,tagType)
			: getPreviousTagUncached(source,pos,tagType,ParseText.NO_BREAK);
	}

	// Finds the closest tag of the given type at or after pos, using the source's tag cache when
	// source.useSpecialTypesCache is set and an uncached search otherwise.
	static final Tag getNextTag(final Source source, final int pos, final TagType tagType) {
		// returns null if pos is out of range.
		return source.useSpecialTypesCache
			? source.cache.getNextTag(pos,tagType)
			: getNextTagUncached(source,pos,tagType,ParseText.NO_BREAK);
	}

	// Scans the parse text backwards from pos for the start delimiter of the given tag type and returns
	// the first tag found (via getTagAt) at such a position whose type is exactly tagType, or null if none is found.
	// A null tagType delegates to the untyped getPreviousTagUncached(source,pos,breakAtPos).
	// An IndexOutOfBoundsException should never happen during a get previous operation;
	// if one does occur it propagates to the caller unchanged.
	static final Tag getPreviousTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
		// returns null if pos is out of range.
		if (tagType==null) return getPreviousTagUncached(source,pos,breakAtPos);
		final char[] startDelimiter=tagType.getStartDelimiterCharArray();
		final ParseText parseText=source.getParseText();
		int searchFrom=pos;
		while (true) {
			final int candidatePos=parseText.lastIndexOf(startDelimiter,searchFrom,breakAtPos);
			// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
			if (candidatePos==-1) return null;
			final Tag tag=getTagAt(source,candidatePos,false);
			if (tag!=null && tag.getTagType()==tagType) return tag;
			// No match here: resume one character earlier, unless that is before the start of the source.
			searchFrom=candidatePos-1;
			if (searchFrom<0) return null;
		}
	}

	// Scans the parse text forwards from pos for the start delimiter of the given tag type and returns
	// the first tag found (via getTagAt) at such a position whose type is exactly tagType, or null if none is found.
	// A null tagType delegates to the untyped getNextTagUncached(source,pos,breakAtPos).
	static final Tag getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
		// returns null if pos is out of range.
		if (tagType==null) return getNextTagUncached(source,pos,breakAtPos);
		final char[] startDelimiter=tagType.getStartDelimiterCharArray();
		try {
			final ParseText parseText=source.getParseText();
			int searchFrom=pos;
			while (true) {
				final int candidatePos=parseText.indexOf(startDelimiter,searchFrom,breakAtPos);
				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
				if (candidatePos==-1) return null;
				final Tag tag=getTagAt(source,candidatePos,false);
				if (tag!=null && tag.getTagType()==tagType) return tag;
				// No match here: resume one character further on, unless that is past the end of the source.
				searchFrom=candidatePos+1;
				if (searchFrom>=source.end) break;
			}
		} catch (IndexOutOfBoundsException ignored) {
			// this should only happen when the end of file is reached in the middle of a tag.
			// we don't have to do anything to handle it as there are no more tags anyway.
		}
		return null;
	}

	// Returns the tag at position pos, using the source's tag cache when
	// source.useAllTypesCache is set and an uncached lookup otherwise.
	static final Tag getTagAt(final Source source, final int pos, final boolean serverTagOnly) {
		// returns null if pos is out of range.
		if (source.useAllTypesCache) return source.cache.getTagAt(pos,serverTagOnly);
		return getTagAtUncached(source,pos,serverTagOnly);
	}

	// Looks up the tag at position pos by delegating to TagType.getTagAt,
	// passing false for that method's fourth argument (the one parseAllgetNextTag
	// supplies as assumeNoNestedTags).
	static final Tag getTagAtUncached(final Source source, final int pos, final boolean serverTagOnly) {
		// returns null if pos is out of range.
		return TagType.getTagAt(source,pos,serverTagOnly,false);
	}

	// Parses the whole source sequentially from position 0, collecting every tag returned by parseAllgetNextTag.
	// The collected tags are handed to source.cache.loadAllTags together with arrays sized for the registered tags
	// and registered start tags (presumably filled in by that call - confirm against the cache implementation).
	// The source's allTagsArray, allTags and allStartTags fields are then set, and the registered tags are linked
	// to each other through their previousTag and nextTag fields.
	// Returns the array of registered tags.
	static final Tag[] parseAll(final Source source, final boolean assumeNoNestedTags) {
		final ArrayList<Tag> list=new ArrayList<Tag>();
		int registeredTagCount=0;
		int registeredStartTagCount=0;
		// fullSequentialParseData is simply a holder for a single mutable integer. It holds the end position of the last normal tag (ie one that ignores enclosed markup), or MAX_VALUE if we are in a SCRIPT element.
		source.fullSequentialParseData=new int[1];
		if (source.end!=0) {
			final ParseText parseText=source.getParseText();
			int pos=0;
			while (true) {
				final Tag tag=parseAllgetNextTag(source,parseText,pos,assumeNoNestedTags);
				if (tag==null) break;
				list.add(tag);
				if (!tag.isUnregistered()) {
					registeredTagCount++;
					if (tag instanceof StartTag) registeredStartTagCount++;
				}
				// Look for next tag after end of next tag if we're assuming tags don't appear inside other tags, as long as the last tag found was not an unregistered tag:
				if (assumeNoNestedTags && !tag.isUnregistered()) {
					pos=tag.end;
				} else {
					pos=tag.begin+1;
				}
				if (pos==source.end) break;
			}
		}
		final Tag[] allRegisteredTags=new Tag[registeredTagCount];
		final StartTag[] allRegisteredStartTags=new StartTag[registeredStartTagCount];
		source.cache.loadAllTags(list,allRegisteredTags,allRegisteredStartTags);
		source.allTagsArray=allRegisteredTags;
		source.allTags=Arrays.asList(allRegisteredTags);
		source.allStartTags=Arrays.asList(allRegisteredStartTags);
		// Chain the registered tags together: each tag points back at its predecessor and forward at its
		// successor, with null at both ends of the chain.
		Tag predecessor=null;
		for (final Tag tag : allRegisteredTags) {
			tag.previousTag=predecessor;
			if (predecessor!=null) predecessor.nextTag=tag;
			predecessor=tag;
		}
		if (predecessor!=null) predecessor.nextTag=null;
		return allRegisteredTags;
	}

	// Used by parseAll to find the first tag at or after pos, by scanning the parse text for '<' characters
	// and asking TagType.getTagAt for a tag at each one. Returns null if no further tag is found.
	// When assumeNoNestedTags is false, source.fullSequentialParseData[0] is also updated for each returned
	// tag that ends beyond the currently recorded position and is neither a DOCTYPE declaration nor unregistered.
	private static final Tag parseAllgetNextTag(final Source source, final ParseText parseText, final int pos, final boolean assumeNoNestedTags) {
		try {
			int searchFrom=pos;
			while (true) {
				// this assumes that all tags start with '<'
				final int candidatePos=parseText.indexOf('<',searchFrom);
				if (candidatePos==-1) return null;
				final Tag tag=TagType.getTagAt(source,candidatePos,false,assumeNoNestedTags);
				if (tag==null) {
					// Not a tag at this '<': resume one character further on, unless that is past the end of the source.
					searchFrom=candidatePos+1;
					if (searchFrom>=source.end) return null;
					continue;
				}
				if (assumeNoNestedTags) return tag;
				final TagType tagType=tag.getTagType();
				final boolean updateParseData=
					tag.end>source.fullSequentialParseData[0]
					&& tagType!=StartTagType.DOCTYPE_DECLARATION
					&& tagType!=StartTagType.UNREGISTERED
					&& tagType!=EndTagType.UNREGISTERED;
				if (updateParseData) {
					// NOTE(review): the name is compared by identity (==), which presumably relies on tag names being canonical constant instances - confirm.
					if (tagType==StartTagType.NORMAL && tag.name==HTMLElementName.SCRIPT) {
						source.fullSequentialParseData[0]=Integer.MAX_VALUE;
					} else {
						source.fullSequentialParseData[0]=tag.end;
					}
				}
				return tag;
			}
		} catch (IndexOutOfBoundsException ignored) {
			// this should only happen when the end of file is reached in the middle of a tag.
			// we don't have to do anything to handle it as there are no more tags anyway.
		}
		return null;
	}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -