📄 starttag.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
		appendDebugTagType(sb);
		sb.append(super.getDebugInfo());
		return sb.toString();
	}

	/**
	 * Appends a compact debug rendering of this start tag to the given builder.
	 * <p>
	 * A {@code NORMAL} start tag whose attributes are empty is appended as-is via {@code append(this)}.
	 * Any other start tag is abbreviated to {@code '<'}, its name segment, a space, an optional
	 * {@code '/'} (only when the tag is a syntactical empty-element tag) and the closing delimiter
	 * of its start tag type.
	 *
	 * @param sb the builder to append to
	 * @return the same builder instance, to allow call chaining
	 */
	StringBuilder appendDebugTag(final StringBuilder sb) {
		final boolean plainTag=startTagType==StartTagType.NORMAL && getAttributes().isEmpty();
		if (plainTag) {
			return sb.append(this);
		}
		// Abbreviated form: the attribute text is left out.
		sb.append('<');
		sb.append(getNameSegment());
		sb.append(' ');
		if (isSyntacticalEmptyElementTag()) {
			sb.append('/');
		}
		sb.append(startTagType.getClosingDelimiter());
		return sb;
	}

	/**
	 * Appends the description of this tag's start tag type, wrapped in parentheses and followed
	 * by a space, to the given builder. Nothing is appended for {@code NORMAL} start tags.
	 *
	 * @param sb the builder to append to
	 * @return the same builder instance, to allow call chaining
	 */
	StringBuilder appendDebugTagType(final StringBuilder sb) {
		if (startTagType==StartTagType.NORMAL) {
			return sb;
		}
		sb.append('(');
		sb.append(startTagType.getDescription());
		sb.append(") ");
		return sb;
	}

	/**
	 * Locates the end tag that terminates the element started by this start tag.
	 * <p>
	 * For {@code NORMAL} start tags the element name decides the strategy:
	 * <ul>
	 *  <li>optional end tag: delegated to {@code getOptionalEndTag}, which returns a zero length
	 *      end tag rather than {@code null} when the end tag is missing;</li>
	 *  <li>forbidden end tag: {@code null};</li>
	 *  <li>name not in the end-tag-required set and the tag is a syntactical empty-element tag: {@code null};</li>
	 *  <li>otherwise the end tag is required and is searched for.</li>
	 * </ul>
	 * For other start tag types, {@code null} is returned when the type has no corresponding end tag type.
	 *
	 * @return the terminating end tag, or {@code null} if the element has none or a required end tag is missing
	 */
	private EndTag getEndTagInternal() {
		final EndTagType endTagType=startTagType.getCorrespondingEndTagType();
		final boolean normalStartTag=startTagType==StartTagType.NORMAL;
		boolean checkForEmptyElementTag=true;
		if (normalStartTag) {
			final HTMLElementTerminatingTagNameSets terminatingTagNameSets=HTMLElements.getTerminatingTagNameSets(name);
			if (terminatingTagNameSets!=null) {
				// End tag is optional.
				return getOptionalEndTag(terminatingTagNameSets);
			}
			if (HTMLElements.getEndTagForbiddenElementNames().contains(name)) {
				// End tag is forbidden.
				return null;
			}
			// Only check for empty-element tags if the name is not one that requires an end tag.
			checkForEmptyElementTag=!HTMLElements.getEndTagRequiredElementNames().contains(name);
			if (checkForEmptyElementTag && isSyntacticalEmptyElementTag()) {
				// Non-HTML empty-element tag.
				return null;
			}
		} else if (endTagType==null) {
			return null;
		}
		// From here on the end tag is required: this is either a non-NORMAL start tag type that has an
		// end tag type, an HTML element that requires an end tag, or a non-HTML element that is not an
		// empty-element tag.
		final EndTag nextEndTag=source.getNextEndTag(end,endTagType.getEndTagName(name),endTagType);
		if (nextEndTag!=null) {
			if (normalStartTag && HTMLElements.END_TAG_REQUIRED_NESTING_FORBIDDEN_SET.contains(name)) {
				final StartTag nextStartTag=source.getNextStartTag(end,name);
				final boolean endTagComesFirst=nextStartTag==null || nextStartTag.begin>nextEndTag.begin;
				if (endTagComesFirst) {
					return nextEndTag;
				}
				if (source.logger.isInfoEnabled()) {
					source.logger.info(source.getRowColumnVector(begin).appendTo(new StringBuilder(200).append("StartTag at ")).append(" missing required end tag - invalid nested start tag encountered before end tag").toString());
				}
				// Terminate the element at the start of the invalidly nested start tag.
				// This is how IE and Mozilla treat illegally nested A elements, but other elements may vary.
				return new EndTag(source,nextStartTag.begin,nextStartTag.begin,EndTagType.NORMAL,name);
			}
			final Segment[] matched=getEndTag(nextEndTag,checkForEmptyElementTag,Tag.isXMLName(name));
			if (matched!=null) {
				return (EndTag)matched[0];
			}
		}
		if (source.logger.isInfoEnabled()) {
			source.logger.info(source.getRowColumnVector(begin).appendTo(new StringBuilder(200).append("StartTag at ")).append(" missing required end tag").toString());
		}
		return null;
	}

	/**
	 * Finds the end of an element whose end tag is optional.
	 * <p>
	 * Tags following this start tag are scanned in order:
	 * <ul>
	 *  <li>an end tag with the same name as this start tag is returned directly;</li>
	 *  <li>a start tag whose name is in the non-terminating element name set is skipped together
	 *      with its whole element;</li>
	 *  <li>any other tag whose name is in the relevant terminating set (end tag names for end tags,
	 *      start tag names for start tags) implicitly terminates the element, and a zero length
	 *      end tag positioned at the start of that tag is returned.</li>
	 * </ul>
	 * If no terminating tag is found, a zero length end tag at the end of the source is returned.
	 *
	 * @param terminatingTagNameSets the name sets that govern implicit termination of this element
	 * @return the explicit end tag, or a zero length end tag marking the implicit end; never {@code null}
	 */
	private EndTag getOptionalEndTag(final HTMLElementTerminatingTagNameSets terminatingTagNameSets) {
		int pos=end;
		while (pos<source.end) {
			final Tag tag=Tag.getNextTag(source,pos);
			if (tag==null) break;
			final Set<?> terminatingTagNameSet;
			if (tag instanceof EndTag) {
				// Compare by value as well as by identity so the match does not depend on
				// tag names being canonical String instances.
				if (tag.name==name || (name!=null && name.equals(tag.name))) return (EndTag)tag;
				terminatingTagNameSet=terminatingTagNameSets.TerminatingEndTagNameSet;
			} else {
				final Set<?> nonterminatingElementNameSet=terminatingTagNameSets.NonterminatingElementNameSet;
				if (nonterminatingElementNameSet!=null && nonterminatingElementNameSet.contains(tag.name)) {
					// Skip the entire nested element; tags inside it cannot terminate this element.
					final Element nonterminatingElement=((StartTag)tag).getElement();
					pos=nonterminatingElement.end;
					continue;
				}
				terminatingTagNameSet=terminatingTagNameSets.TerminatingStartTagNameSet;
			}
			if (terminatingTagNameSet!=null && terminatingTagNameSet.contains(tag.name)) {
				// Implicit termination: zero length end tag at the start of the terminating tag.
				return new EndTag(source,tag.begin,tag.begin,EndTagType.NORMAL,name);
			}
			pos=tag.begin+1;
		}
		// Ran out of tags. The only legitimate case of this happening is if the HTML end tag is missing,
		// in which case the end of the element is the end of the source document.
		return new EndTag(source,source.end,source.end,EndTagType.NORMAL,name);
	}

	/**
	 * Builds the character sequence used to search the parse text for a start tag with the given name.
	 * The array is produced by {@code Util.getConcatenatedCharArray} from the start delimiter prefix
	 * and the search name (presumably the prefix immediately followed by the name).
	 *
	 * @param searchName the name to search for; must not be empty and must not start with {@code '/'}
	 * @return the start delimiter characters for the search name
	 * @throws IllegalArgumentException if {@code searchName} is zero length, or the character
	 *         directly after the delimiter prefix is {@code '/'}
	 */
	static char[] getStartDelimiterCharArray(final String searchName) {
		if (searchName.length()==0) {
			throw new IllegalArgumentException("searchName argument must not be zero length");
		}
		final char[] delimiterChars=Util.getConcatenatedCharArray(StartTagType.START_DELIMITER_PREFIX,searchName);
		final int firstNameCharIndex=StartTagType.START_DELIMITER_PREFIX.length();
		if (delimiterChars[firstNameCharIndex]=='/') {
			throw new IllegalArgumentException("searchName argument \""+searchName+"\" must not start with '/'");
		}
		return delimiterChars;
	}

	/**
	 * Convenience overload that derives the {@code isXMLTagName} flag before delegating to the
	 * five-argument {@code getPrevious}: the flag is {@code Tag.isXMLName(searchName)} when searching
	 * for {@code NORMAL} start tags, and {@code true} for every other start tag type.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search backwards
	 * @param searchName the tag name to search for
	 * @param searchStartTagType the start tag type to search for
	 * @return the result of the delegated search
	 */
	static StartTag getPrevious(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType) {
		final boolean isXMLTagName=searchStartTagType!=StartTagType.NORMAL || Tag.isXMLName(searchName);
		return getPrevious(source,pos,searchName,searchStartTagType,isXMLTagName);
	}

	/**
	 * Searches backwards from {@code pos} for a start tag of the given type whose name starts with
	 * {@code searchName}.
	 * <p>
	 * An {@code IndexOutOfBoundsException} is not expected during a backwards search; if one occurs
	 * it simply propagates to the caller.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search backwards
	 * @param searchName the tag name to search for, already in lower case; if {@code null} the
	 *        search is delegated to {@code source.getPreviousTag(pos,searchStartTagType)}
	 * @param searchStartTagType the start tag type to search for
	 * @param isXMLTagName whether {@code searchName} is a valid XML name; when searching for
	 *        {@code NORMAL} tags with a name that is not, unregistered tags are also accepted
	 * @return the closest preceding matching start tag, or {@code null} if there is none
	 * @throws IllegalArgumentException if {@code searchName} is zero length or starts with {@code '/'}
	 */
	static StartTag getPrevious(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType, final boolean isXMLTagName) {
		// searchName is already in lower case
		if (searchName==null) {
			return (StartTag)source.getPreviousTag(pos,searchStartTagType);
		}
		final char[] delimiterChars=getStartDelimiterCharArray(searchName);
		final ParseText parseText=source.getParseText();
		int begin=pos;
		do {
			begin=parseText.lastIndexOf(delimiterChars,begin);
			if (begin==-1) {
				return null;
			}
			final StartTag candidate=(StartTag)Tag.getTagAt(source,begin,false);
			if (candidate==null) {
				continue; // keep looking if it wasn't a start tag
			}
			if (searchStartTagType!=candidate.getStartTagType()) {
				// The start tag is of the wrong type.  The only case in which we want to return it is if
				// we are looking for a normal start tag, the found start tag is unregistered, and the search name is NOT a valid XML name.
				// This allows users to search for some types of unregistered tags by name rather than having to register custom tag types.
				final boolean acceptUnregistered=searchStartTagType==StartTagType.NORMAL && !isXMLTagName && candidate.isUnregistered();
				if (!acceptUnregistered) {
					continue;
				}
			}
			if (candidate.getStartTagType().isNameAfterPrefixRequired() && candidate.getName().length()>searchName.length()) {
				// The name of the start tag is longer than the search name, and the type of tag indicates
				// that we are probably looking for an exact match.
				// (eg searchName="a", startTag.name="applet" -> reject)
				// We only require an exact match if the last character of the search name is part of the name, as the
				// search name might be just the prefix of a server tag.
				// (eg searchName="?", startTag.name="?abc" -> accept, but searchName="?a", startTag.name="?abc" -> reject)
				// The only exception to this is if the last character of the search name is a colon (which also forms part of
				// the name), but signifies that we want to search on the entire namespace.
				// (eg searchName="o:", startTag.name="o:p" -> accept)
				final char lastSearchNameChar=searchName.charAt(searchName.length()-1);
				if (lastSearchNameChar!=':' && isXMLNameChar(lastSearchNameChar)) {
					continue;
				}
			}
			return candidate;
		} while ((begin-=2)>=0);
		return null;
	}

	/**
	 * Convenience overload that derives the {@code isXMLTagName} flag before delegating to the
	 * five-argument {@code getNext}: the flag is {@code Tag.isXMLName(searchName)} when searching
	 * for {@code NORMAL} start tags, and {@code true} for every other start tag type.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search forwards
	 * @param searchName the tag name to search for
	 * @param searchStartTagType the start tag type to search for
	 * @return the result of the delegated search
	 */
	static StartTag getNext(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType) {
		final boolean isXMLTagName=searchStartTagType!=StartTagType.NORMAL || Tag.isXMLName(searchName);
		return getNext(source,pos,searchName,searchStartTagType,isXMLTagName);
	}

	/**
	 * Searches forwards from {@code pos} for a start tag of the given type whose name starts with
	 * {@code searchName}.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search forwards
	 * @param searchName the tag name to search for, already in lower case; if {@code null} the
	 *        search is delegated to {@code source.getNextTag(pos,searchStartTagType)}
	 * @param searchStartTagType the start tag type to search for; must not be {@code null}
	 * @param isXMLTagName whether {@code searchName} is a valid XML name; only used if
	 *        {@code searchStartTagType==StartTagType.NORMAL}
	 * @return the next matching start tag, or {@code null} if there is none
	 * @throws IllegalArgumentException if {@code searchName} is zero length or starts with {@code '/'}
	 */
	static StartTag getNext(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType, final boolean isXMLTagName) {
		if (searchName==null) {
			return (StartTag)source.getNextTag(pos,searchStartTagType);
		}
		final char[] delimiterChars=getStartDelimiterCharArray(searchName);
		try {
			final ParseText parseText=source.getParseText();
			int begin=pos;
			do {
				begin=parseText.indexOf(delimiterChars,begin);
				if (begin==-1) {
					return null;
				}
				final StartTag candidate=(StartTag)Tag.getTagAt(source,begin,false);
				if (candidate==null) {
					continue; // keep looking if it wasn't a start tag
				}
				if (searchStartTagType!=candidate.getStartTagType()) {
					// The start tag is of the wrong type.  The only case in which we want to return it is if
					// we are looking for a normal start tag, the found start tag is unregistered, and the search name is NOT a valid XML name.
					// This allows users to search for some types of unregistered tags by name rather than having to register custom tag types.
					final boolean acceptUnregistered=searchStartTagType==StartTagType.NORMAL && !isXMLTagName && candidate.isUnregistered();
					if (!acceptUnregistered) {
						continue;
					}
				}
				if (candidate.getStartTagType().isNameAfterPrefixRequired() && candidate.getName().length()>searchName.length()) {
					// The name of the start tag is longer than the search name, and the type of tag indicates
					// that we are probably looking for an exact match.
					// (eg searchName="a", startTag.name="applet" -> reject)
					// We only require an exact match if the last character of the search name is part of the name, as the
					// search name might be just the prefix of a server tag.
					// (eg searchName="?", startTag.name="?abc" -> accept, but searchName="?a", startTag.name="?abc" -> reject)
					// The only exception to this is if the last character of the search name is a colon (which also forms part of
					// the name), but signifies that we want to search on the entire namespace.
					// (eg searchName="o:", startTag.name="o:p" -> accept)
					final char lastSearchNameChar=searchName.charAt(searchName.length()-1);
					if (lastSearchNameChar!=':' && isXMLNameChar(lastSearchNameChar)) {
						continue;
					}
				}
				return candidate;
			} while ((begin+=1)<source.end);
		} catch (IndexOutOfBoundsException ignored) {
			// This should only happen when the end of file is reached in the middle of a tag.
			// Nothing needs to be done to handle it as there are no more tags anyway.
		}
		return null;
	}

	/**
	 * Returns the closest start tag found by searching backwards from {@code pos}.
	 * The tag returned by {@code Tag.getPreviousTag} is used directly if it is a start tag;
	 * otherwise the search continues from that tag via {@code getPreviousStartTag()}.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search backwards
	 * @return the previous start tag, or {@code null} if no tag was found
	 */
	static StartTag getPrevious(final Source source, int pos) {
		final Tag previousTag=Tag.getPreviousTag(source,pos);
		if (previousTag instanceof StartTag) {
			return (StartTag)previousTag;
		}
		return previousTag==null ? null : previousTag.getPreviousStartTag();
	}

	/**
	 * Returns the closest start tag found by searching forwards from {@code pos}.
	 * The tag returned by {@code Tag.getNextTag} is used directly if it is a start tag;
	 * otherwise the search continues from that tag via {@code getNextStartTag()}.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search forwards
	 * @return the next start tag, or {@code null} if no tag was found
	 */
	static StartTag getNext(final Source source, int pos) {
		final Tag nextTag=Tag.getNextTag(source,pos);
		if (nextTag instanceof StartTag) {
			return (StartTag)nextTag;
		}
		return nextTag==null ? null : nextTag.getNextStartTag();
	}

	/**
	 * Searches forwards from {@code pos} for a start tag that has an attribute with the given name
	 * and value.
	 * <p>
	 * The search works by locating occurrences of either the attribute value or the attribute name
	 * in the parse text, then inspecting the start tag that encloses each occurrence. Only tags
	 * that begin at or after {@code pos} are considered.
	 *
	 * @param source the source document to search
	 * @param pos the position from which to search forwards
	 * @param attributeName the attribute name to match; must not be zero length
	 * @param value the attribute value to match; must not be {@code null}
	 * @param valueCaseSensitive whether the value must match exactly; if {@code false}, a value
	 *        differing only in case is also accepted
	 * @return the next matching start tag, or {@code null} if there is none
	 * @throws IllegalArgumentException if {@code value} is {@code null} or {@code attributeName} is zero length
	 */
	static StartTag getNext(final Source source, final int pos, final String attributeName, final String value, final boolean valueCaseSensitive) {
		if (value==null || attributeName.length()==0) throw new IllegalArgumentException();
		// Determine whether to perform the text search on the name or value:
		// - perform the text search on the value if it is >= 3 chars long.
		// - have to perform the text search on the name if the value is zero length.
		// - perform the text search on the name if the name >= 3 chars long, otherwise on the value.
		final boolean searchOnValue=value.length()>=3 || (value.length()>0 && attributeName.length()<3);
		final String searchString=searchOnValue ? value : attributeName;
		// Smallest possible offset of the search text from the start of a following tag:
		// an attribute value needs at least "<a b=" in front of it (5 chars), whereas an
		// attribute name only needs "<a " (3 chars).  Skipping 5 chars while searching on the
		// name would step over a match in a tag that immediately follows, e.g. <a id="">.
		final int minOffsetInNextTag=searchOnValue ? 5 : 3;
		final char[] searchCharArray=searchString.toLowerCase().toCharArray();
		final ParseText parseText=source.getParseText();
		int searchPos=pos;
		while (searchPos<source.end) {
			searchPos=parseText.indexOf(searchCharArray,searchPos);
			if (searchPos==-1) return null;
			final Tag tag=source.getEnclosingTag(searchPos);
			if (!(tag instanceof StartTag)) {
				// Occurrence is not inside a start tag (instanceof is also false for null).
				searchPos++;
				continue;
			}
			if (tag.begin>=pos) {
				final StartTag startTag=(StartTag)tag;
				if (startTag.getAttributes()!=null) {
					final String attributeValue=startTag.getAttributes().getValue(attributeName);
					if (attributeValue!=null) {
						if (value.equals(attributeValue)) return startTag;
						if (value.equalsIgnoreCase(attributeValue)) {
							if (!valueCaseSensitive) return startTag;
							if (source.logger.isInfoEnabled()) source.logger.info(source.getRowColumnVector(searchPos).appendTo(new StringBuilder(200)).append(": StartTag with attribute ").append(attributeName).append("=\"").append(attributeValue).append("\" ignored during search because its case does not match search value \"").append(value).append('"').toString());
						}
					}
				}
			}
			// This tag has been fully dealt with; resume at the earliest position at which the
			// search text could occur inside a following tag.
			searchPos=tag.end+minOffsetInNextTag;
		}
		return null;
	}

	/**
	 * Finds the end tag matching this start tag, given the first candidate end tag after it.
	 * The next start tag with the same name and type is looked up first (skipping syntactical
	 * empty-element tags when {@code checkForEmptyElementTag} is set), then the work is delegated
	 * to the five-argument {@code getEndTag}.
	 *
	 * @param nextEndTag the first candidate end tag following this start tag; must not be {@code null}
	 * @param checkForEmptyElementTag whether same-name start tags that are syntactical
	 *        empty-element tags should be ignored
	 * @param isXMLTagName passed through to the start tag search
	 * @return the result of the five-argument {@code getEndTag}
	 */
	private Segment[] getEndTag(final EndTag nextEndTag, final boolean checkForEmptyElementTag, final boolean isXMLTagName) {
		assert nextEndTag!=null;
		StartTag followingStartTag=getNext(source,end,name,startTagType,isXMLTagName);
		while (checkForEmptyElementTag && followingStartTag!=null && followingStartTag.isSyntacticalEmptyElementTag()) {
			// Empty-element tags do not open an element that needs closing, so step past them.
			followingStartTag=getNext(source,followingStartTag.end,name,startTagType,isXMLTagName);
		}
		return getEndTag(end,followingStartTag,nextEndTag,checkForEmptyElementTag,isXMLTagName);
	}

	/**
	 * Resolves the end tag matching this start tag, stepping over same-name elements that open
	 * before the candidate end tag.
	 * <p>
	 * Implemented as a loop rather than by calling itself for each interloping element, so the
	 * call stack does not grow with the number of same-name sibling elements that precede the
	 * matching end tag. Nested elements are still resolved through the three-argument
	 * {@code getEndTag} of the interloping start tag.
	 *
	 * @param afterPos not read by this method; retained so the signature is unchanged
	 * @param nextStartTag the next start tag of the same name, or {@code null} if none exists
	 * @param nextEndTag the next candidate end tag, or {@code null} if none exists
	 * @param checkForEmptyElementTag passed through when resolving interloping start tags
	 * @param isXMLTagName passed through when resolving interloping start tags
	 * @return {@code null} if no matching end tag exists in the rest of the file, otherwise a two
	 *         element array: first the matching end tag (always present), second the next start
	 *         tag of the same name found by the search ({@code null} if none exists)
	 */
	private Segment[] getEndTag(final int afterPos, final StartTag nextStartTag, final EndTag nextEndTag, final boolean checkForEmptyElementTag, final boolean isXMLTagName) {
		StartTag candidateStartTag=nextStartTag;
		EndTag candidateEndTag=nextEndTag;
		while (true) {
			if (candidateEndTag==null) return null;  // no end tag in the rest of the file
			if (candidateStartTag==null || candidateStartTag.begin>candidateEndTag.begin) {
				// No more start tags of the same name in the rest of the file, or they occur after the
				// candidate end tag.  This means the candidate is the matching end tag.
				return new Segment[] {candidateEndTag,candidateStartTag};
			}
			// Get the matching end tag of the interloping start tag.
			final Segment[] interloperResult=candidateStartTag.getEndTag(candidateEndTag,checkForEmptyElementTag,isXMLTagName);
			if (interloperResult==null) return null;  // no end tag in the rest of the file
			final EndTag interloperEndTag=(EndTag)interloperResult[0];
			// Continue with the end tag that follows the interloping element's end tag, and the
			// next same-name start tag reported for the interloping element.
			candidateEndTag=EndTag.getNext(source,interloperEndTag.end,candidateEndTag.getName(),candidateEndTag.getEndTagType());
			candidateStartTag=(StartTag)interloperResult[1];
		}
	}
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -