📄 attributes.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import net.htmlparser.jericho.nodoc.*;
import java.util.*;
import java.io.*;

/**
 * Represents the list of {@link Attribute} objects present within a particular {@link StartTag}.
 * <p>
 * This segment starts at the end of the start tag's {@linkplain StartTag#getName() name}
 * and ends at the end of the last attribute.
 * <p>
 * The attributes in this list are a representation of those found in the source document and are not modifiable.
 * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods
 * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}.
 * <p>
 * Any {@linkplain TagType#isServerTag() server tags} encountered inside the attributes area of a non-server tag
 * do not interfere with the parsing of the attributes.
 * <p>
 * If too many syntax errors are encountered while parsing a start tag's attributes, the parser rejects the entire start tag
 * and generates a {@linkplain Source#getLogger() log} entry.
 * The threshold for the number of errors allowed can be set using the {@link #setDefaultMaxErrorCount(int)} static method.
 * <p>
 * Obtained using the {@link StartTag#getAttributes()} method, or explicitly using the {@link Source#parseAttributes(int pos, int maxEnd)} method.
 * <p>
 * It is common for instances of this class to contain no attributes.
 * <p>
 * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-attr">attributes</a>.
 *
 * @see StartTag
 * @see Attribute
 */
public final class Attributes extends SequentialListSegment<Attribute> {
	// The parsed attributes, in the order the parser appended them while scanning the source.
	private final LinkedList<Attribute> attributeList; // never null
	
	// Set by the parser when it skips over a server tag at a point where it is neither inside
	// an attribute value nor about to start one.
	final boolean containsServerTagOutsideOfAttributeValue;

	/** States of the character-by-character attribute parser implemented by the private construct method. */
	private enum ParsingState {
		AFTER_TAG_NAME, // initial state when parsing begins directly after the name of a start tag
		BETWEEN_ATTRIBUTES, // skipping whitespace; the next non-whitespace character begins an attribute name
		IN_NAME, // reading an attribute name, which ends at '=', whitespace or the terminating character
		AFTER_NAME, // this only happens if an attribute name is followed by whitespace
		START_VALUE, // an '=' sign has been read; waiting for the opening quote or first character of the value
		IN_VALUE, // reading a quoted or unquoted attribute value
		AFTER_VALUE_FINAL_QUOTE // NOTE(review): not referenced in the visible part of this file - confirm whether it is still used
	}

	// Error threshold used by the construct overload that is not given an explicit maxErrorCount.
	private static int defaultMaxErrorCount=2; // maximum number of minor errors that can be encountered in attributes before the entire start tag is rejected.

	/**
	 * Creates the segment covering the attribute area of a start tag.
	 * The supplied list is stored as-is (no copy is made).
	 *
	 * @param source  the source document containing the attributes.
	 * @param begin  the position at which the attribute area begins.
	 * @param end  the position at which the attribute area ends.
	 * @param attributeList  the parsed attributes, must not be null.
	 * @param containsServerTagOutsideOfAttributeValue  whether a server tag was found outside of any attribute value.
	 */
	private Attributes(final Source source, final int begin, final int end, final LinkedList<Attribute> attributeList, final boolean containsServerTagOutsideOfAttributeValue) {
		super(source,begin,end);
		this.containsServerTagOutsideOfAttributeValue=containsServerTagOutsideOfAttributeValue;
		this.attributeList=attributeList;
	}

	/**
	 * Parses the attributes that follow the name of a start tag, using the default error threshold.
	 * The start of the attribute area is calculated from the tag name and no maximum end position is imposed.
	 * <p>
	 * Called from StartTagType.parseAttributes(Source, int startTagBegin, String tagName).
	 *
	 * @return the parsed attributes, or null if the parser rejects them.
	 */
	static Attributes construct(final Source source, final int startTagBegin, final StartTagType startTagType, final String tagName) {
		final int calculateAttributesBegin=-1; // -1 asks the parser to derive the position from the tag name
		final int noMaxEnd=-1; // -1 means there is no maximum end position
		return construct(source,"StartTag",ParsingState.AFTER_TAG_NAME,startTagBegin,calculateAttributesBegin,noMaxEnd,startTagType,tagName,defaultMaxErrorCount);
	}

	/**
	 * Parses the attributes of an existing start tag with a caller-supplied error threshold,
	 * starting in the state where the next non-whitespace character begins an attribute name.
	 * <p>
	 * Called from StartTag.parseAttributes(int maxErrorCount).
	 *
	 * @return the parsed attributes, or null if the parser rejects them.
	 */
	static Attributes construct(final Source source, final int startTagBegin, final int attributesBegin, final int maxEnd, final StartTagType startTagType, final String tagName, final int maxErrorCount) {
		final String logType="Attributes for StartTag";
		final ParsingState initialState=ParsingState.BETWEEN_ATTRIBUTES;
		return construct(source,logType,initialState,startTagBegin,attributesBegin,maxEnd,startTagType,tagName,maxErrorCount);
	}

	/**
	 * Parses attributes directly from a position in the source, without an enclosing start tag.
	 * The attributes are treated as belonging to a normal start tag type with no tag name.
	 * <p>
	 * Called from Source.parseAttributes(int pos, int maxEnd, int maxErrorCount).
	 *
	 * @return the parsed attributes, or null if the parser rejects them.
	 */
	static Attributes construct(final Source source, final int begin, final int maxEnd, final int maxErrorCount) {
		final String noTagName=null; // no enclosing start tag
		final int attributesBeginPlaceholder=-1; // replaced by 'begin' inside the parser because there is no tag name
		return construct(source,"Attributes",ParsingState.BETWEEN_ATTRIBUTES,begin,attributesBeginPlaceholder,maxEnd,StartTagType.NORMAL,noTagName,maxErrorCount);
	}

	/**
	 * Any &lt; character found within the start tag is treated as though it is part of the attribute
	 * list, which is consistent with the way IE treats it.
	 * @param logBegin  the position of the beginning of the object being searched (for logging)
	 * @param attributesBegin  the position of the beginning of the attribute list, or -1 if it should be calculated automatically from logBegin.
	 * @param maxEnd  the position at which the attributes must end if a terminating character is not found, or -1 if no maximum.
	 * @param tagName  the name of the enclosing StartTag, or null if constructing attributes directly.
	 */
	private static Attributes construct(final Source source, final String logType, ParsingState parsingState, final int logBegin, int attributesBegin, final int maxEnd, final StartTagType startTagType, final String tagName, final int maxErrorCount) {
		boolean isClosingSlashIgnored=false;
		if (tagName!=null) {
			// 'logBegin' parameter is the start of the associated start tag
			if (attributesBegin==-1) attributesBegin=logBegin+1+tagName.length();
			if (startTagType==StartTagType.NORMAL && HTMLElements.isClosingSlashIgnored(tagName)) isClosingSlashIgnored=true;
		} else {
			attributesBegin=logBegin;
		}
		int attributesEnd=attributesBegin;
		final LinkedList<Attribute> attributeList=new LinkedList<Attribute>();
		boolean containsServerTagOutsideOfAttributeValue=false;
		final ParseText parseText=source.getParseText();
		int i=attributesBegin;
		char quote=' ';
		Segment nameSegment=null;
		String key=null;
		int currentBegin=-1;
		boolean isTerminatingCharacter=false;
		int errorCount=0;
		try {
			while (!isTerminatingCharacter) {
				if (i==maxEnd || startTagType.atEndOfAttributes(source,i,isClosingSlashIgnored)) isTerminatingCharacter=true;
				final char ch=parseText.charAt(i);
				// First check if there is a server tag in this position:
				if (ch=='<') {
					final Tag interlopingTag=Tag.getTagAt(source,i,true); // search for server tags only
					if (interlopingTag!=null) {
						// There is a server tag in this position. Skip over it:
						if (parsingState==ParsingState.START_VALUE) {
							currentBegin=i;
							quote=' ';
							parsingState=ParsingState.IN_VALUE;
						}
						i=attributesEnd=interlopingTag.end;
						if (parsingState!=ParsingState.IN_VALUE) containsServerTagOutsideOfAttributeValue=true;
						continue;
					}
				}
				// There is no server tag in this position. Now we can parse the attributes:
				switch (parsingState) {
					case IN_VALUE:
						if (isTerminatingCharacter || ch==quote || (quote==' ' && isWhiteSpace(ch))) {
							Segment valueSegment;
							Segment valueSegmentIncludingQuotes;
							if (quote==' ') {
								valueSegment=valueSegmentIncludingQuotes=new Segment(source,currentBegin,i);
							} else {
								if (isTerminatingCharacter) {
									if (i==maxEnd) {
										if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"terminated in the middle of a quoted attribute value",i);
										if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
										valueSegment=new Segment(source,currentBegin,i);
										valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i); // this is missing the end quote
									} else {
										// don't want to terminate, only encountered a terminating character in the middle of a quoted value
										isTerminatingCharacter=false;
										break;
									}
								} else {
									valueSegment=new Segment(source,currentBegin,i);
									valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i+1);
								}
							}
							attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegmentIncludingQuotes));
							attributesEnd=valueSegmentIncludingQuotes.getEnd();
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
						} else if (ch=='<' && quote==' ') {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character in unquoted attribute value",i);
							return null;
						}
						break;
					case IN_NAME:
						if (isTerminatingCharacter || ch=='=' || isWhiteSpace(ch)) {
							nameSegment=new Segment(source,currentBegin,i);
							key=nameSegment.toString().toLowerCase();
							if (isTerminatingCharacter) {
								attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value
								attributesEnd=i;
							} else {
								parsingState=(ch=='=' ? ParsingState.START_VALUE : ParsingState.AFTER_NAME);
							}
						} else if (!Tag.isXMLNameChar(ch)) {
							// invalid character detected in attribute name.
							if (ch=='<') {
								if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character in attribute name",i);
								return null;
							}
							if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break;
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"contains attribute name with invalid character",i);
							if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
						}
						break;
					case AFTER_NAME:
						// attribute name has been followed by whitespace, but may still be followed by an '=' character.
						if (isTerminatingCharacter || !(ch=='=' || isWhiteSpace(ch))) {
							attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value
							attributesEnd=nameSegment.getEnd();
							if (isTerminatingCharacter) break;
							// The current character is the first character of an attribute name
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
							i--; // want to reparse the same character again, so decrement i.  Note we could instead just fall into the next case statement without a break, but such code is always discouraged.
						} else if (ch=='=') {
							parsingState=ParsingState.START_VALUE;
						} else if (ch=='<') {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character after attribute name",i);
							return null;
						}
						break;
					case BETWEEN_ATTRIBUTES:
						if (!isTerminatingCharacter) {
							// the quote variable is used here to make sure whitespace has come after the last quoted attribute value
							if (isWhiteSpace(ch)) {
								quote=' ';
							} else {
								if (quote!=' ') {
									if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"has missing whitespace after quoted attribute value",i);
									// only count this as an error if there have already been other errors, otherwise allow unlimited errors of this type.
									if (errorCount>0 && reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
								}
								if (!Tag.isXMLNameStartChar(ch)) {
									// invalid character detected as first character of attribute name.
									if (ch=='<') {
										if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character",i);
										return null;
									}
									if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break;
									if (startTagType==StartTagType.NORMAL && startTagType.atEndOfAttributes(source,i,false)) {
										// This checks whether we've found the characters "/>" but it wasn't recognised as the closing delimiter because isClosingSlashIgnored is true.
										if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"contains a '/' character before the closing '>', which is ignored because tags of this name cannot be empty-element tags");
										break;
									}
									if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"contains attribute name with invalid first character",i);
									if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
								}
								parsingState=ParsingState.IN_NAME;
								currentBegin=i;
							}
						}
						break;
					case START_VALUE:
						currentBegin=i;
						if (isTerminatingCharacter) {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"has missing attribute value after '=' sign",i);
							// only count this as an error if there have already been other errors, otherwise allow unlimited errors of this type.
							if (errorCount>0 && reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null;
							final Segment valueSegment=new Segment(source,i,i);
							attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegment));
							attributesEnd=i;
							parsingState=ParsingState.BETWEEN_ATTRIBUTES;
							break;
						}
						if (ch=='\'' || ch=='"') {
							quote=ch;
							currentBegin++;
						} else if (isWhiteSpace(ch)) {
							break; // just ignore whitespace after the '=' sign as nearly all browsers do.
						} else if (ch=='<') {
							if (source.logger.isInfoEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character at the start of an attribute value",i);
							return null;
						} else {
							quote=' ';
						}
						parsingState=ParsingState.IN_VALUE;
						break;
					case AFTER_TAG_NAME:
						if (!isTerminatingCharacter) {
							if (!isWhiteSpace(ch)) {
								if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -