📄 starttag.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;

/**
 * Represents the <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-2">start tag</a> of an 
 * {@linkplain Element element} in a specific {@linkplain Source source} document.
 * <p>
 * A start tag always has a {@linkplain #getTagType() type} that is a subclass of {@link StartTagType}, meaning that any tag
 * that does <b>not</b> start with the characters '<code>&lt;/</code>' is categorised as a start tag.
 * <p>
 * This includes many tags which stand alone, without a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag},
 * and would not intuitively be categorised as a "start tag".
 * For example, an HTML {@linkplain StartTagType#COMMENT comment} is represented as a single start tag that spans the whole comment,
 * and does not have an end tag at all.
 * <p>
 * See the <a href="StartTagType.html#field_summary">static fields</a> defined in the {@link StartTagType} class for a list of the 
 * <a href="TagType.html#Standard">standard</a> start tag types.
 * <p>
 * <code>StartTag</code> instances are obtained using one of the following methods:
 * <ul>
 *  <li>{@link Element#getStartTag()}
 *  <li>{@link Tag#getNextTag()}
 *  <li>{@link Tag#getPreviousTag()}
 *  <li>{@link Source#getPreviousStartTag(int pos)}
 *  <li>{@link Source#getPreviousStartTag(int pos, String name)}
 *  <li>{@link Source#getPreviousTag(int pos)}
 *  <li>{@link Source#getPreviousTag(int pos, TagType)}
 *  <li>{@link Source#getNextStartTag(int pos)}
 *  <li>{@link Source#getNextStartTag(int pos, String name)}
 *  <li>{@link Source#getNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive)}
 *  <li>{@link Source#getNextTag(int pos)}
 *  <li>{@link Source#getNextTag(int pos, TagType)}
 *  <li>{@link Source#getEnclosingTag(int pos)}
 *  <li>{@link Source#getEnclosingTag(int pos, TagType)}
 *  <li>{@link Source#getTagAt(int pos)}
 *  <li>{@link Segment#getAllStartTags()}
 *  <li>{@link Segment#getAllStartTags(String name)}
 *  <li>{@link Segment#getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)}
 *  <li>{@link Segment#getAllTags()}
 *  <li>{@link Segment#getAllTags(TagType)}
 * </ul>
 * <p>
 * The methods above which accept a <code>name</code> parameter are categorised as <a href="Tag.html#NamedSearch">named search</a> methods.
 * <p>
 * In such methods dealing with start tags, specifying an argument to the <code>name</code> parameter that ends in a
 * colon (<code>:</code>) searches for all start tags in the specified XML namespace.
 * <p>
 * The constants defined in the {@link HTMLElementName} interface can be used directly as arguments to these <code>name</code> parameters.
 * For example, <code>source.getAllStartTags(</code>{@link HTMLElementName#A}<code>)</code> is equivalent to
 * <code>source.getAllStartTags("a")</code>, and gets all hyperlink start tags.
 * <p>
 * The {@link Tag} superclass defines a method called {@link Tag#getName() getName()} to get the name of this start tag.
 * <p>
 * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-stag">start tags</a>.
 *
 * @see Tag
 * @see Element
 * @see EndTag
 */
public final class StartTag extends Tag {
	// Attributes of this start tag, returned unchanged by getAttributes().
	// Set to null by the no-argument constructor.
	private final Attributes attributes;
	// Type of this start tag; returned by both getStartTagType() and getTagType().
	// Declared without an access modifier, so it is visible package-wide.
	final StartTagType startTagType;

	/**
	 * Constructs a new <code>StartTag</code>.
	 *
	 * @param source  the {@link Source} document.
	 * @param begin  the character position in the source document where this tag {@linkplain Segment#getBegin() begins}.
	 * @param end  the character position in the source document where this tag {@linkplain Segment#getEnd() ends}.
	 * @param startTagType  the {@linkplain #getStartTagType() type} of the start tag.
	 * @param name  the {@linkplain Tag#getName() name} of the tag.
	 * @param attributes  the {@linkplain #getAttributes() attributes} of the tag.
	 */
	StartTag(final Source source, final int begin, final int end, final StartTagType startTagType, final String name, final Attributes attributes) {
		// Source, span and name are handed to the Tag superclass constructor.
		super(source, begin, end, name);
		// Start-tag specific state; both fields are final and assigned exactly once here.
		this.startTagType = startTagType;
		this.attributes = attributes;
	}

	// only used to create Tag.NOT_CACHED
	StartTag() {
		// Placeholder instance: it carries neither a tag type nor attributes.
		this.startTagType = null;
		this.attributes = null;
	}

	/**
	 * Returns the {@linkplain Element element} that is started by this start tag.
	 * Guaranteed not <code>null</code>.
 	 * <p>
	 * <dl>
	 *  <dt>Example 1: Elements for which the {@linkplain HTMLElements#getEndTagRequiredElementNames() end tag is required}</dt>
	 *  <dd>
	 *   <pre>
	 *    1. &lt;div&gt;
	 *    2.   &lt;div&gt;
	 *    3.     &lt;div&gt;
	 *    4.       &lt;div&gt;This is line 4&lt;/div&gt;
	 *    5.     &lt;/div&gt;
	 *    6.     &lt;div&gt;This is line 6&lt;/div&gt;
	 *    7.   &lt;/div&gt;</pre>
	 *   <ul>
	 *    <li>The start tag on line 1 returns an empty element spanning only the start tag.
	 *     This is because the end tag of a <code>&lt;div&gt;</code> element is required,
	 *     making the sample code invalid as all the end tags are matched with other start tags.
	 *    <li>The start tag on line 2 returns an element spanning to the end of line 7.
	 *    <li>The start tag on line 3 returns an element spanning to the end of line 5.
	 *    <li>The start tag on line 4 returns an element spanning to the end of line 4.
	 *    <li>The start tag on line 6 returns an element spanning to the end of line 6.
	 *   </ul>
	 *   <p>
	 *  </dd>
	 *  <dt>Example 2: Elements for which the {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional}</dt>
	 *  <dd>
	 *   <pre>
	 *    1. &lt;ul&gt;
	 *    2.   &lt;li&gt;item 1
	 *    3.   &lt;li&gt;item 2
	 *    4.     &lt;ul&gt;
	 *    5.       &lt;li&gt;subitem 1&lt;/li&gt;
	 *    6.       &lt;li&gt;subitem 2
	 *    7.     &lt;/ul&gt;
	 *    8.   &lt;li&gt;item 3&lt;/li&gt;
	 *    9. &lt;/ul&gt;</pre>
	 *   <ul>
	 *    <li>The start tag on line 1 returns an element spanning to the end of line 9.
	 *    <li>The start tag on line 2 returns an element spanning to the start of the <code>&lt;li&gt;</code> start tag on line 3.
	 *    <li>The start tag on line 3 returns an element spanning to the start of the <code>&lt;li&gt;</code> start tag on line 8.
	 *    <li>The start tag on line 4 returns an element spanning to the end of line 7.
	 *    <li>The start tag on line 5 returns an element spanning to the end of line 5.
	 *    <li>The start tag on line 6 returns an element spanning to the start of the <code>&lt;/ul&gt;</code> end tag on line 7.
	 *    <li>The start tag on line 8 returns an element spanning to the end of line 8.
	 *   </ul>
	 *  </dd>
	 * </dl>
	 *
	 * @return the {@linkplain Element element} that is started by this start tag.
	 */
	public Element getElement() {
		// Fast path: the element was already built and cached by an earlier call.
		if (element!=Element.NOT_CACHED) return element;

		final EndTag closer=getEndTagInternal();
		element=new Element(source,this,closer);

		// No end tag was found, so there is nothing to link back to the new element.
		if (closer==null) return element;

		// The end tag already points at an element. This is presumably impossible, except in certain
		// circumstances where the cache was cleared, such as if the parser decides to do a full
		// sequential parse after some tags have already been found.
		// Log it only when the existing element differs from the one just built.
		if (closer.element!=Element.NOT_CACHED
				&& source.logger.isInfoEnabled()
				&& !element.equals(closer.element)) {
			final String message=source.getRowColumnVector(closer.begin)
				.appendTo(new StringBuilder(200).append("End tag ").append(closer).append(" at "))
				.append(" terminates more than one element")
				.toString();
			source.logger.info(message);
		}

		// Whether or not a conflict was logged, the end tag now refers to the new element.
		closer.element=element;
		return element;
	}

	/**
	 * Indicates whether this start tag is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
	 * <p>
	 * This property checks that the tag is {@linkplain #isSyntacticalEmptyElementTag() syntactically an empty-element tag},
	 * but in addition checks that the {@linkplain #getName() name} of the tag is not one that is defined in the HTML specification to have a
	 * {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag,
	 * which the major browsers do not recognise as empty-element tags, even in an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document.
	 * <p>
	 * This is equivalent to:<br />
	 * {@link #isSyntacticalEmptyElementTag()}<code> &amp;&amp; !(</code>{@link HTMLElements#getEndTagOptionalElementNames()}<code>.contains(</code>{@link #getName() getName()}<code>) || </code>{@link HTMLElements#getEndTagRequiredElementNames()}<code>.contains(</code>{@link #getName() getName()}<code>))</code>.
	 * <p>
	 * Prior to Version 2.6, the implementation of this method was equivalent to {@link #isSyntacticalEmptyElementTag()}.
	 *
	 * @return <code>true</code> if this start tag is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
	 */
	public boolean isEmptyElementTag() {
		// A tag that is not even syntactically an empty-element tag can never qualify.
		if (!isSyntacticalEmptyElementTag()) return false;
		// The closing slash only counts for names where HTMLElements does not ignore it.
		return !HTMLElements.isClosingSlashIgnored(name);
	}

	/**
	 * Indicates whether this start tag is syntactically an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
	 * <p>
	 * This is signified by the characters "/&gt;" at the end of the start tag.
	 * <p>
	 * Only a {@linkplain StartTagType#NORMAL normal} start tag can be syntactically an empty-element tag.
	 * <p>
	 * This property simply reports whether the syntax of the start tag is consistent with that of an empty-element tag,
	 * it does not guarantee that this start tag's {@linkplain #getElement() element} is actually {@linkplain Element#isEmpty() empty}.
	 * <p>
	 * This possible discrepancy reflects the way major browsers interpret illegal empty element tags used in
	 * <a href="HTMLElements.html#HTMLElement">HTML elements</a>, and is explained further in the documentation of the
	 * {@link #isEmptyElementTag()} property.
	 *
	 * @return <code>true</code> if this start tag is syntactically an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
	 * @see #isEmptyElementTag()
	 */
	public boolean isSyntacticalEmptyElementTag() {
		// Only start tags of the NORMAL type are eligible.
		if (startTagType!=StartTagType.NORMAL) return false;
		// An empty-element tag ends in "/>", which puts the slash at position end-2.
		return source.charAt(end-2)=='/';
	}

	/**
	 * Returns the {@linkplain StartTagType type} of this start tag.	
	 * <p>
	 * This is equivalent to <code>(StartTagType)</code>{@link #getTagType()}.
	 *
	 * @return the {@linkplain StartTagType type} of this start tag.	
	 */
	public StartTagType getStartTagType() {
		// Same field that getTagType() returns, exposed here under its more specific static type.
		return this.startTagType;
	}

	// Documentation inherited from Tag
	@Override
	public TagType getTagType() {
		// Returns the same field as getStartTagType(), typed as the more general TagType.
		// @Override makes the compiler verify that this still matches the method declared in Tag.
		return startTagType;
	}

	/**
	 * Returns the attributes specified in this start tag.
	 * <p>
	 * Return value is not <code>null</code> if and only if
	 * {@link #getStartTagType()}<code>.</code>{@link StartTagType#hasAttributes() hasAttributes()}<code>==true</code>.
	 * <p>
	 * To force the parsing of attributes in other start tag types, use the {@link #parseAttributes()} method instead.
	 *
	 * @return the attributes specified in this start tag, or <code>null</code> if the {@linkplain #getStartTagType() type} of this start tag does not {@linkplain StartTagType#hasAttributes() have attributes}.
	 * @see #parseAttributes()
	 * @see Source#parseAttributes(int pos, int maxEnd)
	 */
	public Attributes getAttributes() {
		// Hands back the reference stored at construction time; it may be null.
		return this.attributes;
	}

	/**
	 * Returns the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name (case insensitive).
	 * <p>
	 * Returns <code>null</code> if this start tag does not {@linkplain StartTagType#hasAttributes() have attributes},
	 * no attribute with the specified name exists or the attribute {@linkplain Attribute#hasValue() has no value}.
	 * <p>
	 * This is equivalent to {@link #getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue(attributeName)},
	 * except that it returns <code>null</code> if this start tag does not have attributes instead of throwing a
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -