📄 starttag.java
字号:
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
/**
* Represents the <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-2">start tag</a> of an
* {@linkplain Element element} in a specific {@linkplain Source source} document.
* <p>
* A start tag always has a {@linkplain #getTagType() type} that is a subclass of {@link StartTagType}, meaning that any tag
* that does <b>not</b> start with the characters '<code>&lt;/</code>' is categorised as a start tag.
* <p>
* This includes many tags which stand alone, without a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag},
* and would not intuitively be categorised as a "start tag".
* For example, an HTML {@linkplain StartTagType#COMMENT comment} is represented as a single start tag that spans the whole comment,
* and does not have an end tag at all.
* <p>
* See the <a href="StartTagType.html#field_summary">static fields</a> defined in the {@link StartTagType} class for a list of the
* <a href="TagType.html#Standard">standard</a> start tag types.
* <p>
* <code>StartTag</code> instances are obtained using one of the following methods:
* <ul>
* <li>{@link Element#getStartTag()}
* <li>{@link Tag#getNextTag()}
* <li>{@link Tag#getPreviousTag()}
* <li>{@link Source#getPreviousStartTag(int pos)}
* <li>{@link Source#getPreviousStartTag(int pos, String name)}
* <li>{@link Source#getPreviousTag(int pos)}
* <li>{@link Source#getPreviousTag(int pos, TagType)}
* <li>{@link Source#getNextStartTag(int pos)}
* <li>{@link Source#getNextStartTag(int pos, String name)}
* <li>{@link Source#getNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive)}
* <li>{@link Source#getNextTag(int pos)}
* <li>{@link Source#getNextTag(int pos, TagType)}
* <li>{@link Source#getEnclosingTag(int pos)}
* <li>{@link Source#getEnclosingTag(int pos, TagType)}
* <li>{@link Source#getTagAt(int pos)}
* <li>{@link Segment#getAllStartTags()}
* <li>{@link Segment#getAllStartTags(String name)}
* <li>{@link Segment#getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)}
* <li>{@link Segment#getAllTags()}
* <li>{@link Segment#getAllTags(TagType)}
* </ul>
* <p>
* The methods above which accept a <code>name</code> parameter are categorised as <a href="Tag.html#NamedSearch">named search</a> methods.
* <p>
* In such methods dealing with start tags, specifying an argument to the <code>name</code> parameter that ends in a
* colon (<code>:</code>) searches for all start tags in the specified XML namespace.
* <p>
* The constants defined in the {@link HTMLElementName} interface can be used directly as arguments to these <code>name</code> parameters.
* For example, <code>source.getAllStartTags(</code>{@link HTMLElementName#A}<code>)</code> is equivalent to
* <code>source.getAllStartTags("a")</code>, and gets all hyperlink start tags.
* <p>
* The {@link Tag} superclass defines a method called {@link Tag#getName() getName()} to get the name of this start tag.
* <p>
* See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-stag">start tags</a>.
*
* @see Tag
* @see Element
* @see EndTag
*/
public final class StartTag extends Tag {
// Attributes of this start tag, returned as-is by getAttributes().
// Set to null by the no-argument constructor used for the Tag.NOT_CACHED placeholder.
private final Attributes attributes;
// Type of this start tag, returned by both getStartTagType() and getTagType().
// Package-private (no access modifier); set to null by the no-argument constructor.
final StartTagType startTagType;
/**
* Constructs a new <code>StartTag</code>.
*
* @param source the {@link Source} document.
* @param begin the character position in the source document where this tag {@linkplain Segment#getBegin() begins}.
* @param end the character position in the source document where this tag {@linkplain Segment#getEnd() ends}.
* @param startTagType the {@linkplain #getStartTagType() type} of the start tag.
* @param name the {@linkplain Tag#getName() name} of the tag.
* @param attributes the {@linkplain #getAttributes() attributes} of the tag.
*/
StartTag(final Source source, final int begin, final int end, final StartTagType startTagType, final String name, final Attributes attributes) {
	// The source, the begin/end positions and the name are passed on to the Tag superclass constructor.
	super(source,begin,end,name);
	// The two remaining arguments are stored in this class's own final fields.
	this.startTagType=startTagType;
	this.attributes=attributes;
}
// only used to create Tag.NOT_CACHED
StartTag() {
	// Placeholder instance: it carries neither a tag type nor attributes.
	this.startTagType=null;
	this.attributes=null;
}
/**
* Returns the {@linkplain Element element} that is started by this start tag.
* Guaranteed not <code>null</code>.
* <p>
* <dl>
* <dt>Example 1: Elements for which the {@linkplain HTMLElements#getEndTagRequiredElementNames() end tag is required}</dt>
* <dd>
* <pre>
* 1. &lt;div&gt;
* 2.   &lt;div&gt;
* 3.     &lt;div&gt;
* 4.       &lt;div&gt;This is line 4&lt;/div&gt;
* 5.     &lt;/div&gt;
* 6.     &lt;div&gt;This is line 6&lt;/div&gt;
* 7.   &lt;/div&gt;</pre>
* <ul>
* <li>The start tag on line 1 returns an empty element spanning only the start tag.
* This is because the end tag of a <code>&lt;div&gt;</code> element is required,
* making the sample code invalid as all the end tags are matched with other start tags.
* <li>The start tag on line 2 returns an element spanning to the end of line 7.
* <li>The start tag on line 3 returns an element spanning to the end of line 5.
* <li>The start tag on line 4 returns an element spanning to the end of line 4.
* <li>The start tag on line 6 returns an element spanning to the end of line 6.
* </ul>
* <p>
* </dd>
* <dt>Example 2: Elements for which the {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional}</dt>
* <dd>
* <pre>
* 1. &lt;ul&gt;
* 2.   &lt;li&gt;item 1
* 3.   &lt;li&gt;item 2
* 4.     &lt;ul&gt;
* 5.       &lt;li&gt;subitem 1&lt;/li&gt;
* 6.       &lt;li&gt;subitem 2
* 7.     &lt;/ul&gt;
* 8.   &lt;li&gt;item 3&lt;/li&gt;
* 9. &lt;/ul&gt;</pre>
* <ul>
* <li>The start tag on line 1 returns an element spanning to the end of line 9.
* <li>The start tag on line 2 returns an element spanning to the start of the <code>&lt;li&gt;</code> start tag on line 3.
* <li>The start tag on line 3 returns an element spanning to the start of the <code>&lt;li&gt;</code> start tag on line 8.
* <li>The start tag on line 4 returns an element spanning to the end of line 7.
* <li>The start tag on line 5 returns an element spanning to the end of line 5.
* <li>The start tag on line 6 returns an element spanning to the start of the <code>&lt;/ul&gt;</code> end tag on line 7.
* <li>The start tag on line 8 returns an element spanning to the end of line 8.
* </ul>
* </dd>
* </dl>
*
* @return the {@linkplain Element element} that is started by this start tag.
*/
public Element getElement() {
	// An element that has already been resolved is returned straight from the cache field.
	if (element!=Element.NOT_CACHED) return element;

	final EndTag terminator=getEndTagInternal();
	element=new Element(source,this,terminator);

	// Without an end tag there is no back-reference to record.
	if (terminator==null) return element;

	// An end tag that already has a cached element is presumably impossible, except in certain circumstances
	// where the cache was cleared, such as if the parser decides to do a full sequential parse after some tags
	// have already been found. If the existing element and the new one differ, log it.
	final boolean terminatorAlreadyLinked=terminator.element!=Element.NOT_CACHED;
	if (terminatorAlreadyLinked && source.logger.isInfoEnabled() && !element.equals(terminator.element)) {
		source.logger.info(
			source.getRowColumnVector(terminator.begin)
				.appendTo(new StringBuilder(200).append("End tag ").append(terminator).append(" at "))
				.append(" terminates more than one element")
				.toString()
		);
	}

	// Link the end tag to the new element, overwriting any element it had cached before.
	terminator.element=element;
	return element;
}
/**
* Indicates whether this start tag is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
* <p>
* This property checks that the tag is {@linkplain #isSyntacticalEmptyElementTag() syntactically an empty-element tag},
* but in addition checks that the {@linkplain #getName() name} of the tag is not one that is defined in the HTML specification to have a
* {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag,
* which the major browsers do not recognise as empty-element tags, even in an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document.
* <p>
* This is equivalent to:<br />
* {@link #isSyntacticalEmptyElementTag()}<code> &amp;&amp; !(</code>{@link HTMLElements#getEndTagOptionalElementNames()}<code>.contains(</code>{@link #getName() getName()}<code>) || </code>{@link HTMLElements#getEndTagRequiredElementNames()}<code>.contains(</code>{@link #getName() getName()}<code>))</code>.
* <p>
* Prior to Version 2.6, the implementation of this method was equivalent to {@link #isSyntacticalEmptyElementTag()}.
*
* @return <code>true</code> if this start tag is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
*/
public boolean isEmptyElementTag() {
	// If the tag lacks the empty-element syntax, the name check is never performed.
	if (!isSyntacticalEmptyElementTag()) return false;
	// The syntax alone is not enough: the result is true only when the closing slash is not ignored for this tag name.
	return !HTMLElements.isClosingSlashIgnored(name);
}
/**
* Indicates whether this start tag is syntactically an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
* <p>
* This is signified by the characters "/&gt;" at the end of the start tag.
* <p>
* Only a {@linkplain StartTagType#NORMAL normal} start tag can be syntactically an empty-element tag.
* <p>
* This property simply reports whether the syntax of the start tag is consistent with that of an empty-element tag,
* it does not guarantee that this start tag's {@linkplain #getElement() element} is actually {@linkplain Element#isEmpty() empty}.
* <p>
* This possible discrepancy reflects the way major browsers interpret illegal empty element tags used in
* <a href="HTMLElements.html#HTMLElement">HTML elements</a>, and is explained further in the documentation of the
* {@link #isEmptyElementTag()} property.
*
* @return <code>true</code> if this start tag is syntactically an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
* @see #isEmptyElementTag()
*/
public boolean isSyntacticalEmptyElementTag() {
	// Any start tag type other than NORMAL is rejected before the source text is inspected.
	if (startTagType!=StartTagType.NORMAL) return false;
	// Check the character two positions before the end of the tag for the slash.
	return source.charAt(end-2)=='/';
}
/**
* Returns the {@linkplain StartTagType type} of this start tag.
* <p>
* This is equivalent to <code>(StartTagType)</code>{@link #getTagType()}.
*
* @return the {@linkplain StartTagType type} of this start tag.
*/
public StartTagType getStartTagType() {
	// Plain accessor for the final field assigned in the constructor.
	return this.startTagType;
}
// Documentation inherited from Tag
public TagType getTagType() {
	// Returns the same field as getStartTagType(), declared here with the TagType return type.
	return this.startTagType;
}
/**
* Returns the attributes specified in this start tag.
* <p>
* Return value is not <code>null</code> if and only if
* {@link #getStartTagType()}<code>.</code>{@link StartTagType#hasAttributes() hasAttributes()}<code>==true</code>.
* <p>
* To force the parsing of attributes in other start tag types, use the {@link #parseAttributes()} method instead.
*
* @return the attributes specified in this start tag, or <code>null</code> if the {@linkplain #getStartTagType() type} of this start tag does not {@linkplain StartTagType#hasAttributes() have attributes}.
* @see #parseAttributes()
* @see Source#parseAttributes(int pos, int maxEnd)
*/
public Attributes getAttributes() {
	// Returns the stored reference unchanged; it may be null, and no parsing is triggered here.
	return this.attributes;
}
/**
* Returns the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name (case insensitive).
* <p>
* Returns <code>null</code> if this start tag does not {@linkplain StartTagType#hasAttributes() have attributes},
* no attribute with the specified name exists or the attribute {@linkplain Attribute#hasValue() has no value}.
* <p>
* This is equivalent to {@link #getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue(attributeName)},
* except that it returns <code>null</code> if this start tag does not have attributes instead of throwing a
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -