📄 segment.java
字号:
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
/**
* Represents a segment of a {@link Source} document.
* <p>
* Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
* <p>
* The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
*/
public class Segment implements Comparable<Segment>, CharSequence {
// Character position in the source document at which this segment begins, inclusive (exposed via getBegin()).
final int begin;
// Character position in the source document immediately after the end of this segment, exclusive (exposed via getEnd()).
final int end;
// The Source document containing this segment (exposed via getSource()); null only for instances built by the no-argument constructor.
final Source source;
// Contains: space, line feed, carriage return, tab, form feed and U+200B.
private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method
/**
 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
 * <p>
 * All arguments are validated before any field is assigned.
 *
 * @param source  the {@link Source} document, must not be <code>null</code>.
 * @param begin  the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
 * @param end  the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
 * @throws IllegalArgumentException if <code>begin</code> or <code>end</code> is <code>-1</code>,
 *  if <code>begin</code> is greater than <code>end</code>, or if <code>source</code> is <code>null</code>.
 */
public Segment(final Source source, final int begin, final int end) {
	// The span is checked first, exactly as before; the message now reports the rejected values.
	if (begin==-1 || end==-1 || begin>end) {
		throw new IllegalArgumentException("invalid segment span: begin="+begin+", end="+end);
	}
	if (source==null) throw new IllegalArgumentException("source argument must not be null");
	this.source=source;
	this.begin=begin;
	this.end=end;
}
// Invoked solely by the Source constructor.
// The instance under construction is its own source document and spans positions 0 (inclusive) to length (exclusive).
Segment(final int length) {
	this.begin=0;
	this.end=length;
	this.source=(Source)this; // the cast relies on this constructor only ever being reached from a Source
}
// Builds a placeholder with an empty span (0 to 0) and no source document.
// Intended purely for dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED).
Segment() {
	this.source=null;
	this.begin=0;
	this.end=0;
}
/**
 * Returns the {@link Source} document that this segment belongs to.
 *
 * @return the {@link Source} document containing this segment.
 */
public final Source getSource() {
	return this.source;
}
/**
 * Returns the position of the first character of this segment within the {@link Source} document.
 * <p>
 * The character at this position is included in the segment.
 *
 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
 */
public final int getBegin() {
	return this.begin;
}
/**
 * Returns the position in the {@link Source} document that immediately follows the last character of this segment.
 * <p>
 * The character at the returned position is <b>not</b> part of the segment.
 *
 * @return the character position in the {@link Source} document immediately after the end of this segment.
 */
public final int getEnd() {
	return this.end;
}
/**
 * Tests whether the specified object is equal to this <code>Segment</code>.
 * <p>
 * Two segments are equal if and only if they refer to the very same {@link Source} instance
 * and have identical begin and end positions. Any object that is not a <code>Segment</code>
 * (including <code>null</code>) is never equal to a segment.
 *
 * @param object  the object to be compared for equality with this <code>Segment</code>.
 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
 */
public final boolean equals(final Object object) {
	if (object==this) return true;
	// instanceof evaluates to false for null, so no separate null test is needed
	if (!(object instanceof Segment)) return false;
	final Segment other=(Segment)object;
	// the source is compared by identity, not by content
	return source==other.source && begin==other.begin && end==other.end;
}
/**
 * Returns a hash code value for the segment.
 * <p>
 * The value is currently computed as the sum of the begin and end positions.
 * Callers must not rely on this formula, as it is not guaranteed in future versions.
 *
 * @return a hash code value for the segment.
 */
public int hashCode() {
	final int positionSum=begin+end;
	return positionSum;
}
/**
 * Returns the number of characters in the segment.
 * <p>
 * This is the difference between the end position and the begin position.
 *
 * @return the length of the segment.
 */
public final int length() {
	return this.end-this.begin;
}
/**
 * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
 * <p>
 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
 * <p>
 * Only the positions are compared. Note that a segment encloses itself.
 *
 * @param segment  the segment to be tested for being enclosed by this segment.
 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
 */
public final boolean encloses(final Segment segment) {
	final boolean startsInside=segment.begin>=begin;
	final boolean endsInside=segment.end<=end;
	return startsInside && endsInside;
}
/**
 * Indicates whether the specified character position in the source document lies within this segment.
 * <p>
 * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()};
 * the begin position is included and the end position is excluded.
 *
 * @param pos  the position in the {@link Source} document.
 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
 */
public final boolean encloses(final int pos) {
	if (pos<begin) return false;
	return pos<end;
}
/**
 * Returns the text of the source document covered by this segment as a <code>String</code>.
 * <p>
 * A new <code>String</code> is created on every call to this method, unless this
 * segment is itself an instance of {@link Source}.
 *
 * @return the source text of this segment as a <code>String</code>.
 */
public String toString() {
	// extract the characters from begin (inclusive) to end (exclusive) of the source text
	return this.source.string.substring(this.begin,this.end).toString();
}
/**
 * Creates a {@link Renderer} that performs a simple rendering of the HTML markup in this segment into text.
 * <p>
 * Any number of properties may be set on the returned {@link Renderer} instance to configure the output before
 * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
 * A new instance is created on every call.
 *
 * @return an instance of {@link Renderer} based on this segment.
 * @see #getTextExtractor()
 */
public Renderer getRenderer() {
	final Renderer renderer=new Renderer(this);
	return renderer;
}
/**
 * Creates a {@link TextExtractor} that extracts the textual content from the HTML markup of this segment.
 * <p>
 * Properties may be set on the returned {@link TextExtractor} instance to configure the output before
 * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
 * A new instance is created on every call.
 *
 * @return an instance of {@link TextExtractor} based on this segment.
 * @see #getRenderer()
 */
public TextExtractor getTextExtractor() {
	final TextExtractor textExtractor=new TextExtractor(this);
	return textExtractor;
}
/**
 * Returns an iterator over every tag and text segment contained within this segment.
 * <p>
 * Every tag found in the {@link #getAllTags()} list is included in this iterator, including all {@linkplain TagType#isServerTag() server tags}.
 * <p>
 * Segments of the document between the tags are also included, so iterating yields a sequential walk-through of every "node"
 * in this segment, where a node is either a tag or a segment of text.
 * The {@linkplain #getEnd() end} position of each segment should correspond with the {@linkplain #getBegin() begin} position of the subsequent segment,
 * unless any of the tags are enclosed by other tags, which is common when {@linkplain TagType#isServerTag() server tags} are present.
 * <p>
 * The {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method can be used to retrieve the text from each text segment.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *  <dd>
 *   <p>
 *   The following code demonstrates the typical usage of this method.
 *   <p>
 *   <pre>
 * for (Iterator&lt;Segment&gt; nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) {
 *   Segment nodeSegment=nodeIterator.next();
 *   if (nodeSegment instanceof Tag) {
 *     Tag tag=(Tag)nodeSegment;
 *     if (tag.getTagType().isServerTag()) continue; // ignore server tags
 *     // Process the tag (just output it in this example):
 *     System.out.println(tag.tidy());
 *   } else {
 *     // Segment is a text segment.
 *     // Process the text segment (just output its text in this example):
 *     String text=CharacterReference.decodeCollapseWhiteSpace(nodeSegment);
 *     System.out.println(text);
 *   }
 * }</pre>
 *  </dd>
 * </dl>
 * @return an iterator over every tag and text segment contained within this segment.
 */
public Iterator<Segment> getNodeIterator() {
	final Iterator<Segment> nodeIterator=new NodeIterator(this);
	return nodeIterator;
}
/**
 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is a convenience method equivalent to calling {@link #getAllTags(TagType)} with a <code>null</code> argument.
 * <p>
 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
 * if this method is to be used on a large proportion of the source.
 * It is called automatically if this method is called on the {@link Source} object itself.
 * <p>
 * See the {@link Tag} class documentation for more details about the behaviour of this method.
 *
 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List<Tag> getAllTags() {
	final TagType anyTagType=null; // a null tag type places no restriction on the type of tag
	return getAllTags(anyTagType);
}
/**
 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * The search starts at the {@linkplain #getBegin() begin} position of this segment and collects tags one after
 * another until no further enclosed tag is found.
 * <p>
 * See the {@link Tag} class documentation for more details about the behaviour of this method.
 * <p>
 * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
 *
 * @param tagType  the {@linkplain TagType type} of tags to get.
 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * @see #getAllStartTags(StartTagType)
 */
public List<Tag> getAllTags(final TagType tagType) {
	// checkEnclosure is defined elsewhere in this class; a null result ends the search
	final Tag firstTag=checkEnclosure(Tag.getNextTag(source,begin,tagType));
	if (firstTag==null) return Collections.emptyList();
	final ArrayList<Tag> tags=new ArrayList<Tag>();
	for (Tag current=firstTag; current!=null; current=checkEnclosure(current.getNextTag(tagType))) {
		tags.add(current);
	}
	return tags;
}
/**
 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This method delegates to {@link #getAllStartTags(StartTagType)} with a <code>null</code> start tag type.
 * <p>
 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
 * if this method is to be used on a large proportion of the source.
 * It is called automatically if this method is called on the {@link Source} object itself.
 * <p>
 * See the {@link Tag} class documentation for more details about the behaviour of this method.
 *
 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List<StartTag> getAllStartTags() {
	// the typed null selects the StartTagType overload rather than any other single-argument overload
	final StartTagType anyStartTagType=null;
	return getAllStartTags(anyStartTagType);
}
/**
 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * See the {@link Tag} class documentation for more details about the behaviour of this method.
 * <p>
 * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
 * In that case every tag returned by {@link #getAllTags()} that is a {@link StartTag} is included, regardless of its type.
 *
 * @param startTagType  the {@linkplain StartTagType type} of tags to get, or <code>null</code> to get start tags of every type.
 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List<StartTag> getAllStartTags(final StartTagType startTagType) {
	if (startTagType==null) {
		// The null case must be resolved here: getAllStartTags() delegates to this method with a
		// null type, so delegating back to it would recurse until the stack overflows.
		final List<StartTag> startTags=new ArrayList<StartTag>();
		for (final Tag tag : getAllTags()) {
			if (tag instanceof StartTag) startTags.add((StartTag)tag);
		}
		if (startTags.isEmpty()) return Collections.emptyList();
		return startTags;
	}
	// checkEnclosure is defined elsewhere in this class; a null result ends the search
	StartTag startTag=(StartTag)checkEnclosure(Tag.getNextTag(source,begin,startTagType));
	if (startTag==null) return Collections.emptyList();
	final List<StartTag> list=new ArrayList<StartTag>();
	do {
		list.add(startTag);
		startTag=(StartTag)checkEnclosure(startTag.getNextTag(startTagType));
	} while (startTag!=null);
	return list;
}
/**
* Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
* <p>
* See the {@link Tag} class documentation for more details about the behaviour of this method.
* <p>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -