📄 segment.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;

/**
 * Represents a segment of a {@link Source} document.
 * <p>
 * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
 * <p>
 * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
 */
public class Segment implements Comparable<Segment>, CharSequence {
	// Character position in the source document at which this segment begins (inclusive).
	final int begin;
	// Character position in the source document immediately after the end of this segment (exclusive).
	final int end;
	// The Source document containing this segment.
	// It is null only in the dummy flag instances built by the no-argument constructor.
	final Source source;
	
	// Fixed set of white space characters; the last entry, '\u200B', is the Unicode zero width space.
	private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method

	/**
	 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
	 * @param source  the {@link Source} document, must not be <code>null</code>.
	 * @param begin  the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
	 * @param end  the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
	 */
	public Segment(final Source source, final int begin, final int end) {
		// Reject every negative position (not just the single value -1) as well as an inverted span,
		// and say which values were wrong so the failure can be diagnosed from the exception alone.
		if (begin<0 || end<0 || begin>end) {
			throw new IllegalArgumentException("invalid segment span: begin="+begin+", end="+end);
		}
		if (source==null) throw new IllegalArgumentException("source argument must not be null");
		// All arguments are valid; only now initialise the fields.
		this.begin=begin;
		this.end=end;
		this.source=source;
	}

	// Only called from Source constructor
	Segment(final int length) {
		// The instance being constructed is itself the source document, so it acts as its own source
		// and spans positions 0 (inclusive) to length (exclusive).
		this.source=(Source)this;
		this.begin=0;
		this.end=length;
	}

	// Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
	Segment() {
		// Dummy instance: an empty span at position 0 with no source document attached.
		this.source=null;
		this.begin=0;
		this.end=0;
	}

	/**
	 * Returns the {@link Source} document containing this segment.
	 * @return the {@link Source} document containing this segment.
	 */
	public final Source getSource() {
		// Plain accessor for the document reference fixed at construction time.
		return this.source;
	}

	/**
	 * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
	 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
	 */
	public final int getBegin() {
		// Plain accessor for the inclusive start position.
		return this.begin;
	}

	/**
	 * Returns the character position in the {@link Source} document immediately after the end of this segment.
	 * <p>
	 * The character at the position specified by this property is <b>not</b> included in the segment.
	 *
	 * @return the character position in the {@link Source} document immediately after the end of this segment.
	 */
	public final int getEnd() {
		// Plain accessor for the exclusive end position.
		return this.end;
	}

	/**
	 * Compares the specified object with this <code>Segment</code> for equality.
	 * <p>
	 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
	 * and both segments have the same {@link Source}, and the same begin and end positions.
	 * @param object  the object to be compared for equality with this <code>Segment</code>.
	 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
	 */
	@Override
	public final boolean equals(final Object object) {
		if (object==this) return true;
		// instanceof evaluates to false for null, so this one test also covers a null argument.
		if (!(object instanceof Segment)) return false;
		final Segment other=(Segment)object;
		// Positions must match and both segments must refer to the very same Source instance.
		if (other.begin!=begin) return false;
		if (other.end!=end) return false;
		return other.source==source;
	}

	/**
	 * Returns a hash code value for the segment.
	 * <p>
	 * The current implementation returns the sum of the begin and end positions, although this is not
	 * guaranteed in future versions.
	 *
	 * @return a hash code value for the segment.
	 */
	@Override
	public int hashCode() {
		// Sum of the two positions; segments that are equal share both values and therefore the hash.
		final int positionSum=begin+end;
		return positionSum;
	}

	/**
	 * Returns the length of the segment.
	 * This is defined as the number of characters between the begin and end positions.
	 * @return the length of the segment.
	 */
	public final int length() {
		// end is exclusive, so the difference is the character count of the span.
		final int charCount=end-begin;
		return charCount;
	}

	/**
	 * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
	 * <p>
	 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
	 * <p>
	 * Note that a segment encloses itself.
	 *
	 * @param segment  the segment to be tested for being enclosed by this segment.
	 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
	 */
	public final boolean encloses(final Segment segment) {
		// The other segment must not start before this one...
		if (segment.begin<begin) return false;
		// ...and must not finish after it. Equal bounds count as enclosed.
		return segment.end<=end;
	}

	/**
	 * Indicates whether this segment encloses the specified character position in the source document.
	 * <p>
	 * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
	 *
	 * @param pos  the position in the {@link Source} document.
	 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
	 */
	public final boolean encloses(final int pos) {
		// begin is inclusive, end is exclusive.
		if (pos<begin) return false;
		return pos<end;
	}

	/**
	 * Returns the source text of this segment as a <code>String</code>.
	 * <p>
	 * The returned <code>String</code> is newly created with every call to this method, unless this
	 * segment is itself an instance of {@link Source}.
	 *
	 * @return the source text of this segment as a <code>String</code>.
	 */
	@Override
	public String toString() {
		// Extract the span [begin,end) from the text held by the source document.
		final String text=source.string.substring(begin,end).toString();
		return text;
	}

	/**
	 * Performs a simple rendering of the HTML markup in this segment into text.
	 * <p>
	 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
	 * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
	 * 
	 * @return an instance of {@link Renderer} based on this segment.
	 * @see #getTextExtractor()
	 */
	public Renderer getRenderer() {
		// A new Renderer bound to this segment is created on every call.
		final Renderer renderer=new Renderer(this);
		return renderer;
	}

	/**
	 * Extracts the textual content from the HTML markup of this segment.
	 * <p>
	 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
	 * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
	 *
	 * @return an instance of {@link TextExtractor} based on this segment.
	 * @see #getRenderer()
	 */
	public TextExtractor getTextExtractor() {
		// A new TextExtractor bound to this segment is created on every call.
		final TextExtractor extractor=new TextExtractor(this);
		return extractor;
	}

	/**
	 * Returns an iterator over every tag and text segment contained within this segment.
	 * <p>
	 * Every tag found in the {@link #getAllTags()} list is included in this iterator, including all {@linkplain TagType#isServerTag() server tags}.
	 * <p>
	 * Segments of the document between the tags are also included, resulting in a sequential walk-through of every "node" in this segment, where a node is either
	 * a tag or a segment of text.
	 * The {@linkplain #getEnd() end} position of each segment should correspond with the {@linkplain #getBegin() begin} position of the subsequent segment,
	 * unless any of the tags are enclosed by other tags, which is common when {@linkplain TagType#isServerTag() server tags} are present.
	 * <p>
	 * The {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method can be used to retrieve the text from each text segment.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   <p>
	 *   The following code demonstrates the typical usage of this method.
	 *   <p>
	 * <pre>
	 * for (Iterator&lt;Segment&gt; nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) {
	 *   Segment nodeSegment=nodeIterator.next();
	 *   if (nodeSegment instanceof Tag) {
	 *     Tag tag=(Tag)nodeSegment;
	 *     if (tag.getTagType().isServerTag()) continue; // ignore server tags
	 *     // Process the tag (just output it in this example):
	 *     System.out.println(tag.tidy());
	 *   } else {
	 *     // Segment is a text segment.
	 *     // Process the text segment (just output its text in this example):
	 *     String text=CharacterReference.decodeCollapseWhiteSpace(nodeSegment);
	 *     System.out.println(text);
	 *   }
	 * }</pre>
	 *  </dd>
	 * </dl>
	 * @return an iterator over every tag and text segment contained within this segment.
	 */
	public Iterator<Segment> getNodeIterator() {
		// A new NodeIterator over this segment is created on every call.
		final Iterator<Segment> nodes=new NodeIterator(this);
		return nodes;
	}

	/**
	 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
	 * if this method is to be used on a large proportion of the source.
	 * It is called automatically if this method is called on the {@link Source} object itself.
	 * <p>
	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<Tag> getAllTags() {
		// A null tag type means "no type restriction" in the typed overload.
		final TagType anyType=null;
		return getAllTags(anyType);
	}

	/**
	 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 * <p>
	 * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
	 *
	 * @param tagType  the {@linkplain TagType type} of tags to get.
	 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getAllStartTags(StartTagType)
	 */
	public List<Tag> getAllTags(final TagType tagType) {
		// Look for the first candidate tag at or after the start of this segment.
		// checkEnclosure is defined elsewhere in this class; a null result means there is nothing (more) to collect.
		final Tag first=checkEnclosure(Tag.getNextTag(source,begin,tagType));
		if (first==null) return Collections.emptyList();
		final ArrayList<Tag> tags=new ArrayList<Tag>();
		// Step from tag to tag until the enclosure check yields null.
		for (Tag current=first; current!=null; current=checkEnclosure(current.getNextTag(tagType))) {
			tags.add(current);
		}
		return tags;
	}

	/**
	 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
	 * if this method is to be used on a large proportion of the source.
	 * It is called automatically if this method is called on the {@link Source} object itself.
	 * <p>
	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<StartTag> getAllStartTags() {
		// Do the work here instead of calling getAllStartTags(StartTagType) with a null argument:
		// that overload hands a null type straight back to this method, so delegating to it
		// would recurse until the stack overflows.
		ArrayList<StartTag> startTags=null;
		// Walk every tag enclosed by this segment and keep only the start tags, in document order.
		for (final Tag tag : getAllTags()) {
			if (!(tag instanceof StartTag)) continue;
			if (startTags==null) startTags=new ArrayList<StartTag>(); // allocate only once a start tag is found
			startTags.add((StartTag)tag);
		}
		// Same convention as the other tag search methods: a shared empty list when nothing matches.
		if (startTags==null) return Collections.emptyList();
		return startTags;
	}

	/**
	 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 * <p>
	 * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
	 *
	 * @param startTagType  the {@linkplain StartTagType type} of tags to get.
	 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<StartTag> getAllStartTags(final StartTagType startTagType) {
		// No type restriction requested: defer to the no-argument variant.
		if (startTagType==null) return getAllStartTags();
		// Look for the first candidate of the requested type at or after the start of this segment.
		// checkEnclosure is defined elsewhere in this class; a null result means there is nothing (more) to collect.
		final StartTag first=(StartTag)checkEnclosure(Tag.getNextTag(source,begin,startTagType));
		if (first==null) return Collections.emptyList();
		final ArrayList<StartTag> startTags=new ArrayList<StartTag>();
		// Step from start tag to start tag until the enclosure check yields null.
		for (StartTag current=first; current!=null; current=(StartTag)checkEnclosure(current.getNextTag(startTagType))) {
			startTags.add(current);
		}
		return startTags;
	}

	/**
	 * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 * <p>
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -