📄 textextractor.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a>,
	 * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each attribute.
	 * <p>
	 * * The value of a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a> attribute is only included if a 
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-name-META">name</a> attribute is also present in the specified start tag,
	 * as the content attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the name attribute is used as opposed to an
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-http-equiv">http-equiv</a> attribute.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   To include only the value of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a> and
	 *   <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a> attributes:<br /><br />
	 *   <code>
	 *    final Set includeAttributeNames=new HashSet(Arrays.asList(new String[] {"title","alt"}));<br />
	 *    TextExtractor textExtractor=new TextExtractor(segment) {<br />
	 *    &nbsp; &nbsp; public boolean includeAttribute(StartTag startTag, Attribute attribute) {<br />
	 *    &nbsp; &nbsp; &nbsp; &nbsp; return includeAttributeNames.contains(attribute.getKey());<br />
	 *    &nbsp; &nbsp; }<br />
	 *    };<br />
	 *    textExtractor.setIncludeAttributes(true);<br />
	 *    String extractedText=textExtractor.toString();
	 *   </code>
	 *  </dd>
	 * </dl>
	 * @param startTag  the start tag containing the attribute being checked.
	 * @param attribute  the attribute to check for inclusion.
	 * @return <code>true</code> if the value of the specified attribute should be included in the output, otherwise <code>false</code>.
	 */
	public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
		// Find the checker stored under this attribute's key; an attribute with no checker is never included.
		final AttributeIncludeChecker checker=map.get(attribute.getKey());
		return checker!=null && checker.includeAttribute(startTag,attribute);
	}

	/**
	 * Sets whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
	 * <p>
	 * The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
	 *
	 * @param excludeNonHTMLElements  specifies whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
	 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
	 * @see #getExcludeNonHTMLElements()
	 */
	public TextExtractor setExcludeNonHTMLElements(boolean excludeNonHTMLElements) {
		this.excludeNonHTMLElements=excludeNonHTMLElements;
		return this;
	}
	
	/**
	 * Indicates whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
	 * <p>
	 * See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
	 *
	 * @return <code>true</code> if the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output, otherwise <code>false</code>.
	 * @see #setExcludeNonHTMLElements(boolean)
	 */
	public boolean getExcludeNonHTMLElements() {
		return excludeNonHTMLElements;
	}

	/**
	 * Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
	 * <p>
	 * During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
	 * {@linkplain StartTag#getElement() associated element} should be excluded from the output.
	 * <p>
	 * The default implementation of this method is to always return <code>false</code>, so that every element is included,
	 * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
	 * <p>
	 * All elements nested inside an excluded element are also implicitly excluded, as are all
	 * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
	 * Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:<br /><br />
	 *   <code>
	 *    TextExtractor textExtractor=new TextExtractor(segment) {<br />
	 *    &nbsp; &nbsp; public boolean excludeElement(StartTag startTag) {<br />
	 *    &nbsp; &nbsp; &nbsp; &nbsp; return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));<br />
	 *    &nbsp; &nbsp; }<br />
	 *    };<br />
	 *    String extractedText=textExtractor.toString();
	 *   </code>
	 *  </dd>
	 * </dl>
	 * @param startTag  the start tag of the element to check for exclusion.
	 * @return <code>true</code> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
	 */
	public boolean excludeElement(final StartTag startTag) {
		return false;
	}

	private static interface AttributeIncludeChecker {
		boolean includeAttribute(final StartTag startTag, final Attribute attribute);
	}

	/**
	 * Checker that includes the attribute value unconditionally.
	 * Declared <code>final</code> so that this shared instance can never be reassigned.
	 */
	private static final AttributeIncludeChecker ALWAYS_INCLUDE=new AttributeIncludeChecker() {
		public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
			return true;
		}
	};

	/**
	 * Checker that includes the attribute value only if the same start tag also has a <code>name</code> attribute.
	 * Declared <code>final</code> so that this shared instance can never be reassigned.
	 */
	private static final AttributeIncludeChecker INCLUDE_IF_NAME_ATTRIBUTE_PRESENT=new AttributeIncludeChecker() {
		public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
			// Only the presence of a "name" attribute on the start tag matters; the checked attribute itself is not inspected.
			return startTag.getAttributes().get("name")!=null;
		}
	};

	/**
	 * Builds the default table that maps an attribute name to the checker deciding whether its value is included.
	 *
	 * @return a new map keyed by attribute name.
	 */
	private static Map<String,AttributeIncludeChecker> initDefaultAttributeIncludeCheckerMap() {
		// NOTE(review): the entries below capture the current values of the checker constants. If the static field
		// initialised from this method is declared textually before those constants, they would still be null at
		// this point - confirm the declaration order in the enclosing class.
		final Map<String,AttributeIncludeChecker> checkersByName=new HashMap<String,AttributeIncludeChecker>();
		// title attribute
		checkersByName.put("title",ALWAYS_INCLUDE);
		// alt attribute (APPLET, AREA, IMG and INPUT elements)
		checkersByName.put("alt",ALWAYS_INCLUDE);
		// label attribute (OPTION and OPTGROUP elements)
		checkersByName.put("label",ALWAYS_INCLUDE);
		// summary attribute (TABLE element)
		checkersByName.put("summary",ALWAYS_INCLUDE);
		// content attribute (META element), only when a name attribute is also present
		checkersByName.put("content",INCLUDE_IF_NAME_ATTRIBUTE_PRESENT);
		// href attribute (A, AREA and LINK elements)
		checkersByName.put("href",ALWAYS_INCLUDE);
		// The prompt attribute of the ISINDEX element is left out as the element is deprecated and very rarely used.
		return checkersByName;
	}

	/**
	 * This class does the actual work, but is first passed final copies of all the parameters for efficiency.
	 * Note at present this is not implemented in a memory-efficient manner.
	 * Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (coming in release 3.0),
	 * the main algorithm will be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
	 */
	private final class Processor {
		private final Segment segment;
		private final Source source;
		private final boolean convertNonBreakingSpaces;
		private final boolean includeAttributes;
		private final boolean excludeNonHTMLElements;

		/**
		 * Captures the segment to process together with the extraction settings to apply.
		 *
		 * @param segment  the segment from which text is extracted.
		 * @param convertNonBreakingSpaces  passed through to the final decode/collapse step.
		 * @param includeAttributes  whether attribute values accepted by <code>includeAttribute</code> are added to the output.
		 * @param excludeNonHTMLElements  whether elements whose name is not a known HTML element name are skipped entirely.
		 */
		public Processor(final Segment segment, final boolean convertNonBreakingSpaces, final boolean includeAttributes, final boolean excludeNonHTMLElements) {
			this.segment=segment;
			source=segment.source;
			this.convertNonBreakingSpaces=convertNonBreakingSpaces;
			this.includeAttributes=includeAttributes;
			this.excludeNonHTMLElements=excludeNonHTMLElements;
		}

		/**
		 * Walks every node of the segment, collects the text that is not excluded, and returns it after
		 * character references have been decoded and white space collapsed.
		 *
		 * @return the extracted text.
		 */
		public String toString() {
			final StringBuilder sb=new StringBuilder(segment.length());
			for (NodeIterator nodeIterator=new NodeIterator(segment); nodeIterator.hasNext();) {
				// Named "node" so that it does not shadow the "segment" field used to create the iterator.
				final Segment node=nodeIterator.next();
				if (node instanceof Tag) {
					final Tag tag=(Tag)node;
					if (tag.getTagType().isServerTag()) {
						// elementContainsMarkup should be made into a TagType property one day.
						// for the time being assume all server element content is code, although this is not true for some Mason elements.
						final boolean elementContainsMarkup=false;
						if (!elementContainsMarkup) {
							final Element element=tag.getElement();
							if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
						}
						continue;
					}
					if (tag.getTagType()==StartTagType.NORMAL) {
						final StartTag startTag=(StartTag)tag;
						// SCRIPT and STYLE are tested first, so excludeElement is never consulted for them.
						if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || excludeElement(startTag) || (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) {
							// Jump past the whole element so that nothing nested inside it is visited.
							nodeIterator.skipToPos(startTag.getElement().getEnd());
							continue;
						}
						if (includeAttributes) {
							for (Attribute attribute : startTag.getAttributes()) {
								if (!includeAttribute(startTag,attribute)) continue;
								// Appending a null reference to a StringBuilder writes the literal text "null".
								// NOTE(review): getValueSegment() is presumed to return null for an attribute written
								// without a value (e.g. <img alt>) - confirm against the Attribute API.
								final Segment valueSegment=attribute.getValueSegment();
								if (valueSegment!=null) sb.append(' ').append(valueSegment).append(' ');
							}
						}
					}
					// Treat both start and end tags not belonging to inline-level elements as whitespace:
					if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
				} else {
					// Anything that is not a tag is copied to the buffer unchanged; decoding happens once at the end.
					sb.append(node);
				}
			}
			return CharacterReference.decodeCollapseWhiteSpace(sb,convertNonBreakingSpaces);
		}
	}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -