⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 renderer.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;

/**
 * Performs a simple rendering of HTML markup into text.
 * <p>
 * This provides a human readable version of the segment content that is modelled on the way
 * <a target="_blank" href="http://www.mozilla.com/thunderbird/">Mozilla Thunderbird</a> and other email clients provide an automatic conversion of
 * HTML content to text in their <a target="_blank" href="http://tools.ietf.org/html/rfc2046#section-5.1.4">alternative MIME encoding</a> of emails.
 * <p>
 * The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in
 * <a target="_blank" href="http://tools.ietf.org/html/rfc3676">RFC3676</a>.
 * <p>
 * Many properties are available to customise the output, the most significant of which is possibly {@link #setMaxLineLength(int) MaxLineLength}.
 * See the individual property descriptions for details.
 * <p>
 * Use one of the following methods to obtain the output:
 * <ul>
 *  <li>{@link #writeTo(Writer)}</li>
 *  <li>{@link #appendTo(Appendable)}</li>
 *  <li>{@link #toString()}</li>
 *  <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
 * </ul>
 * <p>
 * The rendering of some constructs, especially tables, is very rudimentary.
 * No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output.
 * <p>
 * Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
 * <p>
 * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
 * <p>
 * To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead.
 */
public class Renderer implements CharStreamSource {
	// The segment whose HTML content is rendered; assigned once in the constructor.
	private final Segment rootSegment;
	// Column at which lines are wrapped (see setMaxLineLength); defaults to 76.
	private int maxLineLength=76;
	// Newline string used in the output; defaults to CR+LF. A null value is resolved lazily by getNewLine().
	private String newLine="\r\n";
	// Whether hyperlink URLs are included in the output; on by default.
	private boolean includeHyperlinkURLs=true;
	// Whether decoration characters are placed around some font style and phrase elements; off by default.
	private boolean decorateFontStyles=false;
	// Initialised from the static Config.ConvertNonBreakingSpaces value when this instance is created.
	private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
	// NOTE(review): the four layout settings below are presumably exposed through matching get*/set*
	// accessors that are not visible in this part of the file; appendTo passes the getter results
	// to the Processor. Confirm their exact meaning against those accessors' documentation.
	private int blockIndentSize=4;
	private int listIndentSize=6;
	private char[] listBullets=new char[] {'*','o','+','#'};
	private String tableCellSeparator=" \t";

	/**
	 * Constructs a new <code>Renderer</code> based on the specified {@link Segment}.
	 * @param segment  the segment containing the HTML to be rendered.
	 * @see Segment#getRenderer()
	 */
	public Renderer(final Segment segment) {
		// Remember the segment to render; all other settings keep their field defaults.
		this.rootSegment = segment;
	}

	// Documentation inherited from CharStreamSource
	public void writeTo(final Writer writer) throws IOException {
		// Append the rendered output to the writer, then flush it.
		this.appendTo(writer);
		writer.flush();
	}

	// Documentation inherited from CharStreamSource
	public void appendTo(final Appendable appendable) throws IOException {
		// Snapshot the current settings (read through the getters, in declaration order)
		// into a fresh Processor, which then produces the output.
		final Processor processor=new Processor(
			this,
			rootSegment,
			getMaxLineLength(),
			getNewLine(),
			getIncludeHyperlinkURLs(),
			getDecorateFontStyles(),
			getConvertNonBreakingSpaces(),
			getBlockIndentSize(),
			getListIndentSize(),
			getListBullets(),
			getTableCellSeparator()
		);
		processor.appendTo(appendable);
	}
	
	// Documentation inherited from CharStreamSource
	public long getEstimatedMaximumOutputLength() {
		// The length of the source segment is used as the estimate.
		return this.rootSegment.length();
	}

	// Documentation inherited from CharStreamSource
	@Override
	public String toString() {
		// Delegates to the shared CharStreamSource helper, passing this renderer as the source.
		// @Override makes the compiler verify that this really replaces Object.toString().
		return CharStreamSourceUtil.toString(this);
	}

	/**
	 * Sets the column at which lines are to be wrapped.
	 * <p>
	 * Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary.
	 * <p>
	 * A line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length.
	 * In this case the line is wrapped immediately after the end of the word.
	 * <p>
	 * The default value is <code>76</code>, which reflects the maximum line length for sending
	 * email data specified in <a target="_blank" href="http://rfc.net/rfc2049.html#s3.">RFC2049 section 3.5</a>.
	 * 
	 * @param maxLineLength  the column at which lines are to be wrapped.
	 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement. 
	 * @see #getMaxLineLength()
	 */
	public Renderer setMaxLineLength(final int maxLineLength) {
		// Store the new wrap column, then return this instance so setter calls can be chained.
		this.maxLineLength = maxLineLength;
		return this;
	}

	/**
	 * Returns the column at which lines are to be wrapped.
	 * <p>
	 * See the {@link #setMaxLineLength(int)} method for a full description of this property.
	 *
	 * @return the column at which lines are to be wrapped.
	 */	
	public int getMaxLineLength() {
		// Plain accessor for the configured wrap column.
		return this.maxLineLength;
	}

	/**
	 * Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
	 * <p>
	 * The default value is <code>"\r\n"</code> <span title="carriage return + line feed">(CR+LF)</span> regardless of the platform on which the library is running.
	 * This is so that the default configuration produces valid 
	 * <a target="_blank" href="http://tools.ietf.org/html/rfc1521#section-7.1.2">MIME text/plain</a> output, which mandates the use of CR+LF for line breaks.
	 * <p>
	 * Specifying a <code>null</code> argument causes the output to use the same new line string as is used in the source document, which is
	 * determined via the {@link Source#getNewLine()} method.
	 * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
	 * or using the value from the static {@link Config#NewLine} property.
	 * 
	 * @param newLine  the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
	 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement. 
	 * @see #getNewLine()
	 */
	public Renderer setNewLine(final String newLine) {
		// A null argument is stored as-is; getNewLine() resolves it from the source document on demand.
		this.newLine = newLine;
		return this;
	}

	/**
	 * Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
	 * <p>
	 * See the {@link #setNewLine(String)} method for a full description of this property.
	 *
	 * @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
	 */
	public String getNewLine() {
		// An explicitly configured newline string is returned directly.
		if (newLine!=null) return newLine;
		// Otherwise take the source document's best-guess newline and remember it for later calls.
		newLine=rootSegment.source.getBestGuessNewLine();
		return newLine;
	}

	/**
	 * Sets whether hyperlink URLs are included in the output.
	 * <p>
	 * The default value is <code>true</code>.
	 * <p>
	 * When this property is <code>true</code>, the URL of each hyperlink is included in the output as determined by the implementation of the
	 * {@link #renderHyperlinkURL(StartTag)} method.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   <p>
	 *   Assuming the default implementation of {@link #renderHyperlinkURL(StartTag)}, when this property is <code>true</code>, the following HTML:
	 *   <blockquote class="code">
	 *    <code>&lt;a href="http://jericho.htmlparser.net/"&gt;Jericho HTML Parser&lt;/a&gt;</code>
	 *   </blockquote>
	 *   produces the following output:
	 *   <blockquote class="code">
	 *    <code>Jericho HTML Parser &lt;http://jericho.htmlparser.net/&gt;</code>
	 *   </blockquote>
	 *  </dd>
	 * </dl>
	 *
	 * @param includeHyperlinkURLs  specifies whether hyperlink URLs are included in the output.
	 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement. 
	 * @see #getIncludeHyperlinkURLs()
	 */
	public Renderer setIncludeHyperlinkURLs(final boolean includeHyperlinkURLs) {
		// Record the flag, then return this instance so setter calls can be chained.
		this.includeHyperlinkURLs = includeHyperlinkURLs;
		return this;
	}

	/**
	 * Indicates whether hyperlink URLs are included in the output.
	 * <p>
	 * See the {@link #setIncludeHyperlinkURLs(boolean)} method for a full description of this property.
	 *
	 * @return <code>true</code> if hyperlink URLs are included in the output, otherwise <code>false</code>.
	 */
	public boolean getIncludeHyperlinkURLs() {
		// Plain accessor for the hyperlink URL inclusion flag.
		return this.includeHyperlinkURLs;
	}

	/**
	 * Renders the hyperlink URL from the specified {@link StartTag}.
	 * <p>
	 * A return value of <code>null</code> indicates that the hyperlink URL should not be rendered at all.
	 * <p>
	 * The default implementation of this method returns <code>null</code> if the <code>href</code> attribute of the specified start tag
	 * is '<code>#</code>', starts with "<code>javascript:</code>", or is missing.
	 * In all other cases it returns the value of the <code>href</code> attribute enclosed in angle brackets.
	 * <p>
	 * See the documentation of the {@link #setIncludeHyperlinkURLs(boolean)} method for an example of how a hyperlink is rendered by the default implementation.
	 * <p>
	 * This method can be overridden in a subclass to customise the rendering of hyperlink URLs.
	 * <p>
	 * Rendering of hyperlink URLs can be disabled completely without overriding this method by setting the
	 * {@link #setIncludeHyperlinkURLs(boolean) IncludeHyperlinkURLs} property to <code>false</code>.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   To render hyperlink URLs without the enclosing angle brackets:<br /><br />
	 *   <code>
	 *    Renderer renderer=new Renderer(segment) {<br />
	 *    &nbsp; &nbsp; public String renderHyperlinkURL(StartTag startTag) {<br />
	 *    &nbsp; &nbsp; &nbsp; &nbsp; String href=startTag.getAttributeValue("href");<br />
	 *    &nbsp; &nbsp; &nbsp; &nbsp; if (href==null || href.equals("#") || href.startsWith("javascript:")) return null;<br />
	 *    &nbsp; &nbsp; &nbsp; &nbsp; return href;<br />
	 *    &nbsp; &nbsp; }<br />
	 *    };<br />
	 *    String renderedSegment=renderer.toString();
	 *   </code>
	 *  </dd>
	 * </dl>
	 * @param startTag  the start tag of the hyperlink element, must not be <code>null</code>.
	 * @return The rendered hyperlink URL from the specified {@link StartTag}, or <code>null</code> if the hyperlink URL should not be rendered.
	 */
	public String renderHyperlinkURL(final StartTag startTag) {
		final String href=startTag.getAttributeValue("href");
		// A missing href or a bare "#" placeholder has no URL to render.
		if (href==null || href.equals("#")) return null;
		// URI schemes are case-insensitive, so "JavaScript:" or "JAVASCRIPT:" links must be
		// suppressed exactly like "javascript:" ones; the scheme prefix is compared ignoring case.
		final String scriptScheme="javascript:";
		if (href.regionMatches(true,0,scriptScheme,0,scriptScheme.length())) return null;
		// Everything else is rendered as the raw href value enclosed in angle brackets.
		return '<'+href+'>';
	}

	/**
	 * Sets whether decoration characters are to be included around the content of some
	 * <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
	 * <p>
	 * The default value is <code>false</code>.
	 * <p>
	 * Below is a table summarising the decorated elements.
	 * <p>
	 * <style type="text/css">
	 *  table#FontStyleElementSummary td, table#FontStyleElementSummary th {text-align: center; padding-bottom: 2px}
	 * </style>
	 * <table id="FontStyleElementSummary" class="bordered" cellspacing="0">
	 *  <tr><th title="HTML elements decorated">Elements</th><th title="The character placed around the element content">Character</th><th>Example Output</th></tr>
	 *  <tr><td>{@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG}</td><td><code>*</code></td><td><code>*bold text*</code></td></tr>
	 *  <tr><td>{@link HTMLElementName#I I} and {@link HTMLElementName#EM EM}</td><td><code>/</code></td><td><code>/italic text/</code></td></tr>
	 *  <tr><td>{@link HTMLElementName#U U}</td><td><code>_</code></td><td><code>_underlined text_</code></td></tr>
	 *  <tr><td>{@link HTMLElementName#CODE CODE}</td><td><code>|</code></td><td><code>|code|</code></td></tr>
	 * </table>
	 *
	 * @param decorateFontStyles  specifies whether decoration characters are to be included around the content of some font style elements.
	 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement. 
	 * @see #getDecorateFontStyles()
	 */
	public Renderer setDecorateFontStyles(final boolean decorateFontStyles) {
		// Record the flag, then return this instance so setter calls can be chained.
		this.decorateFontStyles = decorateFontStyles;
		return this;
	}

	/**
	 * Indicates whether decoration characters are to be included around the content of some
	 * <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
	 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
	 * <p>
	 * See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property.
	 *
	 * @return <code>true</code> if decoration characters are to be included around the content of some font style elements, otherwise <code>false</code>.
	 */
	public boolean getDecorateFontStyles() {
		// Plain accessor for the font style decoration flag.
		return this.decorateFontStyles;
	}

	/**
	 * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
	 * <p>
	 * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>Renderer</code> is instantiated.
	 *
	 * @param convertNonBreakingSpaces  specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -