// File: Renderer.java
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
/**
* Performs a simple rendering of HTML markup into text.
* <p>
* This provides a human readable version of the segment content that is modelled on the way
* <a target="_blank" href="http://www.mozilla.com/thunderbird/">Mozilla Thunderbird</a> and other email clients provide an automatic conversion of
* HTML content to text in their <a target="_blank" href="http://tools.ietf.org/html/rfc2046#section-5.1.4">alternative MIME encoding</a> of emails.
* <p>
* The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in
* <a target="_blank" href="http://tools.ietf.org/html/rfc3676">RFC3676</a>.
* <p>
* Many properties are available to customise the output, possibly the most significant of which being {@link #setMaxLineLength(int) MaxLineLength}.
* See the individual property descriptions for details.
* <p>
* Use one of the following methods to obtain the output:
* <ul>
* <li>{@link #writeTo(Writer)}</li>
* <li>{@link #appendTo(Appendable)}</li>
* <li>{@link #toString()}</li>
* <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
* </ul>
* <p>
* The rendering of some constructs, especially tables, is very rudimentary.
* No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output.
* <p>
* Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
* <p>
* Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
* <p>
* To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead.
*/
public class Renderer implements CharStreamSource {
// The segment whose HTML content is rendered; assigned once in the constructor.
private final Segment rootSegment;
// Column at which output lines are wrapped (76 by default).
private int maxLineLength = 76;
// Newline string used in the output (CR+LF by default); a null value is resolved lazily by getNewLine().
private String newLine = "\r\n";
// Whether the URL of each hyperlink is rendered after the link text.
private boolean includeHyperlinkURLs = true;
// Whether decoration characters are placed around font style and phrase elements.
private boolean decorateFontStyles = false;
// Takes the value of the static Config.ConvertNonBreakingSpaces property at instantiation time.
private boolean convertNonBreakingSpaces = Config.ConvertNonBreakingSpaces;
// NOTE(review): the accessors and exact semantics of the four properties below are not visible
// in this part of the file; the names suggest indent widths, bullet characters per list depth,
// and the separator emitted between table cells -- confirm against the rest of the class.
private int blockIndentSize = 4;
private int listIndentSize = 6;
private char[] listBullets = {'*', 'o', '+', '#'};
private String tableCellSeparator = " \t";
/**
* Constructs a new <code>Renderer</code> based on the specified {@link Segment}.
* @param segment the segment containing the HTML to be rendered.
* @see Segment#getRenderer()
*/
public Renderer(final Segment segment) {
	// Only the reference is stored; all other properties keep their field defaults.
	this.rootSegment = segment;
}
// Documentation inherited from CharStreamSource
public void writeTo(final Writer writer) throws IOException {
	// A Writer is an Appendable, so the rendering itself is delegated to appendTo.
	appendTo(writer);

	// Flush after rendering; the writer is not closed here.
	writer.flush();
}
// Documentation inherited from CharStreamSource
public void appendTo(final Appendable appendable) throws IOException {
	// Every call builds a new Processor from the current property values.
	// The values are read through the public getters (in the order listed) rather than
	// directly from the fields, so overridden getters are honoured.
	final Processor processor = new Processor(
		this,
		rootSegment,
		getMaxLineLength(),
		getNewLine(),
		getIncludeHyperlinkURLs(),
		getDecorateFontStyles(),
		getConvertNonBreakingSpaces(),
		getBlockIndentSize(),
		getListIndentSize(),
		getListBullets(),
		getTableCellSeparator()
	);
	// The Processor is handed the destination and does the actual work.
	processor.appendTo(appendable);
}
// Documentation inherited from CharStreamSource
public long getEstimatedMaximumOutputLength() {
	// The length of the root segment is used as the estimate.
	final long estimate = rootSegment.length();
	return estimate;
}
// Documentation inherited from CharStreamSource
public String toString() {
	// Delegates to the shared CharStreamSource helper, passing this renderer as the source.
	final String rendered = CharStreamSourceUtil.toString(this);
	return rendered;
}
/**
* Sets the column at which lines are to be wrapped.
* <p>
* Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary.
* <p>
* A line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length.
* In this case the line is wrapped immediately after the end of the word.
* <p>
* The default value is <code>76</code>, which reflects the maximum line length for sending
* email data specified in <a target="_blank" href="http://rfc.net/rfc2049.html#s3.">RFC2049 section 3.5</a>.
*
* @param maxLineLength the column at which lines are to be wrapped.
* @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getMaxLineLength()
*/
public Renderer setMaxLineLength(final int maxLineLength) {
	// The value is stored as given; no range check is applied here.
	this.maxLineLength = maxLineLength;

	// Return this instance so that setters can be chained.
	return this;
}
/**
* Returns the column at which lines are to be wrapped.
* <p>
* See the {@link #setMaxLineLength(int)} method for a full description of this property.
*
* @return the column at which lines are to be wrapped.
*/
public int getMaxLineLength() {
	// Plain accessor for the configured wrap column.
	return this.maxLineLength;
}
/**
* Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
* <p>
* The default value is <code>"\r\n"</code> <span title="carriage return + line feed">(CR+LF)</span> regardless of the platform on which the library is running.
* This is so that the default configuration produces valid
* <a target="_blank" href="http://tools.ietf.org/html/rfc1521#section-7.1.2">MIME text/plain</a> output, which mandates the use of CR+LF for line breaks.
* <p>
* Specifying a <code>null</code> argument causes the output to use the same new line string as is used in the source document, which is
* determined via the {@link Source#getNewLine()} method.
* If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
* or using the value from the static {@link Config#NewLine} property.
*
* @param newLine the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
* @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getNewLine()
*/
public Renderer setNewLine(final String newLine) {
	// null is accepted and stored as-is; getNewLine() resolves it when it is next called.
	this.newLine = newLine;

	// Return this instance so that setters can be chained.
	return this;
}
/**
* Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
* <p>
* See the {@link #setNewLine(String)} method for a full description of this property.
*
* @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
*/
public String getNewLine() {
	// An explicitly configured newline is returned untouched.
	if (newLine != null) {
		return newLine;
	}
	// Otherwise ask the source document for its best guess and remember the answer in the
	// field, so that later calls return the stored value.
	newLine = rootSegment.source.getBestGuessNewLine();
	return newLine;
}
/**
* Sets whether hyperlink URLs are included in the output.
* <p>
* The default value is <code>true</code>.
* <p>
* When this property is <code>true</code>, the URL of each hyperlink is included in the output as determined by the implementation of the
* {@link #renderHyperlinkURL(StartTag)} method.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd>
* <p>
* Assuming the default implementation of {@link #renderHyperlinkURL(StartTag)}, when this property is <code>true</code>, the following HTML:
* <blockquote class="code">
* <code>&lt;a href="http://jericho.htmlparser.net/"&gt;Jericho HTML Parser&lt;/a&gt;</code>
* </blockquote>
* produces the following output:
* <blockquote class="code">
* <code>Jericho HTML Parser &lt;http://jericho.htmlparser.net/&gt;</code>
* </blockquote>
* </dd>
* </dl>
*
* @param includeHyperlinkURLs specifies whether hyperlink URLs are included in the output.
* @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getIncludeHyperlinkURLs()
*/
public Renderer setIncludeHyperlinkURLs(final boolean includeHyperlinkURLs) {
	// Record the flag; appendTo reads it back through getIncludeHyperlinkURLs().
	this.includeHyperlinkURLs = includeHyperlinkURLs;

	// Return this instance so that setters can be chained.
	return this;
}
/**
* Indicates whether hyperlink URLs are included in the output.
* <p>
* See the {@link #setIncludeHyperlinkURLs(boolean)} method for a full description of this property.
*
* @return <code>true</code> if hyperlink URLs are included in the output, otherwise <code>false</code>.
*/
public boolean getIncludeHyperlinkURLs() {
	// Plain accessor for the configured flag.
	return this.includeHyperlinkURLs;
}
/**
* Renders the hyperlink URL from the specified {@link StartTag}.
* <p>
* A return value of <code>null</code> indicates that the hyperlink URL should not be rendered at all.
* <p>
* The default implementation of this method returns <code>null</code> if the <code>href</code> attribute of the specified start tag
* is '<code>#</code>', starts with "<code>javascript:</code>", or is missing.
* In all other cases it returns the value of the <code>href</code> attribute enclosed in angle brackets.
* <p>
* See the documentation of the {@link #setIncludeHyperlinkURLs(boolean)} method for an example of how a hyperlink is rendered by the default implementation.
* <p>
* This method can be overridden in a subclass to customise the rendering of hyperlink URLs.
* <p>
* Rendering of hyperlink URLs can be disabled completely without overriding this method by setting the
* {@link #setIncludeHyperlinkURLs(boolean) IncludeHyperlinkURLs} property to <code>false</code>.
* <p>
* <dl>
* <dt>Example:</dt>
* <dd>
* To render hyperlink URLs without the enclosing angle brackets:<br /><br />
* <code>
* Renderer renderer=new Renderer(segment) {<br />
* public String renderHyperlinkURL(StartTag startTag) {<br />
* String href=startTag.getAttributeValue("href");<br />
* if (href==null || href.equals("#") || href.startsWith("javascript:")) return null;<br />
* return href;<br />
* }<br />
* };<br />
* String renderedSegment=renderer.toString();
* </code>
* </dd>
* </dl>
* @param startTag the start tag of the hyperlink element, must not be <code>null</code>.
* @return The rendered hyperlink URL from the specified {@link StartTag}, or <code>null</code> if the hyperlink URL should not be rendered.
*/
public String renderHyperlinkURL(final StartTag startTag) {
	final String target = startTag.getAttributeValue("href");

	// No href attribute: nothing to render.
	if (target == null) {
		return null;
	}
	// A bare fragment marker carries no useful destination.
	if (target.equals("#")) {
		return null;
	}
	// Script pseudo-URLs are not rendered (the prefix match is case sensitive).
	if (target.startsWith("javascript:")) {
		return null;
	}
	// Everything else is rendered enclosed in angle brackets.
	return "<" + target + ">";
}
/**
* Sets whether decoration characters are to be included around the content of some
* <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
* <p>
* The default value is <code>false</code>.
* <p>
* Below is a table summarising the decorated elements.
* <p>
* <style type="text/css">
* table#FontStyleElementSummary td, table#FontStyleElementSummary th {text-align: center; padding-bottom: 2px}
* </style>
* <table id="FontStyleElementSummary" class="bordered" cellspacing="0">
* <tr><th title="HTML elements decorated">Elements</th><th title="The character placed around the element content">Character</th><th>Example Output</th></tr>
* <tr><td>{@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG}</td><td><code>*</code></td><td><code>*bold text*</code></td></tr>
* <tr><td>{@link HTMLElementName#I I} and {@link HTMLElementName#EM EM}</td><td><code>/</code></td><td><code>/italic text/</code></td></tr>
* <tr><td>{@link HTMLElementName#U U}</td><td><code>_</code></td><td><code>_underlined text_</code></td></tr>
* <tr><td>{@link HTMLElementName#CODE CODE}</td><td><code>|</code></td><td><code>|code|</code></td></tr>
* </table>
*
* @param decorateFontStyles specifies whether decoration characters are to be included around the content of some font style elements.
* @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
* @see #getDecorateFontStyles()
*/
public Renderer setDecorateFontStyles(final boolean decorateFontStyles) {
	// Record the flag; appendTo reads it back through getDecorateFontStyles().
	this.decorateFontStyles = decorateFontStyles;

	// Return this instance so that setters can be chained.
	return this;
}
/**
* Indicates whether decoration characters are to be included around the content of some
* <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
* <p>
* See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property.
*
* @return <code>true</code> if decoration characters are to be included around the content of some font style elements, otherwise <code>false</code>.
*/
public boolean getDecorateFontStyles() {
	// Plain accessor for the configured flag.
	return this.decorateFontStyles;
}
/**
* Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
* <p>
* The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>Renderer</code> is instantiated.
*
* @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -