📄 htmlsanitiser.java

📁 HTML解析器是一个Java库
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
import net.htmlparser.jericho.*;
import java.util.*;

/**
 * Provides facilities to sanitise HTML containing unwanted or invalid tags into clean HTML.
 * <p>
 * The sanitation process consists of the following steps:
 * <ul>
 *  <li>
 *   Find all potential HTML tags in the input text. For each tag:
 *   <ul>
 *    <li>If it is one of the allowed tags
 *     (<code>&lt;br&gt;</code>, <code>&lt;p&gt;</code>, <code>&lt;b&gt;</code>, <code>&lt;i&gt;</code>,
 *      <code>&lt;ol&gt;</code>, <code>&lt;ul&gt;</code>, <code>&lt;li&gt;</code>, <code>&lt;a&gt;</code>) then:
 *     <ul>
 *      <li>If a matching end tag is required, check that the end tag exists and is correctly nested. If not, reject the tag.
 *      <li>Check that the element is in a valid position (e.g. <code>&lt;li&gt;</code> elements must be inside <code>&lt;ul&gt;</code> or <code>&lt;ol&gt;</code> elements). If not, reject the element.
 *      <li>Keep only the allowed attributes (<code>id</code>, <code>class</code>, <code>href</code>, <code>target</code>, <code>title</code>) and strip any others.
 *      <li>Ensure all attributes are XHTML compliant (all values enclosed in double quotes and fully encoded)
 *      <li>Ensure tags are XHTML compliant (convert to lower case and add closing slash to empty element tag, e.g. <code>&lt;br /&gt;</code>)
 *     </ul>
 *    </li>
 *    <li>If it is not one of the allowed tags or was rejected for any reason:
 *     <ul>
 *      <li>If the method strips invalid markup, completely remove the tag or element from the output,
 *       otherwise encode it so that it renders verbatim.
 *     </ul>
 *    </li>
 *   </ul>
 *  </li>
 *  <li>
 *   If the <code>formatWhiteSpace</code> option is enabled:
 *   <ul>
 *    <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
 *     are converted to "<code>&lt;br /&gt;</code>".  CR/LF pairs are treated as a single line break.
 *    <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
 *     while ensuring the last is always a normal space.
 *    <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
 *   </ul>
 *  </li>
 *  <li>Ensure all remainding text is fully encoded.
 * </ul>
 */
public class HTMLSanitiser {
	private HTMLSanitiser() {} // not instantiable

	// list of HTML elements that will be retained in the final output:
	private static final Set<String> VALID_ELEMENT_NAMES=new HashSet<String>(Arrays.asList(new String[] {
		HTMLElementName.BR,
		HTMLElementName.P,
		HTMLElementName.B,
		HTMLElementName.I,
		HTMLElementName.OL,
		HTMLElementName.UL,
		HTMLElementName.LI,
		HTMLElementName.A
	}));

	// list of HTML attributes that will be retained in the final output:
	private static final Set<String> VALID_ATTRIBUTE_NAMES=new HashSet<String>(Arrays.asList(new String[] {
		"id","class","href","target","title"
	}));

	private static final Object VALID_MARKER=new Object();

	/**
	 * Returns a sanitised version of the specified HTML, encoding any unwanted tags.
	 * <p>
	 * Calling this method is equivalent to {@link #encodeInvalidMarkup(String,boolean) encodeInvalidMarkup(pseudoHTML,false)}.
	 * <p>
	 * <dl>
	 *  <dt><b>Example:</b></dt>
	 *  <dd>
	 *   <table border="1">
	 *    <tr><td>Method call:</td><td><pre style="margin:0">HTMLSanitiser.encodeInvalidMarkup("&lt;P&gt;&lt;u&gt;Line   1&lt;/u&gt;\n&lt;b&gt;Line   2&lt;/b&gt;\n&lt;script&gt;doBadStuff()&lt;/script&gt;")</pre></td></tr>
	 *    <tr><td>Output:</td><td><pre style="margin:0">&lt;p&gt;&amp;lt;u&amp;gt;Line   1&amp;lt;/u&amp;gt;\n&lt;b&gt;Line   2&lt;/b&gt;\n&amp;lt;script&amp;gt;doBadStuff()&amp;lt;/script&amp;gt;&lt;/p&gt;</pre></td></tr>
	 *    <tr><td>Rendered output:</td><td><p>&lt;u&gt;Line   1&lt;/u&gt; <b>Line   2</b> &lt;script&gt;doBadStuff()&lt;/script&gt;</p></td></tr>
	 *   </table>
	 *   In this example:
	 *   <ul>
	 *    <li>The <code>&lt;P&gt;</code> tag is kept and converted to lower case
	 *    <li>The optional end tag <code>&lt;/p&gt;</code> is added
	 *    <li>The <code>&lt;b&gt;</code> element is kept
	 *    <li>The unwanted <code>&lt;u&gt;</code> and <code>&lt;script&gt;</code> elements are encoded so that they render verbatim
	 *   </ul>
	 *  </dd>
	 * </dl>
	 * 
	 * @param pseudoHTML  The potentially invalid HTML to sanitise.
	 * @return a sanitised version of the specified HTML, encoding any unwanted tags.
	 */
	public static String encodeInvalidMarkup(String pseudoHTML) {
		return encodeInvalidMarkup(pseudoHTML,false);
	}

	/**
	 * Returns a sanitised version of the specified HTML, encoding any unwanted tags.
	 * <p>
	 * Encoding unwanted and invalid tags results in them appearing verbatim in the rendered output,
	 * helping to highlight the problem so that the source HTML can be fixed.
	 * <p>
	 * Specifying a value of <code>true</code> as an argument to the <code>formatWhiteSpace</code> parameter
	 * results in the formatting of white space as described in the sanitisation process in the class description above.
	 * <p>
	 * <dl>
	 *  <dt><b>Example:</b></dt>
	 *  <dd>
	 *   <table border="1">
	 *    <tr><td>Method call:</td><td><pre style="margin:0">HTMLSanitiser.encodeInvalidMarkup("&lt;P&gt;&lt;u&gt;Line   1&lt;/u&gt;\n&lt;b&gt;Line   2&lt;/b&gt;\n&lt;script&gt;doBadStuff()&lt;/script&gt;",true)</pre></td></tr>
	 *    <tr><td>Output:</td><td><pre style="margin:0">&lt;p&gt;&amp;lt;u&amp;gt;Line &amp;nbsp; 1&amp;lt;/u&amp;gt;&lt;br /&gt;&lt;b&gt;Line &amp;nbsp; 2&lt;/b&gt;&lt;br /&gt;&amp;lt;script&amp;gt;doBadStuff()&amp;lt;/script&amp;gt;&lt;/p&gt;</pre></td></tr>
	 *    <tr><td>Rendered output:</td><td><p>&lt;u&gt;Line &nbsp; 1&lt;/u&gt;<br /><b>Line &nbsp; 2</b><br />&lt;script&gt;doBadStuff()&lt;/script&gt;</p></td></tr>
	 *   </table>
	 *   In this example:
	 *   <ul>
	 *    <li>The <code>&lt;P&gt;</code> tag is kept and converted to lower case
	 *    <li>The optional end tag <code>&lt;/p&gt;</code> is added
	 *    <li>The <code>&lt;b&gt;</code> element is kept
	 *    <li>The unwanted <code>&lt;u&gt;</code> and <code>&lt;script&gt;</code> elements are encoded so that they render verbatim
	 *    <li>The line feed characters are converted to <code>&lt;br /&gt;</code> elements
	 *    <li>Non-breaking spaces (<code>&amp;nbsp;</code>) are added to ensure the multiple spaces are rendered as they appear in the input.
	 *   </ul>
	 *  </dd>
	 * </dl>
	 * 
	 * @param pseudoHTML  The potentially invalid HTML to sanitise.
	 * @param formatWhiteSpace  Specifies whether white space should be marked up in the output.
	 * @return a sanitised version of the specified HTML, encoding any unwanted tags.
	 */
	public static String encodeInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) {
		return sanitise(pseudoHTML,formatWhiteSpace,false);
	}

	/**
	 * Returns a sanitised version of the specified HTML, stripping any unwanted tags.
	 * <p>
	 * Calling this method is equivalent to {@link #stripInvalidMarkup(String,boolean) stripInvalidMarkup(pseudoHTML,false)}.
	 * <p>
	 * <dl>
	 *  <dt><b>Example:</b></dt>
	 *  <dd>
	 *   <table border="1">
	 *    <tr><td>Method call:</td><td><pre style="margin:0">HTMLSanitiser.stripInvalidMarkup("&lt;P&gt;&lt;u&gt;Line   1&lt;/u&gt;\n&lt;b&gt;Line   2&lt;/b&gt;\n&lt;script&gt;doBadStuff()&lt;/script&gt;")</pre></td></tr>
	 *    <tr><td>Output:</td><td><pre style="margin:0">&lt;p&gt;Line   1\n&lt;b&gt;Line   2&lt;/b&gt;\n&lt;/p&gt;</pre></td></tr>
	 *    <tr><td>Rendered output:</td><td><p>Line   1 <b>Line   2</b> </p></td></tr>
	 *   </table>
	 *   In this example:
	 *   <ul>
	 *    <li>The <code>&lt;P&gt;</code> tag is kept and converted to lower case
	 *    <li>The optional end tag <code>&lt;/p&gt;</code> is added
	 *    <li>The <code>&lt;b&gt;</code> element is kept
	 *    <li>The unwanted <code>&lt;u&gt;</code> and <code>&lt;script&gt;</code> elements are stripped from the output
	 *   </ul>
	 *  </dd>
	 * </dl>
	 * 
	 * @param pseudoHTML  The potentially invalid HTML to sanitise.
	 * @return a sanitised version of the specified HTML, stripping any unwanted tags.
	 */
	public static String stripInvalidMarkup(String pseudoHTML) {
		return stripInvalidMarkup(pseudoHTML,false);
	}

	/**
	 * Returns a sanitised version of the specified HTML, stripping any unwanted tags.
	 * <p>
	 * Stripping unwanted and invalid tags is the preferred option if the output is for public consumption.
	 * <p>
12 下一页
💿 文件大小 2098 K
👤 上传用户 Lxb500
📂 所属分类 Java编程
🏷️ 相关标签

#HTML #Java #解析器
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -