// htmlsanitiser.java
import net.htmlparser.jericho.*;
import java.util.*;
/**
* Provides facilities to sanitise HTML containing unwanted or invalid tags into clean HTML.
* <p>
* The sanitation process consists of the following steps:
* <ul>
* <li>
* Find all potential HTML tags in the input text. For each tag:
* <ul>
* <li>If it is one of the allowed tags
* (<code>&lt;br&gt;</code>, <code>&lt;p&gt;</code>, <code>&lt;b&gt;</code>, <code>&lt;i&gt;</code>,
* <code>&lt;ol&gt;</code>, <code>&lt;ul&gt;</code>, <code>&lt;li&gt;</code>, <code>&lt;a&gt;</code>) then:
* <ul>
* <li>If a matching end tag is required, check that the end tag exists and is correctly nested. If not, reject the tag.
* <li>Check that the element is in a valid position (e.g. <code>&lt;li&gt;</code> elements must be inside <code>&lt;ul&gt;</code> or <code>&lt;ol&gt;</code> elements). If not, reject the element.
* <li>Keep only the allowed attributes (<code>id</code>, <code>class</code>, <code>href</code>, <code>target</code>, <code>title</code>) and strip any others.
* <li>Ensure all attributes are XHTML compliant (all values enclosed in double quotes and fully encoded)
* <li>Ensure tags are XHTML compliant (convert to lower case and add closing slash to empty element tag, e.g. <code>&lt;br /&gt;</code>)
* </ul>
* </li>
* <li>If it is not one of the allowed tags or was rejected for any reason:
* <ul>
* <li>If the method strips invalid markup, completely remove the tag or element from the output,
* otherwise encode it so that it renders verbatim.
* </ul>
* </li>
* </ul>
* </li>
* <li>
* If the <code>formatWhiteSpace</code> option is enabled:
* <ul>
* <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
* are converted to "<code>&lt;br /&gt;</code>". CR/LF pairs are treated as a single line break.
* <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
* while ensuring the last is always a normal space.
* <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
* </ul>
* </li>
* <li>Ensure all remaining text is fully encoded.
* </ul>
*/
public class HTMLSanitiser {
// Private no-argument constructor: prevents instantiation from outside this class.
private HTMLSanitiser() {} // not instantiable
// Names of the HTML elements that will be retained in the final output:
private static final Set<String> VALID_ELEMENT_NAMES=new HashSet<String>(Arrays.asList(
    HTMLElementName.BR,
    HTMLElementName.P,
    HTMLElementName.B,
    HTMLElementName.I,
    HTMLElementName.OL,
    HTMLElementName.UL,
    HTMLElementName.LI,
    HTMLElementName.A
));
// Names of the HTML attributes that will be retained in the final output:
private static final Set<String> VALID_ATTRIBUTE_NAMES=new HashSet<String>(
    Arrays.asList("id","class","href","target","title")
);
// Unique sentinel object shared by the class. Its use lies outside the visible part of this file;
// NOTE(review): presumably attached to tags/elements that passed validation - confirm against sanitise().
private static final Object VALID_MARKER=new Object();
/**
 * Sanitises the specified HTML, encoding any unwanted tags so that they render verbatim.
 * <p>
 * This is a convenience overload; calling it is equivalent to
 * {@link #encodeInvalidMarkup(String,boolean) encodeInvalidMarkup(pseudoHTML,false)}.
 * <p>
 * <dl>
 * <dt><b>Example:</b></dt>
 * <dd>
 * <table border="1">
 * <tr><td>Method call:</td><td><pre style="margin:0">HTMLSanitiser.encodeInvalidMarkup("&lt;P&gt;&lt;u&gt;Line 1&lt;/u&gt;\n&lt;b&gt;Line 2&lt;/b&gt;\n&lt;script&gt;doBadStuff()&lt;/script&gt;")</pre></td></tr>
 * <tr><td>Output:</td><td><pre style="margin:0">&lt;p&gt;&amp;lt;u&amp;gt;Line 1&amp;lt;/u&amp;gt;\n&lt;b&gt;Line 2&lt;/b&gt;\n&amp;lt;script&amp;gt;doBadStuff()&amp;lt;/script&amp;gt;&lt;/p&gt;</pre></td></tr>
 * <tr><td>Rendered output:</td><td><p>&lt;u&gt;Line 1&lt;/u&gt; <b>Line 2</b> &lt;script&gt;doBadStuff()&lt;/script&gt;</p></td></tr>
 * </table>
 * In this example:
 * <ul>
 * <li>The <code>&lt;P&gt;</code> tag is kept and converted to lower case
 * <li>The optional end tag <code>&lt;/p&gt;</code> is added
 * <li>The <code>&lt;b&gt;</code> element is kept
 * <li>The unwanted <code>&lt;u&gt;</code> and <code>&lt;script&gt;</code> elements are encoded so that they render verbatim
 * </ul>
 * </dd>
 * </dl>
 *
 * @param pseudoHTML The potentially invalid HTML to sanitise.
 * @return a sanitised version of the specified HTML, encoding any unwanted tags.
 */
public static String encodeInvalidMarkup(String pseudoHTML) {
    // This overload always requests the variant without white space formatting.
    final boolean formatWhiteSpace=false;
    return encodeInvalidMarkup(pseudoHTML,formatWhiteSpace);
}
/**
 * Sanitises the specified HTML, encoding any unwanted tags, with optional formatting of white space.
 * <p>
 * Because unwanted and invalid tags are encoded, they appear verbatim in the rendered output,
 * which helps to highlight the problem so that the source HTML can be fixed.
 * <p>
 * Passing <code>true</code> as the <code>formatWhiteSpace</code> argument
 * results in white space being formatted as described in the sanitisation process in the class description.
 * <p>
 * <dl>
 * <dt><b>Example:</b></dt>
 * <dd>
 * <table border="1">
 * <tr><td>Method call:</td><td><pre style="margin:0">HTMLSanitiser.encodeInvalidMarkup("&lt;P&gt;&lt;u&gt;Line   1&lt;/u&gt;\n&lt;b&gt;Line   2&lt;/b&gt;\n&lt;script&gt;doBadStuff()&lt;/script&gt;",true)</pre></td></tr>
 * <tr><td>Output:</td><td><pre style="margin:0">&lt;p&gt;&amp;lt;u&amp;gt;Line &amp;nbsp; 1&amp;lt;/u&amp;gt;&lt;br /&gt;&lt;b&gt;Line &amp;nbsp; 2&lt;/b&gt;&lt;br /&gt;&amp;lt;script&amp;gt;doBadStuff()&amp;lt;/script&amp;gt;&lt;/p&gt;</pre></td></tr>
 * <tr><td>Rendered output:</td><td><p>&lt;u&gt;Line &nbsp; 1&lt;/u&gt;<br /><b>Line &nbsp; 2</b><br />&lt;script&gt;doBadStuff()&lt;/script&gt;</p></td></tr>
 * </table>
 * In this example:
 * <ul>
 * <li>The <code>&lt;P&gt;</code> tag is kept and converted to lower case
 * <li>The optional end tag <code>&lt;/p&gt;</code> is added
 * <li>The <code>&lt;b&gt;</code> element is kept
 * <li>The unwanted <code>&lt;u&gt;</code> and <code>&lt;script&gt;</code> elements are encoded so that they render verbatim
 * <li>The line feed characters are converted to <code>&lt;br /&gt;</code> elements
 * <li>Non-breaking spaces (<code>&amp;nbsp;</code>) are added to ensure the multiple spaces are rendered as they appear in the input.
 * </ul>
 * </dd>
 * </dl>
 *
 * @param pseudoHTML The potentially invalid HTML to sanitise.
 * @param formatWhiteSpace Specifies whether white space should be marked up in the output.
 * @return a sanitised version of the specified HTML, encoding any unwanted tags.
 */
public static String encodeInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) {
    // Delegates to the shared sanitise routine; the final argument is fixed to false for this encoding variant.
    final String sanitised=sanitise(pseudoHTML,formatWhiteSpace,false);
    return sanitised;
}
/**
 * Sanitises the specified HTML, stripping any unwanted tags from the output.
 * <p>
 * This is a convenience overload; calling it is equivalent to
 * {@link #stripInvalidMarkup(String,boolean) stripInvalidMarkup(pseudoHTML,false)}.
 * <p>
 * <dl>
 * <dt><b>Example:</b></dt>
 * <dd>
 * <table border="1">
 * <tr><td>Method call:</td><td><pre style="margin:0">HTMLSanitiser.stripInvalidMarkup("&lt;P&gt;&lt;u&gt;Line 1&lt;/u&gt;\n&lt;b&gt;Line 2&lt;/b&gt;\n&lt;script&gt;doBadStuff()&lt;/script&gt;")</pre></td></tr>
 * <tr><td>Output:</td><td><pre style="margin:0">&lt;p&gt;Line 1\n&lt;b&gt;Line 2&lt;/b&gt;\n&lt;/p&gt;</pre></td></tr>
 * <tr><td>Rendered output:</td><td><p>Line 1 <b>Line 2</b> </p></td></tr>
 * </table>
 * In this example:
 * <ul>
 * <li>The <code>&lt;P&gt;</code> tag is kept and converted to lower case
 * <li>The optional end tag <code>&lt;/p&gt;</code> is added
 * <li>The <code>&lt;b&gt;</code> element is kept
 * <li>The unwanted <code>&lt;u&gt;</code> and <code>&lt;script&gt;</code> elements are stripped from the output
 * </ul>
 * </dd>
 * </dl>
 *
 * @param pseudoHTML The potentially invalid HTML to sanitise.
 * @return a sanitised version of the specified HTML, stripping any unwanted tags.
 */
public static String stripInvalidMarkup(String pseudoHTML) {
    // Forward to the two-argument overload with its boolean argument fixed to false.
    final String stripped=stripInvalidMarkup(pseudoHTML,false);
    return stripped;
}
/**
* Returns a sanitised version of the specified HTML, stripping any unwanted tags.
* <p>
* Stripping unwanted and invalid tags is the preferred option if the output is for public consumption.
* <p>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -