📄 htmlrewriter.java
字号:
/*
* Copyright 2000-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
*
* COMPATIBILITY
*
* [28.01.2001, RammerI] Tested on W2K, with J2SE, JDK 1.3
* [29.01.2001, RammerI] Tested on W2K, with JDK 1.2.2
*
*
*
* FEATURES
* = Rewriting of <A HREFs, <IMG SRCes, <FORM ACTIONs, <TD BACKGROUNDs,
* <INPUT SRCs, <APPLET CODEBASEs
* = Removal of <SCRIPT>, <STYLE>, <HEAD>, <EMBED>, <OBJECT>, <APPLET>,
* <NOSCRIPT>
*
****
* Please include the following section in the WebPagePortlet documentation
****
* <CODE>
*
* The following describes how HTML tags are rewritten
*
* <!-- --> (HTML Comments)
* o Unless otherwise mentioned, comments are stripped.
*
* <A>
* o HREF attribute - URL merged with base URL (See Note 1)
* o TARGET attribute - Set to "_BLANK" if it does not exist
* and openInNewWindow = TRUE
* <AREA>
* o HREF attribute - URL merged with base URL (See Note 1)
* o TARGET attribute - Set to "_BLANK" if it does not exist
* and openInNewWindow = TRUE
* <APPLET>
* o Optionally included
* o CODEBASE attribute - Set to the current path if it does
* not exist.
*
* <BASE>
* o <HEAD> does NOT have to be included.
* o HREF attribute - Set the Base URL of the page, but the tag
* not set in resulting HTML. URL merged with
* base URL (See Note 1)
*
* <BODY>
* o Background attribute - Always stripped.
*
* <EMBED>
* o May not work. Not supported by JDK 1.3.
*
* <FORM>
* o ACTION attribute - Set to the current URL if it does
* not exist. URL merged with base
* URL (See Note 1)
*
* <IMG>
* o SRC attribute - URL merged with base URL (See Note 1)
*
* <INPUT>
* o SRC attribute - URL merged with base URL (See Note 1)
*
* <LINK>
* o HREF attribute - URL merged with base URL (See Note 1)
*
* <OBJECT>
* o Optionally included
* o CODEBASE attribute - Set to the current path if it does
* not exist. URL merged with base
* URL (See Note 1)
*
* <SCRIPT>
* o Optionally included
* o Contents may be stripped if this tag appears in the <HEAD>
* and the contents are NOT in a comment
* o SRC attribute - URL merged with base URL (See Note 1)
* o Script code that is NOT enclosed in a comment (<!-- -->)
* and in the <HEAD> may NOT be in the resulting HTML. This
* is related to the HTML parser included in the JDK
*
* <TD>
* o BACKGROUND attribute - URL merged with base URL (See Note 1)
*
* Note 1: URL Merging.
* This is done because the source of the page sent to the
* user's browser is different from the source of the current page.
* Example:
* Base URL........ http://jakarta.apache.org/jetspeed
* URL............. logo.gif
* Resulting URL... http://jakarta.apache.org/jetspeed/logo.gif
*
* </CODE>
* KNOWN PROBLEMS
*
*
* == Seems to have problems with international characters, when the web-pages
* are not downloaded from the original URL but taken from the cache.
* (To reproduce do the following
* 1. create a new portlet from the url http://www.sycom.at/default.htm
* 2. stop tomcat & restart tomcat
* 3. login and customize your page to include this portlet
* 4. everything should appear fine, the webpage will show some german
* umlauts
* 5. shutdown tomcat and restart it
* 6. jetspeed is now taking the HTML not from www.sycom.at, but from the
* cache. Instead of the umlauts, you will see weird characters.
*
*
* == Does not yet work with XHTML-Pages but only plain-old HTMLs. I.e. Closed
* single tags like <BR /> screw the output up.
*
*
*
*/
package org.apache.jetspeed.util;
import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.MutableAttributeSet;
// Jetspeed classes
import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
import org.apache.jetspeed.services.logging.JetspeedLogger;
/**
*
* @author Ingo Rammer (rammer@sycom.at)
* @author <a href="mailto:sgala@apache.org">Santiago Gala</a>
* @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>
* @version 0.2
*/
public class HTMLRewriter
{
/**
* Static initialization of the logger for this class
*/
private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(HTMLRewriter.class.getName());
/**
* Parser callback owned by this rewriter. It carries the removal/rewrite
* switches set through the constructors, receives the base URL for each
* conversion, and is handed to the HTML parser in convertURLs, which then
* reads the converted HTML back from it.
*/
private HTMLRewriter.Callback cb = new HTMLRewriter.Callback();
/**
 * Creates a rewriter with the openInNewWindow option switched off.
 * All other options are taken from the arguments.
 *
 * @param removeScript      Shall SCRIPT-Tags and their content be removed
 * @param removeStyle       Shall STYLE-Tags and their content be removed
 * @param removeNoScript    Shall NOSCRIPT-Tags and their content be removed
 * @param removeMeta        Shall META-Tags be removed
 * @param removeApplet      Shall APPLET-Tags and their content be removed
 * @param removeObject      Shall OBJECT-Tags and their content be removed
 * @param removeHead        Shall HEAD-Tags and their content be removed
 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
 */
public HTMLRewriter(boolean removeScript,
                    boolean removeStyle,
                    boolean removeNoScript,
                    boolean removeMeta,
                    boolean removeApplet,
                    boolean removeObject,
                    boolean removeHead,
                    boolean removeOnSomething) {
    // Chain to the nine-argument constructor; "false" is openInNewWindow.
    this(removeScript, removeStyle, removeNoScript, removeMeta,
         removeApplet, removeObject, removeHead, removeOnSomething,
         false);
}
/**
 * Creates a rewriter with every option given explicitly.
 *
 * @param removeScript      Shall SCRIPT-Tags and their content be removed
 * @param removeStyle       Shall STYLE-Tags and their content be removed
 * @param removeNoScript    Shall NOSCRIPT-Tags and their content be removed
 * @param removeMeta        Shall META-Tags be removed
 * @param removeApplet      Shall APPLET-Tags and their content be removed
 * @param removeObject      Shall OBJECT-Tags and their content be removed
 * @param removeHead        Shall HEAD-Tags and their content be removed
 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
 * @param openInNewWindow   Shall links set Target="_blank"
 */
public HTMLRewriter(boolean removeScript,
                    boolean removeStyle,
                    boolean removeNoScript,
                    boolean removeMeta,
                    boolean removeApplet,
                    boolean removeObject,
                    boolean removeHead,
                    boolean removeOnSomething,
                    boolean openInNewWindow) {
    // All option handling lives in init(); pass the flags through unchanged.
    init(removeScript, removeStyle, removeNoScript, removeMeta,
         removeApplet, removeObject, removeHead,
         removeOnSomething, openInNewWindow);
}
/**
 * Copies the given option flags onto the parser callback of this rewriter.
 *
 * @param removeScript      Shall SCRIPT-Tags and their content be removed
 * @param removeStyle       Shall STYLE-Tags and their content be removed
 * @param removeNoScript    Shall NOSCRIPT-Tags and their content be removed
 * @param removeMeta        Shall META-Tags be removed
 * @param removeApplet      Shall APPLET-Tags and their content be removed
 * @param removeObject      Shall OBJECT-Tags and their content be removed
 * @param removeHead        Shall HEAD-Tags and their content be removed
 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
 * @param openInNewWindow   Shall links set Target="_blank"
 */
private void init (boolean removeScript,
                   boolean removeStyle,
                   boolean removeNoScript,
                   boolean removeMeta,
                   boolean removeApplet,
                   boolean removeObject,
                   boolean removeHead,
                   boolean removeOnSomething,
                   boolean openInNewWindow )
{
    final HTMLRewriter.Callback settings = cb;

    // Switches for removing whole tags
    settings.removeHead = removeHead;
    settings.removeMeta = removeMeta;
    settings.removeScript = removeScript;
    settings.removeNoScript = removeNoScript;
    settings.removeStyle = removeStyle;
    settings.removeApplet = removeApplet;
    settings.removeObject = removeObject;

    // Switches that act on attributes
    settings.removeOnSomething = removeOnSomething;
    settings.openInNewWindow = openInNewWindow;
}
/**
 * Does the conversion of the HTML.
 * <p>
 * The method is synchronized on this rewriter because it works on the
 * single callback instance held by the rewriter.
 *
 * @param HTMLrdr Reader for HTML to be converted
 * @param BaseUrl URL from which this HTML was taken. Will be the base URL
 *                for all URL rewritings.
 * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside
 *                the document could not be converted. Should not happen
 *                normally, even in badly formatted HTML. Any other exception
 *                raised while parsing is logged and reported as a
 *                MalformedURLException as well, with the original exception
 *                attached as its cause.
 * @return HTML-String with rewritten URLs and removed (according
 *         to constructor-settings) tags
 */
public synchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException
{
    HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();
    try {
        // If the callback already holds a result buffer, swap in a fresh one
        // before parsing.
        if (cb.result != null) {
            cb.result = new StringWriter();
        }
        cb.baseUrl = new URL(BaseUrl);
        parse.parse(HTMLrdr, cb, true);
        return cb.getResult();
    } catch (Exception e) {
        logger.error( "Unable to convertURLS", e );
        // MalformedURLException has no (message, cause) constructor, so the
        // original exception is attached with initCause to keep its stack trace.
        MalformedURLException wrapped = new MalformedURLException(e.toString());
        wrapped.initCause(e);
        throw wrapped;
    }
}
/**
 * Small HTMLEditorKit subclass whose only purpose is to expose the kit's
 * parser: getParser is protected in HTMLEditorKit and therefore only
 * reachable from a subclass.
 */
class ParserGetter extends HTMLEditorKit {
    /**
     * Publicly accessible variant of the protected getParser of the kit.
     *
     * @return Html Parser
     */
    public HTMLEditorKit.Parser getParser() {
        final HTMLEditorKit.Parser kitParser = super.getParser();
        return kitParser;
    }
}
class Callback extends HTMLEditorKit.ParserCallback {
// the base-url of which the given html comes from.
private URL baseUrl;
// either handling of <FORM> is buggy, or I made some weird mistake ...
// ... JDK 1.3 sends double "</form>"-tags on closing <form>
private boolean inForm = false;
// when in multi-part ignored tags (like <script> foobar </script>,
// <style> foobar </style>, a counter for the nesting-level will be
// kept here
private int ignoreLevel = 0;
private boolean removeScript = true;
private boolean removeStyle = true;
private boolean removeNoScript = true;
private boolean removeMeta = true;
private boolean removeApplet = true;
private boolean removeObject = true;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -