📄 htmlrewriter.java

📁 jetspeed源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Copyright 2000-2004 The Apache Software Foundation.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 *
 *
 *  COMPATIBILITY
 *  
 *      [28.01.2001, RammerI] Tested on W2K, with J2SE, JDK 1.3
 *      [29.01.2001, RammerI] Tested on W2K, with JDK 1.2.2
 *
 *
 *
 *  FEATURES
 *      = Rewriting of <A HREFs, <IMG SRCes, <FORM ACTIONs, <TD BACKGROUNDs,
 *          <INPUT SRCs, <APPLET CODEBASEs
 *      = Removal of <SCRIPT>, <STYLE>, <HEAD>, <EMBED>, <OBJECT>, <APPLET>,
 *          <NOSCRIPT>
 * 
 ****
 * Please include the following section in the WebPagePortlet documentation     
 ****
 * <CODE>
 *
 * The following describes how HTML tags are rewritten
 *
 * <!-- --> (HTML Comments)
 *   o Unless otherwise mentioned, comments are striped.
 * 
 * <A>
 *   o HREF attribute   - URL merged with base URL (See Note 1)
 *   o TARGET attribute - Set to "_BLANK" if it does not exist 
 *                        and openInNewWindow = TRUE
 * <AREA>
 *   o HREF attribute   - URL merged with base URL (See Note 1)
 *   o TARGET attribute - Set to "_BLANK" if it does not exist 
 *                        and openInNewWindow = TRUE
 * <APPLET>
 *   o Optionally included
 *   o CODEBASE attribute - Set to the current path if it does
 *                          not exist.
 * 
 * <BASE>
 *   o <HEAD> does NOT have to be included.
 *   o HREF attribute  - Set the Base URL of the page, but the tag
 *                       not set in resulting HTML. URL merged with
 *                       base URL (See Note 1)
 * 
 * <BODY>
 *   o Background attribute - Always striped.
 * 
 * <EMBED>
 *   o May not work.  Not supported by JDK 1.3/
 * 
 * <FORM>
 *   o ACTION attribute - Set to the current URL if it does
 *                        not exist. URL merged with base
 *                        URL (See Note 1)
 * 
 * <IMG>
 *   o SRC attribute - URL merged with base URL (See Note 1)
 * 
 * <INPUT>
 *   o SRC attribute - URL merged with base URL (See Note 1)
 * 
 * <LINK>
 *   o HREF attribute - URL merged with base URL (See Note 1)
 *
 * <OBJECT>
 *   o Optionally included
 *   o CODEBASE attribute - Set to the current path if it does
 *                          not exist. URL merged with base
 *                          URL (See Note 1)
 * 
 * <SCRIPT>
 *   o Optionally included
 *   o Contents may be striped if this tag appears in the <HEAD>
 *     and the contents are NOT in a comment
 *   o SRC attribute - URL merged with base URL (See Note 1)
 *   o Script code that is NOT enclosed in a comment (<!-- -->)
 *     and in the <HEAD> may NOT be in the resulting HTML.  This
 *     is related to the HTML parser in included in the JDK 
 * 
 * <TD>
 *   o BACKGROUND attribute - URL merged with base URL (See Note 1)
 * 
 * Note 1: URL Merging.
 *   This is done because the source of the page sent to the
 *   user's browser is different then source the current page.
 *   Example:
 *     Base URL........ http://jakarta.apache.org/jetspeed
 *     URL............. logo.gif
 *     Resulting URL... http://jakarta.apache.org/jetspeed/logo.gif
 * 
 * </CODE>
 *  KNOWN PROBLEMS
 *
 *
 *  == Seems to have problems with international characters, when the web-pages
 *     are not downloaded from the original URL but taken from the cache.
 *     (To reproduce do the following
 *      1. create a new portlet from the url http://www.sycom.at/default.htm
 *      2. stop tomcat & restart tomcat
 *      3. login and customize your page to include this portlet
 *      4. everything should appear fine, the webpage will show some german 
 *         umlauts
 *      5. shutdown tomcat and restart it
 *      6. jetspeed is now taking the HTML not from www.sycom.at, but from the
 *         cache. Instead of the umlauts, you will see weird characters. 
 *
 *
 *  == Does not yet work with XHTML-Pages but only plain-old HTMLs. I.e. Closed
 *     single tags like <BR /> screw the output up.
 *      
 *
 *
 */
package org.apache.jetspeed.util;

import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.MutableAttributeSet;

// Jetspeed classes
import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
import org.apache.jetspeed.services.logging.JetspeedLogger;

/**
 *
 * @author  Ingo Rammer (rammer@sycom.at)
 * @author <a href="mailto:sgala@apache.org">Santiago Gala</a>
 * @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>
 * @version 0.2
 */

public class HTMLRewriter 
{
    /**
     * Static initialization of the logger for this class
     */    
    private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(HTMLRewriter.class.getName());
    
    private HTMLRewriter.Callback cb = new HTMLRewriter.Callback();
    
/** Sets the parameters for the HTMLRewriter
 * @param removeScript Shall SCRIPT-Tags and their content be removed
 * @param removeStyle Shall STYLE-Tags and their content be removed
 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
 * @param removeMeta Shall META-Tags be removed
 * @param removeApplet Shall APPLET-Tags and their content be removed
 * @param removeObject Shall OBJECT-Tags and their content be removed
 * @param removeHead Shall HEAD-Tags and their content be removed
 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
 */    
    public HTMLRewriter(boolean removeScript,
                        boolean removeStyle,
                        boolean removeNoScript,
                        boolean removeMeta,
                        boolean removeApplet,
                        boolean removeObject,
                        boolean removeHead,
                        boolean removeOnSomething) {
        init ( removeScript,
        removeStyle,
        removeNoScript,
        removeMeta,
        removeApplet,
        removeObject,
        removeHead,
        removeOnSomething,
        false);
    }
        
    /**
     * Sets the parameters for the HTMLRewriter
     * @param removeScript Shall SCRIPT-Tags and their content be removed
     * @param removeStyle Shall STYLE-Tags and their content be removed
     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
     * @param removeMeta Shall META-Tags be removed
     * @param removeApplet Shall APPLET-Tags and their content be removed
     * @param removeObject Shall OBJECT-Tags and their content be removed
     * @param removeHead Shall HEAD-Tags and their content be removed
     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
     */
    public HTMLRewriter(boolean removeScript,
                        boolean removeStyle,
                        boolean removeNoScript,
                        boolean removeMeta,
                        boolean removeApplet,
                        boolean removeObject,
                        boolean removeHead,
                        boolean removeOnSomething,
                        boolean openInNewWindow ) {
        init ( removeScript,
        removeStyle,
        removeNoScript,
        removeMeta,
        removeApplet,
        removeObject,
        removeHead,
        removeOnSomething,
        openInNewWindow ); 
    }

    /**
     * Sets the parameters for the HTMLRewriter
     *
     * @param removeScript Shall SCRIPT-Tags and their content be removed
     * @param removeStyle Shall STYLE-Tags and their content be removed
     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
     * @param removeMeta Shall META-Tags be removed
     * @param removeApplet Shall APPLET-Tags and their content be removed
     * @param removeObject Shall OBJECT-Tags and their content be removed
     * @param removeHead Shall HEAD-Tags and their content be removed
     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
     * @param openInNewWindow Shall links set Target="_blank"
     */
    private void init (boolean removeScript,
                       boolean removeStyle,
                       boolean removeNoScript,
                       boolean removeMeta,
                       boolean removeApplet,
                       boolean removeObject,
                       boolean removeHead,
                       boolean removeOnSomething,
                       boolean openInNewWindow ) 
    {
        cb.removeScript = removeScript;
        cb.removeStyle = removeStyle; 
        cb.removeNoScript = removeNoScript;
        cb.removeMeta = removeMeta;
        cb.removeApplet = removeApplet;
        cb.removeObject = removeObject;
        cb.removeHead = removeHead;
        cb.removeOnSomething = removeOnSomething;    
        cb.openInNewWindow = openInNewWindow;    
    }
    
    /**
     * Does the conversion of the HTML
     * @param HTMLrdr Reader for HTML to be converted
     * @param BaseUrl URL from which this HTML was taken. We be the base-Url
     * for all URL-rewritings.
     * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside
     * the document could not be converted. Should not happen
     * normally, even in badly formatted HTML.
     * @return HTML-String with rewritten URLs and removed (according
     * to constructor-settings) tags
     */
    public synchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException
    {
        HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();        
        String res ="";
        try {
            if (cb.result != null) {
              cb.result = null;
              cb.result = new StringWriter();
            }
            cb.baseUrl = new URL(BaseUrl);
            parse.parse(HTMLrdr,cb,true);
            res = cb.getResult(); 
        } catch (Exception e)
        {
            logger.error( "Unable to convertURLS", e );
            throw new MalformedURLException(e.toString());
        }
        return res;
    }

    
    /** That Class is needed, because getParser is protected and therefore 
     *  only accessibly by a subclass
     */
    class ParserGetter extends HTMLEditorKit {
    /** This is needed, because getParser is protected
     * @return Html Parser
     */        
      public HTMLEditorKit.Parser getParser(){
        return super.getParser();
      }
    } 

    
    class Callback extends HTMLEditorKit.ParserCallback {

        // the base-url of which the given html comes from.
        private URL baseUrl;

        // either handling of <FORM> is buggy, or I made some weird mistake ... 
        // ... JDK 1.3 sends double "</form>"-tags on closing <form>
        private boolean inForm = false; 

        
        // when in multi-part ignored tags (like <script> foobar </script>, 
        // <style> foobar </style>, a counter for the nesting-level will be
        // kept here
        private int ignoreLevel = 0;
        
        private boolean removeScript = true;
        private boolean removeStyle = true; 
        private boolean removeNoScript = true;
        private boolean removeMeta = true;
        private boolean removeApplet = true;
        private boolean removeObject = true;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -