htmlcleaner.java

来自「html过滤 html过滤 html过滤」· Java 代码 · 共 902 行 · 第 1/2 页

JAVA
902
字号
/*  Copyright (c) 2006-2007, Vladimir Nikic
    All rights reserved.
	
    Redistribution and use of this software in source and binary forms, 
    with or without modification, are permitted provided that the following 
    conditions are met:
	
    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimer.
	
    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimer in the documentation and/or other
      materials provided with the distribution.
	
    * The name of HtmlCleaner may not be used to endorse or promote 
      products derived from this software without specific prior
      written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
    POSSIBILITY OF SUCH DAMAGE.
	
    You can contact Vladimir Nikic by sending e-mail to
    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
    subject line.
*/

package org.htmlcleaner;

import java.io.*;
import java.net.URL;
import java.util.*;

/**
 * Main HtmlCleaner class.
 *
 * <p>It represents the public interface to the user. Its task is to call the tokenizer with
 * the specified source HTML, traverse the list of produced tokens and create the internal
 * object model. It also offers a set of methods to write the resulting XML to a string,
 * file or any output stream.</p>
 * <p>Typical usage is the following:</p>
 *
 * <xmp>
 *      HtmlCleaner cleaner = new HtmlCleaner(...);     // one of few constructors
 *      cleaner.setXXX(...)                             // optionally, set cleaner's behaviour
 *      cleaner.clean();                                // calls cleaning process
 *      cleaner.writeXmlXXX(...)                        // writes resulting XML to string, file or any output stream
 * </xmp>
 *
 * Created by: Vladimir Nikic <br/>
 * Date: November, 2006
 */
public class HtmlCleaner {

    /**
     * Charset name passed on by the constructors that do not take an explicit
     * charset; read from the JVM's <code>file.encoding</code> system property.
     */
    public static final String DEFAULT_CHARSET = System.getProperty("file.encoding");
    
    // Numeric identifiers of the output styles.
    // NOTE(review): the code that consumes them is not in this part of the file -
    // presumably the writeXml* methods; confirm there.
    private static final int WRITE_METHOD_SIMPLE = 0;  
    private static final int WRITE_METHOD_COMPACT = 1;  
    private static final int WRITE_METHOD_PRETTY = 2;  

    /**
     * Describes one open tag: the position it was registered with, its name
     * and the {@link TagInfo} that the cleaner's tag info provider returns
     * for that name (looked up once, at construction time).
     */
    private class TagPos {
        private int position;
        private String name;
        private TagInfo info;

        /**
         * @param tagPosition position to remember for the tag
         * @param tagName name of the tag; also used for the TagInfo lookup
         */
        TagPos(int tagPosition, String tagName) {
            position = tagPosition;
            name = tagName;
            info = tagInfoProvider.getTagInfo(tagName);
        }
    }

    /**
     * Bookkeeping for tags that are open but not handled yet.
     * Keeps the open tags in the order they were added, remembers the most
     * recently added one and records every tag name that was ever added.
     */
    private class OpenTags {
        private List list = new ArrayList();
        private TagPos last = null;
        private Set set = new HashSet();

        /**
         * @return true if no tag is currently open
         */
        private boolean isEmpty() {
            return list.isEmpty();
        }

        /**
         * Registers a newly open tag and makes it the last open tag.
         * @param tagName name of the tag
         * @param position position stored with the tag
         */
        private void addTag(String tagName, int position) {
            TagPos added = new TagPos(position, tagName);
            list.add(added);
            set.add(tagName);
            last = added;
        }

        /**
         * Removes the most recently added open tag with the given name, if any,
         * and refreshes the last open tag. The name stays in the set of
         * encountered tags.
         * @param tagName name of the tag to remove
         */
        private void removeTag(String tagName) {
            for (int i = list.size() - 1; i >= 0; i--) {
                TagPos candidate = (TagPos) list.get(i);
                if (tagName.equals(candidate.name)) {
                    list.remove(i);
                    break;
                }
            }

            if (list.isEmpty()) {
                last = null;
            } else {
                last = (TagPos) list.get(list.size() - 1);
            }
        }

        /**
         * @return the earliest added tag that is still open, or null if there is none
         */
        private TagPos findFirstTagPos() {
            if (list.isEmpty()) {
                return null;
            }
            return (TagPos) list.get(0);
        }

        /**
         * @return the most recently added tag that is still open, or null if there is none
         */
        private TagPos getLastTagPos() {
            return last;
        }

        /**
         * Searches the open tags from the newest to the oldest.
         * @param tagName name to look for; null never matches
         * @return the newest open tag with that name, or null if not found
         */
        private TagPos findTag(String tagName) {
            if (tagName == null) {
                return null;
            }

            for (int i = list.size() - 1; i >= 0; i--) {
                TagPos candidate = (TagPos) list.get(i);
                if (tagName.equals(candidate.name)) {
                    return candidate;
                }
            }

            return null;
        }

        /**
         * @param tagName name to look for
         * @return true if a tag with that name is currently open
         */
        private boolean tagExists(String tagName) {
            return findTag(tagName) != null;
        }

        /**
         * Walks the open tags from the newest to the oldest. When it reaches a
         * tag that has no TagInfo or whose TagInfo allows anything, and a newer
         * tag was already visited, that newer tag is returned. Otherwise the
         * oldest open tag is returned (null when nothing is open).
         */
        private TagPos findTagToPlaceRubbish() {
            TagPos current = null;
            TagPos newer = null;

            for (int i = list.size() - 1; i >= 0; i--) {
                current = (TagPos) list.get(i);
                boolean permissive = current.info == null || current.info.allowsAnything();
                if (permissive && newer != null) {
                    return newer;
                }
                newer = current;
            }

            return current;
        }

        /**
         * @param tagName name to look for
         * @return true if a tag with that name was ever added, open or not
         */
        private boolean tagEncountered(String tagName) {
            return set.contains(tagName);
        }

        /**
         * Checks if any of the tags named in the given set is currently open.
         * @param tags set of tag names
         * @return true if at least one open tag has its name in the set
         */
        private boolean someAlreadyOpen(Set tags) {
            for (Iterator it = list.iterator(); it.hasNext(); ) {
                TagPos open = (TagPos) it.next();
                if (tags.contains(open.name)) {
                    return true;
                }
            }

            return false;
        }
    }

    // Supplies the TagInfo for a tag name; queried every time a TagPos is created.
    private ITagInfoProvider tagInfoProvider;

    // Character source of the HTML to clean; handed out through getReader().
    private Reader reader;
    // Open-tag bookkeeping used by the checks in this class.
    private transient OpenTags _openTags = new OpenTags();
    // Doctype token, read and written through getDoctype()/setDoctype(); null until set.
    private transient DoctypeToken _docType = null;
    // Emptied at the start of every clean() call.
    // NOTE(review): presumably collects the names of all tags met while cleaning - confirm.
    private Set allTags = new TreeSet(); 

    // Behaviour switches with their default values.
    // NOTE(review): the code that reads them is not in this part of the file;
    // their meaning is suggested by the names only - confirm against the setters/writers.
    private boolean advancedXmlEscape = true;
    private boolean useCdataForScriptAndStyle = true;
    private boolean translateSpecialEntities = true;
    private boolean recognizeUnicodeChars = true;
    private boolean omitUnknownTags = false;
    private boolean omitDeprecatedTags = false;
    private boolean omitComments = false;
    private boolean omitXmlDeclaration = false;
    private boolean omitDoctypeDeclaration = true;
    private boolean omitXmlnsAttributes = false;
    private String hyphenReplacementInComment = "=";

    // Skeleton nodes re-created by each clean() call: the html node with
    // head and body added as its children, in that order.
    private TagNode htmlNode;
    private TagNode bodyNode;
    private TagNode headNode;

	/**
	 * Constructor - creates the instance that reads the HTML from the
	 * given string.
	 * @param htmlContent HTML source to be cleaned
	 * @param tagInfoProvider provider of tag descriptions; when null,
	 *        <code>HtmlTagProvider.getInstance()</code> is used
	 */
	public HtmlCleaner(String htmlContent, ITagInfoProvider tagInfoProvider) {
		this.reader = new StringReader(htmlContent);
		if (tagInfoProvider != null) {
			this.tagInfoProvider = tagInfoProvider;
		} else {
			this.tagInfoProvider = HtmlTagProvider.getInstance();
		}
	}
	/**
	 * Constructor - creates the instance with specified html
	 * content as String, using <code>HtmlTagProvider.getInstance()</code>
	 * as the tag info provider.
	 * @param htmlContent HTML source to be cleaned
	 */
	public HtmlCleaner(String htmlContent) {
		this(htmlContent, HtmlTagProvider.getInstance());
	}

	/**
	 * Constructor - creates the instance for specified file.
	 * @param file
	 * @param charset
	 * @throws IOException
	 */
	public HtmlCleaner(File file, String charset, ITagInfoProvider tagInfoProvider) throws IOException {
		FileInputStream in = new FileInputStream(file);
		this.reader = new InputStreamReader(in, charset);
        this.tagInfoProvider = tagInfoProvider == null ? HtmlTagProvider.getInstance() : tagInfoProvider;
    }

	/**
	 * Constructor - creates the instance for specified file and charset,
	 * using <code>HtmlTagProvider.getInstance()</code> as the tag info provider.
	 * @param file file containing the HTML to be cleaned
	 * @param charset name of the charset used to decode the file
	 * @throws IOException propagated from the three-argument file constructor
	 */
	public HtmlCleaner(File file, String charset) throws IOException {
		this(file, charset, HtmlTagProvider.getInstance());
    }

    /**
	 * Constructor - creates the instance for specified file, decoded with
	 * {@link #DEFAULT_CHARSET}.
	 * @param file file containing the HTML to be cleaned
	 * @param tagInfoProvider provider of tag descriptions, passed on unchanged
	 *        to the three-argument file constructor
	 * @throws IOException propagated from the three-argument file constructor
	 */
	public HtmlCleaner(File file, ITagInfoProvider tagInfoProvider) throws IOException {
        this(file, DEFAULT_CHARSET, tagInfoProvider);
    }

	/**
	 * Constructor - creates the instance for specified file, decoded with
	 * {@link #DEFAULT_CHARSET} and using <code>HtmlTagProvider.getInstance()</code>
	 * as the tag info provider.
	 * @param file file containing the HTML to be cleaned
	 * @throws IOException propagated from the three-argument file constructor
	 */
	public HtmlCleaner(File file) throws IOException {
		this(file, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
	}

	/**
	 * Constructor - creates the instance for specified URL and charset.
	 * The content is fetched right away through <code>Utils.readUrl</code>
	 * and kept as a string from which the cleaner later reads.
	 * @param url location of the HTML to be cleaned
	 * @param charset charset name handed to <code>Utils.readUrl</code>
	 * @param tagInfoProvider provider of tag descriptions; when null,
	 *        <code>HtmlTagProvider.getInstance()</code> is used
	 * @throws IOException if reading the URL fails
	 */
	public HtmlCleaner(URL url, String charset, ITagInfoProvider tagInfoProvider) throws IOException {
		String downloaded = Utils.readUrl(url, charset).toString();
		this.reader = new StringReader(downloaded);
		if (tagInfoProvider != null) {
			this.tagInfoProvider = tagInfoProvider;
		} else {
			this.tagInfoProvider = HtmlTagProvider.getInstance();
		}
	}

	/**
	 * Constructor - creates the instance for specified URL, read with
	 * {@link #DEFAULT_CHARSET}.
	 * @param url location of the HTML to be cleaned
	 * @param tagInfoProvider provider of tag descriptions, passed on unchanged
	 *        to the three-argument URL constructor
	 * @throws IOException propagated from the three-argument URL constructor
	 */
	public HtmlCleaner(URL url, ITagInfoProvider tagInfoProvider) throws IOException {
		this(url, DEFAULT_CHARSET, tagInfoProvider);
    }

	/**
	 * Constructor - creates the instance for specified URL and charset,
	 * using <code>HtmlTagProvider.getInstance()</code> as the tag info provider.
	 * @param url location of the HTML to be cleaned
	 * @param charset charset name passed on to the three-argument URL constructor
	 * @throws IOException propagated from the three-argument URL constructor
	 */
	public HtmlCleaner(URL url, String charset) throws IOException {
		this(url, charset, HtmlTagProvider.getInstance());
    }

	/**
	 * Constructor - creates the instance for specified URL, read with
	 * {@link #DEFAULT_CHARSET} and using <code>HtmlTagProvider.getInstance()</code>
	 * as the tag info provider.
	 * @param url location of the HTML to be cleaned
	 * @throws IOException propagated from the three-argument URL constructor
	 */
	public HtmlCleaner(URL url) throws IOException {
		this(url, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
    }

    /**
     * Constructor - creates the instance for the specified input stream.
     * No charset is given to the reader, so the stream is decoded with the
     * platform default charset.
     * @param in stream delivering the HTML to be cleaned
     * @param tagInfoProvider provider of tag descriptions; when null,
     *        <code>HtmlTagProvider.getInstance()</code> is used
     */
    public HtmlCleaner(InputStream in, ITagInfoProvider tagInfoProvider) {
        this.reader = new InputStreamReader(in);
        if (tagInfoProvider != null) {
            this.tagInfoProvider = tagInfoProvider;
        } else {
            this.tagInfoProvider = HtmlTagProvider.getInstance();
        }
    }

    /**
     * Constructor - creates the instance for the specified input stream,
     * using <code>HtmlTagProvider.getInstance()</code> as the tag info provider.
     * @param in stream delivering the HTML to be cleaned
     */
    public HtmlCleaner(InputStream in) {
    	this(in, HtmlTagProvider.getInstance());
    }

    /**
     * @return the doctype token currently stored in this cleaner;
     *         null if none has been set
     */
    DoctypeToken getDoctype() {
		return _docType;
	}

	/**
	 * Stores the doctype token in this cleaner, replacing any previous one.
	 * NOTE(review): presumably called by the tokenizer when it meets a
	 * DOCTYPE declaration - confirm against HtmlTokenizer.
	 * @param type doctype token to store
	 */
	void setDoctype(DoctypeToken type) {
		_docType = type;
	}

	/**
     * Constructor - creates the instance for the specified input stream
     * and the charset. <code>HtmlTagProvider.getInstance()</code> is used as
     * the tag info provider, as in the other constructors that take no
     * provider; without it the provider field would stay null and the first
     * registration of an open tag would fail with a NullPointerException.
     * @param in stream delivering the HTML to be cleaned
     * @param charset name of the charset used to decode the stream
     * @throws IOException if the named charset is not supported
     */
    public HtmlCleaner(InputStream in, String charset) throws IOException {
    	this.reader = new InputStreamReader(in, charset);
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    }

    /**
     * Runs the cleaning process: empties the set of collected tags, builds a
     * fresh html node with head and body children, runs an
     * {@link HtmlTokenizer} over this cleaner and hands the resulting token
     * list first to <code>closeAll</code> and then to
     * <code>createDocumentNodes</code>.
     * @throws IOException declared by this method; presumably raised while
     *         the tokenizer reads the input
     */
    public void clean() throws IOException {
        allTags.clear();

        // document skeleton: <html> with <head> and <body>, in that order
        htmlNode = new TagNode("html");
        bodyNode = new TagNode("body");
        headNode = new TagNode("head");
        htmlNode.addChild(headNode);
        htmlNode.addChild(bodyNode);

        HtmlTokenizer tokenizer = new HtmlTokenizer(this);
        tokenizer.start();

        List tokens = tokenizer.getTokenList();
        closeAll(tokens);
        createDocumentNodes(tokens);
    }
    
    /**
     * @return the reader set up by the constructor, from which the HTML
     *         source is read
     */
    Reader getReader() {
    	return reader;
    }

    /**
     * Copies attributes from the given map onto the given tag. An attribute
     * whose name the tag already carries is left as it is.
     * @param tag tag node that receives the attributes
     * @param attributes attribute names mapped to values; null means nothing to add
     */
    private void addAttributesToTag(TagNode tag, Map attributes) {
        if (attributes == null) {
            return;
        }

        Map existing = tag.getAttributes();
        for (Iterator it = attributes.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry entry = (Map.Entry) it.next();
            String name = (String) entry.getKey();
            if (existing.containsKey(name)) {
                continue;
            }
            tag.addAttribute(name, (String) entry.getValue());
        }
    }

    /**
     * Tells whether the fatal tag of the given tag, if it has one, is
     * currently open.
     * @param tag tag description; may be null
     * @return true if the tag is null, has no fatal tag, or its fatal tag is
     *         among the open tags; false otherwise
     */
    private boolean isFatalTagSatisfied(TagInfo tag) {
        if (tag == null) {
            return true;
        }

        String fatalTagName = tag.getFatalTag();
        return fatalTagName == null || _openTags.tagExists(fatalTagName);
    }

    /**
     * Decides whether the required parent of the given tag has to be added.
     * The open tags are scanned from the newest to the oldest for the first
     * one that the tag reports as higher (<code>tag.isHigher</code>).
     * @param tag tag description; may be null
     * @return false if the tag is null or declares no required parent;
     *         true if no open tag is higher; otherwise true only when the
     *         nearest higher open tag is positioned at or before the newest
     *         open fatal tag (position -1 when no fatal tag is open)
     */
    private boolean mustAddRequiredParent(TagInfo tag) {
        if (tag == null) {
            return false;
        }

        String requiredParent = tag.getRequiredParent();
        if (requiredParent == null) {
            return false;
        }

        // position of the newest open fatal tag, or -1 when there is none
        int fatalTagPosition = -1;
        String fatalTag = tag.getFatalTag();
        if (fatalTag != null) {
            TagPos fatalTagPos = _openTags.findTag(fatalTag);
            if (fatalTagPos != null) {
                fatalTagPosition = fatalTagPos.position;
            }
        }

        // newest-to-oldest scan for the first open tag that is higher
        List open = _openTags.list;
        for (int i = open.size() - 1; i >= 0; i--) {
            TagPos candidate = (TagPos) open.get(i);
            if (tag.isHigher(candidate.name)) {
                return candidate.position <= fatalTagPosition;
            }
        }

        return true;
    }

    /**
     * Marks the given start tag token as formed (<code>setFormed()</code>)
     * and returns that same object; no new node is created.
     * @param startTagToken start tag token to mark
     * @return the instance passed in
     */
    private TagNode createTagNode(TagNode startTagToken) {
    	startTagToken.setFormed();
    	return startTagToken;
    }

    /**
     * Asks the TagInfo of the last open tag whether it allows the given token.
     * @param token token to check
     * @return true if no tag is open or the last open tag has no TagInfo;
     *         otherwise the answer of <code>allowsItem(token)</code>
     */
    private boolean isAllowedInLastOpenTag(BaseToken token) {
        TagPos lastOpen = _openTags.getLastTagPos();
        if (lastOpen == null || lastOpen.info == null) {
            return true;
        }

        return lastOpen.info.allowsItem(token);
    }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?