htmlcleaner.java

来自「html过滤 html过滤 html过滤」· Java 代码 · 共 902 行 · 第 1/2 页

JAVA
902
字号
    /**
     * Queues a token on the start token of the open tag selected by
     * {@code _openTags.findTagToPlaceRubbish()}, so that it can be moved later.
     * Nothing is queued when the most recently opened tag carries tag info whose
     * {@code isIgnorePermitted()} is true, or when no target tag is found.
     *
     * @param nodeList   list that holds the target start token at the found position
     * @param tokenToAdd token to attach to that start token
     */
    private void saveToLastOpenTag(List nodeList, Object tokenToAdd) {
        TagPos lastOpen = _openTags.getLastTagPos();
        boolean ignoreToken = lastOpen != null
                && lastOpen.info != null
                && lastOpen.info.isIgnorePermitted();
        if (ignoreToken) {
            return;
        }

        TagPos target = _openTags.findTagToPlaceRubbish();
        if (target == null) {
            return;
        }

        ((TagNode) nodeList.get(target.position)).addItemForMoving(tokenToAdd);
    }
    
    /**
     * Tells whether the given object is a start token, i.e. a {@code TagNode}
     * whose {@code isFormed()} is still false.
     *
     * @param o object to inspect; may be null
     * @return true only for a not-yet-formed {@code TagNode}
     */
    private boolean isStartToken(Object o) {
        if ( !(o instanceof TagNode) ) {
            return false;
        }
        TagNode candidate = (TagNode) o;
        return !candidate.isFormed();
    }

	/**
	 * Walks the token list and turns the flat sequence of tokens into nested nodes,
	 * rewriting the list in place through the supplied iterator.
	 * <p>
	 * End tag tokens are always removed from the list (their slot is set to null); when a
	 * matching open tag is known, everything between that open tag and the end tag is closed
	 * with {@code closeSnippet}. Start tokens are either dropped, merged into the html/head/body
	 * nodes, re-processed after a required parent or copied tags were inserted, queued for
	 * moving, closed immediately (tags without body) or remembered in {@code _openTags}.
	 * All other tokens stay in place unless they are not allowed in the last open tag.
	 *
	 * @param nodeList     list being processed; slots are replaced with null or new nodes
	 * @param nodeIterator iterator over {@code nodeList}, positioned where processing starts
	 */
	void makeTree(List nodeList, ListIterator nodeIterator) {
		// process until the end of the list is reached
		while ( nodeIterator.hasNext() ) {
			BaseToken token = (BaseToken) nodeIterator.next();

			if (token instanceof EndTagToken) {
				EndTagToken endTagToken = (EndTagToken) token;
				String tagName = endTagToken.getName();
				TagInfo tag = tagInfoProvider.getTagInfo(tagName);

				// unknown or deprecated tag that is configured to be omitted - drop the end tag
				if ( (tag == null && omitUnknownTags) || (tag != null && tag.isDeprecated() && omitDeprecatedTags) ) {
					nodeIterator.set(null);
				// known tag that does not allow body - its end tag is dropped
				} else if ( tag != null && !tag.allowsBody() ) {
					nodeIterator.set(null);
				} else {
					TagPos matchingPosition = _openTags.findTag(tagName);

					// close everything from the matching open tag up to this end tag
					if (matchingPosition != null) {
                        closeSnippet(nodeList, matchingPosition, endTagToken);
					} else if ( !isAllowedInLastOpenTag(token) ) {
                        saveToLastOpenTag(nodeList, token);
                    }

                    // the end tag token itself never stays in the list
                    nodeIterator.set(null);
                }
			} else if ( isStartToken(token) ) {
                TagNode startTagToken = (TagNode) token;
				String tagName = startTagToken.getName();
				TagInfo tag = tagInfoProvider.getTagInfo(tagName);

				// add tag to set of all tags
				allTags.add(tagName);

                // HTML open tag - only its attributes are kept, on the existing html node
                if ( "html".equals(tagName) ) {
					addAttributesToTag(htmlNode, startTagToken.getAttributes());
					nodeIterator.set(null);
                // BODY open tag - only its attributes are kept, on the existing body node
                } else if ( "body".equals(tagName) ) {
					addAttributesToTag(bodyNode, startTagToken.getAttributes());
					nodeIterator.set(null);
                // HEAD open tag - only its attributes are kept, on the existing head node
                } else if ( "head".equals(tagName) ) {
					addAttributesToTag(headNode, startTagToken.getAttributes());
					nodeIterator.set(null);
                // unknown HTML tag and unknown tags are not allowed (or deprecated tag and those are omitted)
                } else if ( (tag == null && omitUnknownTags) || (tag != null && tag.isDeprecated() && omitDeprecatedTags) ) {
                    nodeIterator.set(null);
                // dropped when someAlreadyOpen() reports true for the tag's permitted-tags set
                } else if ( tag != null && tag.hasPermittedTags() && _openTags.someAlreadyOpen(tag.getPermittedTags()) ) {
                	nodeIterator.set(null);
                // if tag must be unique, ignore this occurrence
                } else if ( tag != null && tag.isUnique() && _openTags.tagEncountered(tagName) ) {
                	nodeIterator.set(null);
                // if the required outer (fatal) tag is missing, this open tag is ignored
                } else if ( !isFatalTagSatisfied(tag) ) {
					nodeIterator.set(null);
                // if there is no required parent tag - it must be added before this open tag
                } else if ( mustAddRequiredParent(tag) ) {
					String requiredParent = tag.getRequiredParent();
					TagNode requiredParentStartToken = new TagNode(requiredParent);
					// step back before the current token, insert the parent there, then step back
					// once more so the next loop iteration processes the inserted parent first
					nodeIterator.previous();
					nodeIterator.add(requiredParentStartToken);
					nodeIterator.previous();
                // if last open tag has lower precedence than this one, it must be closed
                } else if ( tag != null && !_openTags.isEmpty() && tag.isMustCloseTag( tagInfoProvider.getTagInfo(_openTags.getLastTagPos().name)) ) {
					List closed = closeSnippet(nodeList, _openTags.getLastTagPos(), startTagToken);
					int closedCount = closed.size();

					// some of the closed tags may need to be copied again; the copies are inserted
					// at the iterator cursor, i.e. immediately after the current start token
					if ( tag.hasCopyTags() && closedCount > 0 ) {
						// first iterates over list from the back and collects all start tokens
						// in sequence that must be copied
						ListIterator closedIt = closed.listIterator(closedCount);
						List toBeCopied = new ArrayList();
						while (closedIt.hasPrevious()) {
							TagNode currStartToken = (TagNode) closedIt.previous();
							if ( tag.isCopy(currStartToken.getName()) ) {
								toBeCopied.add(0, currStartToken);
							} else {
								break;
							}
						}

						if (toBeCopied.size() > 0) {
							Iterator copyIt = toBeCopied.iterator();
							while (copyIt.hasNext()) {
								TagNode currStartToken = (TagNode) copyIt.next();
								nodeIterator.add( currStartToken.makeCopy() );
							}
                            
                            // back to the previous place, before adding new start tokens
							for (int i = 0; i < toBeCopied.size(); i++) {
								nodeIterator.previous();
							}
                        }
					}

                    // step back so the current start token is evaluated again on the next iteration
                    nodeIterator.previous();
				// if this open tag is not allowed inside last open tag, then it must be moved to the place where it can be
                } else if ( !isAllowedInLastOpenTag(token) ) {
                    saveToLastOpenTag(nodeList, token);
                    nodeIterator.set(null);
				// if it is known HTML tag but doesn't allow body, it is immediately closed
                } else if ( tag != null && !tag.allowsBody() ) {
					TagNode newTagNode = createTagNode(startTagToken);
					if ( tag.isHeadTag() ) {
						headNode.addChild(newTagNode);
						nodeIterator.set(null);
					} else {
						nodeIterator.set(newTagNode);
					}
				// default case - just remember this open tag (with its index in the list) and go further
                } else {
                    _openTags.addTag( tagName, nodeIterator.previousIndex() );
				}
			} else {
				// any other token: leave in place unless the last open tag does not allow it
				if ( !isAllowedInLastOpenTag(token) ) {
                    saveToLastOpenTag(nodeList, token);
                    nodeIterator.set(null);
				}
			}
		}
    }

	/**
	 * Distributes the top-level items of the processed list between the head and body nodes.
	 * Null slots are skipped. A {@code TagNode} goes to the head node when its tag info says it
	 * is a head tag, or a head-and-body tag while the body node has no children yet; every other
	 * item goes to the body node, except {@code ContentToken}s whose string form is empty.
	 *
	 * @param listNodes items to attach to the document
	 */
	private void createDocumentNodes(List listNodes) {
		for (Iterator nodes = listNodes.iterator(); nodes.hasNext(); ) {
			Object node = nodes.next();
			if (node == null) {
				continue;
			}

			if (node instanceof TagNode) {
				TagInfo info = tagInfoProvider.getTagInfo( ((TagNode) node).getName() );
				boolean belongsToHead = info != null
						&& ( info.isHeadTag() || (info.isHeadAndBodyTag() && bodyNode.getChildren().isEmpty()) );
				if (belongsToHead) {
					headNode.addChild(node);
				} else {
					bodyNode.addChild(node);
				}
				continue;
			}

			// empty content tokens are not added at all
			boolean emptyContent = (node instanceof ContentToken)
					&& "".equals( ((ContentToken) node).toString() );
			if (!emptyContent) {
				bodyNode.addChild(node);
			}
		}
	}

	/**
	 * Closes open tags in {@code nodeList}, starting at {@code tagPos.position} and running
	 * until {@code toNode} is reached (identity comparison) or, when {@code toNode} is null,
	 * until the end of the list.
	 * <p>
	 * Every start token met on the way is converted with {@code createTagNode} and removed
	 * from {@code _openTags}. Head tags are attached to the head node. The first converted
	 * non-head node stays in the list at its own slot; later nodes and other items become
	 * children of the most recently converted node and their slots are set to null. Items that
	 * were queued for moving on a start token are first built into their own tree using a
	 * temporary, separate set of open tags.
	 * <p>
	 * The loop also stops at the end of the list when {@code toNode} is not null but is never
	 * found; previously that case re-processed the last item forever.
	 *
	 * @param nodeList list of tokens/nodes that is rewritten in place
	 * @param tagPos   position of the first open tag to close
	 * @param toNode   item at which closing stops (exclusive), or null to run to the list end
	 * @return start tokens that were closed, in the order they were met
	 */
	private List closeSnippet(List nodeList, TagPos tagPos, Object toNode) {
		List closed = new ArrayList();
		ListIterator it = nodeList.listIterator(tagPos.position);

		TagNode tagNode = null;
		Object item = it.next();
		boolean isListEnd = false;

		// the list-end guard applies in both modes, so a missing toNode cannot spin forever
		while ( !isListEnd && (toNode == null || item != toNode) ) {
			if ( isStartToken(item) ) {
				TagNode startTagToken = (TagNode) item;
				closed.add(startTagToken);
				List itemsToMove = startTagToken.getItemsToMove();
				if (itemsToMove != null) {
					// build the queued items into their own tree with a fresh set of open tags;
					// the outer state is restored even if the nested processing throws
					OpenTags prevOpenTags = _openTags;
					_openTags = new OpenTags();
					try {
						makeTree(itemsToMove, itemsToMove.listIterator(0));
						closeAll(itemsToMove);
						startTagToken.setItemsToMove(null);
					} finally {
						_openTags = prevOpenTags;
					}
				}

				TagNode newTagNode = createTagNode(startTagToken);

				TagInfo tag = tagInfoProvider.getTagInfo( newTagNode.getName() );
				if ( tag != null && tag.isHeadTag() ) {
					headNode.addChild(newTagNode);
					it.set(null);
				} else if (tagNode != null) {
					// NOTE(review): itemsToMove may be null here - assumes addChildren tolerates null; confirm
					tagNode.addChildren(itemsToMove);
					tagNode.addChild(newTagNode);
					it.set(null);
				} else {
					// first converted node: it stays in the list, preceded by its moved items if any
					if (itemsToMove != null) {
						itemsToMove.add(newTagNode);
						it.set(itemsToMove);
					} else {
						it.set(newTagNode);
					}
				}

				_openTags.removeTag( newTagNode.getName() );
				tagNode = newTagNode;
			} else {
				// non-start items are nested into the most recently converted node, if there is one
				if (tagNode != null) {
					it.set(null);
					if (item != null) {
						tagNode.addChild(item);
					}
				}
			}

			if ( it.hasNext() ) {
				item = it.next();
			} else {
				isListEnd = true;
			}
		}

		return closed;
    }

    /**
     * Closes every tag that is still open in the given list, starting from the first
     * open tag position reported by {@code _openTags}. Does nothing when no tag is open.
     *
     * @param nodeList list whose remaining open tags are closed up to its end
     */
    private void closeAll(List nodeList) {
        TagPos firstOpen = _openTags.findFirstTagPos();
        if (firstOpen == null) {
            return;
        }
        closeSnippet(nodeList, firstOpen, null);
    }

    // setters and getters

    /**
     * Returns the flag that makes {@code makeTree} drop start and end tags for which
     * the tag info provider returns no tag info.
     *
     * @return current value of {@code omitUnknownTags}
     */
    public boolean isOmitUnknownTags() {
        return omitUnknownTags;
    }

    /**
     * Sets the flag that makes {@code makeTree} drop start and end tags for which
     * the tag info provider returns no tag info.
     *
     * @param omitUnknownTags new value of the flag
     */
    public void setOmitUnknownTags(boolean omitUnknownTags) {
        this.omitUnknownTags = omitUnknownTags;
    }
    
    /**
     * Returns the flag that makes {@code makeTree} drop start and end tags whose
     * tag info reports them as deprecated.
     *
     * @return current value of {@code omitDeprecatedTags}
     */
    public boolean isOmitDeprecatedTags() {
    	return omitDeprecatedTags;
    }
    
    /**
     * Sets the flag that makes {@code makeTree} drop start and end tags whose
     * tag info reports them as deprecated.
     *
     * @param omitDeprecatedTags new value of the flag
     */
    public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
    	this.omitDeprecatedTags = omitDeprecatedTags;
    }

    /**
     * Returns the current value of the {@code advancedXmlEscape} setting.
     *
     * @return current value of {@code advancedXmlEscape}
     */
    public boolean isAdvancedXmlEscape() {
        return advancedXmlEscape;
    }

    /**
     * Stores a new value for the {@code advancedXmlEscape} setting.
     *
     * @param advancedXmlEscape new value of the setting
     */
    public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
        this.advancedXmlEscape = advancedXmlEscape;
    }

    /**
     * Returns the current value of the {@code useCdataForScriptAndStyle} setting.
     *
     * @return current value of {@code useCdataForScriptAndStyle}
     */
    public boolean isUseCdataForScriptAndStyle() {
        return useCdataForScriptAndStyle;
    }

    /**
     * Stores a new value for the {@code useCdataForScriptAndStyle} setting.
     *
     * @param useCdataForScriptAndStyle new value of the setting
     */
    public void setUseCdataForScriptAndStyle(boolean useCdataForScriptAndStyle) {
        this.useCdataForScriptAndStyle = useCdataForScriptAndStyle;
    }

    /**
     * Returns the current value of the {@code translateSpecialEntities} setting.
     *
     * @return current value of {@code translateSpecialEntities}
     */
    public boolean isTranslateSpecialEntities() {
        return translateSpecialEntities;
    }

    /**
     * Stores a new value for the {@code translateSpecialEntities} setting.
     *
     * @param translateSpecialEntities new value of the setting
     */
    public void setTranslateSpecialEntities(boolean translateSpecialEntities) {
        this.translateSpecialEntities = translateSpecialEntities;
    }

    /**
     * Returns the current value of the {@code recognizeUnicodeChars} setting.
     *
     * @return current value of {@code recognizeUnicodeChars}
     */
    public boolean isRecognizeUnicodeChars() {
        return recognizeUnicodeChars;
    }

    /**
     * Stores a new value for the {@code recognizeUnicodeChars} setting.
     *
     * @param recognizeUnicodeChars new value of the setting
     */
    public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
        this.recognizeUnicodeChars = recognizeUnicodeChars;
    }

    /**
     * Returns the current value of the {@code omitComments} setting.
     *
     * @return current value of {@code omitComments}
     */
    public boolean isOmitComments() {
        return omitComments;
    }

    /**
     * Stores a new value for the {@code omitComments} setting.
     *
     * @param omitComments new value of the setting
     */
    public void setOmitComments(boolean omitComments) {
        this.omitComments = omitComments;
    }

    /**
     * Returns the current value of the {@code omitXmlDeclaration} setting.
     *
     * @return current value of {@code omitXmlDeclaration}
     */
    public boolean isOmitXmlDeclaration() {
        return omitXmlDeclaration;
    }

    /**
     * Stores a new value for the {@code omitXmlDeclaration} setting.
     *
     * @param omitXmlDeclaration new value of the setting
     */
    public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
        this.omitXmlDeclaration = omitXmlDeclaration;
    }
    
    /**
     * Returns the current value of the {@code omitDoctypeDeclaration} setting.
     *
     * @return current value of {@code omitDoctypeDeclaration}
     */
    public boolean isOmitDoctypeDeclaration() {
		return omitDoctypeDeclaration;
	}

	/**
	 * Stores a new value for the {@code omitDoctypeDeclaration} setting.
	 *
	 * @param omitDoctypeDeclaration new value of the setting
	 */
	public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
		this.omitDoctypeDeclaration = omitDoctypeDeclaration;
	}

	/**
	 * Returns the current value of the {@code omitXmlnsAttributes} setting.
	 *
	 * @return current value of {@code omitXmlnsAttributes}
	 */
	public boolean isOmitXmlnsAttributes() {
		return omitXmlnsAttributes;
	}

	/**
	 * Stores a new value for the {@code omitXmlnsAttributes} setting.
	 *
	 * @param omitXmlnsAttributes new value of the setting
	 */
	public void setOmitXmlnsAttributes(boolean omitXmlnsAttributes) {
		this.omitXmlnsAttributes = omitXmlnsAttributes;
	}

	/**
	 * Returns the current value of the {@code hyphenReplacementInComment} setting.
	 *
	 * @return current value of {@code hyphenReplacementInComment}
	 */
	public String getHyphenReplacementInComment() {
        return hyphenReplacementInComment;
    }

    /**
     * Stores a new value for the {@code hyphenReplacementInComment} setting.
     *
     * @param hyphenReplacementInComment new value of the setting
     */
    public void setHyphenReplacementInComment(String hyphenReplacementInComment) {
        this.hyphenReplacementInComment = hyphenReplacementInComment;
    }

    /**
     * Returns the set to which {@code makeTree} adds the name of every start tag it meets.
     * The internal set instance itself is returned, not a copy.
     *
     * @return set of encountered tag names
     */
    public Set getAllTags() {
		return allTags;
	}

    // methods for writing result

    /**
     * The most general way to serialize resulting XML: hands the html root node
     * to the given serializer's {@code createXml}.
     *
     * @param xmlSerializer serializer that produces the output
     * @throws IOException if the serializer's {@code createXml} throws it
     */
    public void writeXml(XmlSerializer xmlSerializer) throws IOException {
        xmlSerializer.createXml(htmlNode);
    }
    
    /**
     * Serializes the html root node to the writer with the serializer that matches the
     * requested method: compact for {@code WRITE_METHOD_COMPACT}, pretty for
     * {@code WRITE_METHOD_PRETTY}, and the simple serializer for any other value.
     *
     * @param writer destination of the XML
     * @param method one of the {@code WRITE_METHOD_*} constants
     * @throws IOException if the serializer's {@code createXml} throws it
     */
    private void writeXml(Writer writer, int method) throws IOException {
        final XmlSerializer serializer;

        if (method == WRITE_METHOD_COMPACT) {
            serializer = new CompactXmlSerializer(writer, this);
        } else if (method == WRITE_METHOD_PRETTY) {
            serializer = new PrettyXmlSerializer(writer, this);
        } else {
            serializer = new SimpleXmlSerializer(writer, this);
        }

        serializer.createXml(htmlNode);
    }

	/**
	 * Writes the document to the stream through a buffered writer that encodes
	 * characters with the named charset.
	 * NOTE(review): the buffered writer is neither flushed nor closed here; this
	 * presumably relies on the serializer doing so - confirm.
	 *
	 * @param out     destination stream
	 * @param charset name of the charset used for encoding
	 * @param method  one of the {@code WRITE_METHOD_*} constants
	 * @throws IOException if the charset is not supported or writing fails
	 */
	private void writeToStream(OutputStream out, String charset, int method) throws IOException {
		writeXml( new BufferedWriter( new OutputStreamWriter(out, charset) ), method );
	}

	/**
	 * Writes the document to the stream through a buffered writer that encodes
	 * characters with the platform default charset.
	 * NOTE(review): the buffered writer is neither flushed nor closed here; this
	 * presumably relies on the serializer doing so - confirm.
	 *
	 * @param out    destination stream
	 * @param method one of the {@code WRITE_METHOD_*} constants
	 * @throws IOException if writing fails
	 */
	private void writeToStream(OutputStream out, int method) throws IOException {
		writeXml( new BufferedWriter( new OutputStreamWriter(out) ), method );
	}

    /**
     * Writes the document to the stream using {@code WRITE_METHOD_SIMPLE} and the
     * platform default charset.
     *
     * @param out destination stream
     * @throws IOException if writing fails
     */
    public void writeXmlToStream(OutputStream out) throws IOException {
        writeToStream(out, WRITE_METHOD_SIMPLE);
    }

	/**
	 * Writes the document to the stream using {@code WRITE_METHOD_SIMPLE} and the
	 * named charset.
	 *
	 * @param out     destination stream
	 * @param charset name of the charset used for encoding
	 * @throws IOException if the charset is not supported or writing fails
	 */
	public void writeXmlToStream(OutputStream out, String charset) throws IOException {
		writeToStream(out, charset, WRITE_METHOD_SIMPLE);
	}

    /**
     * Writes the document to the stream using {@code WRITE_METHOD_COMPACT} and the
     * platform default charset.
     *
     * @param out destination stream
     * @throws IOException if writing fails
     */
    public void writeCompactXmlToStream(OutputStream out) throws IOException {
    	writeToStream(out, WRITE_METHOD_COMPACT);
    }
    
    /**
     * Writes the document to the stream using {@code WRITE_METHOD_COMPACT} and the
     * named charset.
     *
     * @param out     destination stream
     * @param charset name of the charset used for encoding
     * @throws IOException if the charset is not supported or writing fails
     */
    public void writeCompactXmlToStream(OutputStream out, String charset) throws IOException {
    	writeToStream(out, charset, WRITE_METHOD_COMPACT);
    }

    /**
     * Writes the document to the stream using {@code WRITE_METHOD_PRETTY} and the
     * platform default charset.
     *
     * @param out destination stream
     * @throws IOException if writing fails
     */
    public void writePrettyXmlToStream(OutputStream out) throws IOException {
    	writeToStream(out, WRITE_METHOD_PRETTY);
    }

    /**
     * Writes the document to the stream using {@code WRITE_METHOD_PRETTY} and the
     * named charset.
     *
     * @param out     destination stream
     * @param charset name of the charset used for encoding
     * @throws IOException if the charset is not supported or writing fails
     */
    public void writePrettyXmlToStream(OutputStream out, String charset) throws IOException {
    	writeToStream(out, charset, WRITE_METHOD_PRETTY);
    }

    /**
     * Writes the document to the named file with the given charset and write method.
     * The file stream opened here is always closed before returning, including when
     * serialization fails, so the file handle is not leaked.
     *
     * @param fileName path of the file to create or overwrite
     * @param charset  name of the charset used for encoding
     * @param method   one of the {@code WRITE_METHOD_*} constants
     * @throws IOException if the file cannot be opened, the charset is not supported,
     *                     or writing/closing fails
     */
    private void writeToFile(String fileName, String charset, int method) throws IOException {
        OutputStream out = new FileOutputStream(fileName);
        try {
            writeToStream(out, charset, method);
        } finally {
            // closing an already closed stream has no effect, so this is safe even if
            // the serializer closed the underlying stream itself
            out.close();
        }
    }

	/**
	 * Writes the document to the named file with the platform default charset and the
	 * given write method. The file stream opened here is always closed before returning,
	 * including when serialization fails, so the file handle is not leaked.
	 *
	 * @param fileName path of the file to create or overwrite
	 * @param method   one of the {@code WRITE_METHOD_*} constants
	 * @throws IOException if the file cannot be opened or writing/closing fails
	 */
	private void writeToFile(String fileName, int method) throws IOException {
		OutputStream out = new FileOutputStream(fileName);
		try {
			writeToStream(out, method);
		} finally {
			// closing an already closed stream has no effect, so this is safe even if
			// the serializer closed the underlying stream itself
			out.close();
		}
    }

    /**
     * Writes the document to the named file using {@code WRITE_METHOD_SIMPLE} and the
     * platform default charset.
     *
     * @param fileName path of the file to write
     * @throws IOException if the file cannot be opened or writing fails
     */
    public void writeXmlToFile(String fileName) throws IOException {
        writeToFile(fileName, WRITE_METHOD_SIMPLE);
    }

	/**
	 * Writes the document to the named file using {@code WRITE_METHOD_SIMPLE} and the
	 * named charset.
	 *
	 * @param fileName path of the file to write
	 * @param charset  name of the charset used for encoding
	 * @throws IOException if the file cannot be opened, the charset is not supported,
	 *                     or writing fails
	 */
	public void writeXmlToFile(String fileName, String charset) throws IOException {
		writeToFile(fileName, charset, WRITE_METHOD_SIMPLE);
	}
    
    /**
     * Writes the document to the named file using {@code WRITE_METHOD_COMPACT} and the
     * platform default charset.
     *
     * @param fileName path of the file to write
     * @throws IOException if the file cannot be opened or writing fails
     */
    public void writeCompactXmlToFile(String fileName) throws IOException {
    	writeToFile(fileName, WRITE_METHOD_COMPACT);
    }
    
    /**
     * Writes the document to the named file using {@code WRITE_METHOD_COMPACT} and the
     * named charset.
     *
     * @param fileName path of the file to write
     * @param charset  name of the charset used for encoding
     * @throws IOException if the file cannot be opened, the charset is not supported,
     *                     or writing fails
     */
    public void writeCompactXmlToFile(String fileName, String charset) throws IOException {
    	writeToFile(fileName, charset, WRITE_METHOD_COMPACT);
    }

    /**
     * Writes the document to the named file using {@code WRITE_METHOD_PRETTY} and the
     * platform default charset.
     *
     * @param fileName path of the file to write
     * @throws IOException if the file cannot be opened or writing fails
     */
    public void writePrettyXmlToFile(String fileName) throws IOException {
    	writeToFile(fileName, WRITE_METHOD_PRETTY);
    }

    /**
     * Writes the document to the named file using {@code WRITE_METHOD_PRETTY} and the
     * named charset.
     *
     * @param fileName path of the file to write
     * @param charset  name of the charset used for encoding
     * @throws IOException if the file cannot be opened, the charset is not supported,
     *                     or writing fails
     */
    public void writePrettyXmlToFile(String fileName, String charset) throws IOException {
    	writeToFile(fileName, charset, WRITE_METHOD_PRETTY);
    }

    /**
     * Serializes the document with {@code WRITE_METHOD_SIMPLE} into memory.
     *
     * @return the serialized XML
     * @throws IOException if serialization fails
     */
    public String getXmlAsString() throws IOException {
        StringWriter xmlSink = new StringWriter();
        writeXml(xmlSink, WRITE_METHOD_SIMPLE);
        return xmlSink.toString();
    }

    /**
     * Serializes the document with {@code WRITE_METHOD_COMPACT} into memory.
     *
     * @return the serialized XML
     * @throws IOException if serialization fails
     */
    public String getCompactXmlAsString() throws IOException {
        StringWriter xmlSink = new StringWriter();
        writeXml(xmlSink, WRITE_METHOD_COMPACT);
        return xmlSink.toString();
    }
    
    /**
     * Serializes the document with {@code WRITE_METHOD_PRETTY} into memory.
     *
     * @return the serialized XML
     * @throws IOException if serialization fails
     */
    public String getPrettyXmlAsString() throws IOException {
        StringWriter xmlSink = new StringWriter();
        writeXml(xmlSink, WRITE_METHOD_PRETTY);
        return xmlSink.toString();
    }

}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?