standardhtmlparser.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 413 行

JAVA
413
字号
/*
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.parsers;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;

import javax.mail.internet.MimeMessage;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.ParserDelegator;

import org.jasen.core.StandardParserData;
import org.jasen.error.JasenException;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;


/**
 * <p>
 * Parses the HTML part of an email for two main purposes.
 * <ul>
 * <li>To extract the plain text components of the message for tokenizing</li>
 * <li>To inspect the html for anomalies like HTML concealment and mail bugs</li>
 * </ul>
 * </p>
 * <p>
 * NOTE(review): only the text extraction is implemented in this class; no
 * anomaly inspection is visible here, so it presumably lives in the tokenizer
 * or in a subclass.
 * </p>
 * <p>
 * The class is itself the {@link HTMLEditorKit.ParserCallback} handed to a
 * {@link ParserDelegator}; the <code>handle*</code> methods below receive the
 * parse events and accumulate the text either in an internal buffer (String
 * variants of <code>extractText</code>) or in a writer (stream variant).
 * </p>
 * <p>
 * All parse state is kept in instance fields without synchronization, so a
 * single instance must not be used for two extractions at the same time.
 * </p>
 */
public class StandardHTMLParser extends HTMLEditorKit.ParserCallback implements org.jasen.interfaces.HTMLParser {

	/** Source of the html currently being parsed. */
	protected Reader reader = null;

	/** Destination of the extracted text when writing to an output stream. */
	protected Writer writer = null;

	/** Charset name used for the output stream writer. */
	protected String encoding = "ISO8859_1";

	/** First I/O failure raised inside a callback during the current parse, if any. */
	protected Throwable exception = null;

	/** Collects the extracted text when the result is returned as a String. */
	protected StringBuffer buffer;

	/** True when text goes to {@link #buffer}, false when it goes to {@link #writer}. */
	protected boolean strOutput = false;

	/** True between a comment callback and the next text callback. */
	protected boolean comment = false;

	/** True until the first text callback of a parse has been seen. */
	protected boolean start = true;

	/** Set to true when the next text element should be ignored. */
	protected boolean ignoreNext = false;

	// Text within these tags is ignored
	protected static final Tag[] IGNORED_TAGS = new Tag[]{Tag.SCRIPT, Tag.STYLE, Tag.TITLE, Tag.HEAD, Tag.META};

	/** True when a single space must be emitted before the next piece of text. */
	protected boolean spaceBefore = false;

	/** Not used by this class. */
	protected boolean spaceAfter = false;

	/** When true, text callbacks are skipped entirely; never set to true in this class. */
	protected boolean quit = false;

	/** Not used by this class (debugging aid). */
	protected String debugIgnoreReason = null;

	// We need to record a simple tag occurrence so we can
	// trick the parser into recording whitespace between tags

	/** Number of tag/comment callbacks received since the last text callback. */
	protected int tagCount = 0;

	/** Approximate end position of the last tag encountered (start position for comments). */
	protected int lastPosition = 0;

	public StandardHTMLParser() {
		super();
	}

	/**
	 * Extracts the plain text components of the html given by the
	 * input stream and writes this plain text to the given output stream.
	 * <p>
	 * The input is decoded with the platform default charset; the output is
	 * encoded with the charset set through {@link #setEncoding(String)}.
	 * </p>
	 * @param in The input stream from which the html is read
	 * @param out The output stream to which the plain text is written
	 * @throws JasenException If reading, parsing or writing fails
	 */
	public void extractText(InputStream in, OutputStream out) throws JasenException {

		// A previous String based extraction leaves strOutput set to true; without
		// this reset the text would end up in the buffer and never reach the stream.
		strOutput = false;

		boolean completed = false;

		try {
			reader = new InputStreamReader(in);
			writer = new OutputStreamWriter(out, encoding);

			parse(reader);

			// Callbacks cannot throw checked exceptions, so write failures are
			// recorded in the exception field and surfaced here.
			if (exception != null) {
				throw new JasenException(exception);
			}

			completed = true;
		}
		catch (IOException ex) {
			throw new JasenException(ex);
		}
		finally {
			try {
				out.flush();
			}
			catch (IOException ex) {
				// Only report the flush failure when nothing else went wrong;
				// otherwise it would replace the exception already propagating.
				if (completed) {
					throw new JasenException(ex);
				}
			}
		}
	}

	/**
	 * Sets the encoding to use on the output stream (optional)
	 * @param encoding The charset name passed to the output stream writer
	 */
	public void setEncoding(String encoding) {
		this.encoding = encoding;
	}

	/**
	 * Extracts plain text from the html given by the input stream and returns it as a String.
	 * The input is decoded with the platform default charset.
	 * @param in The input stream from which the html is read
	 * @return A String containing the plain text of the html
	 * @throws JasenException If reading or parsing fails
	 */
	public String extractText(InputStream in) throws JasenException {
		reader = new InputStreamReader(in);
		return extractText(reader);
	}

	/**
	 * Extracts plain text from the given html String and returns it as a String
	 * @param html The String containing the html
	 * @return The String containing the plain text
	 * @throws JasenException If parsing fails
	 */
	public String extractText(String html) throws JasenException {
		reader = new StringReader(html);
		return extractText(reader);
	}

	/**
	 * Parses the given source in String output mode and returns the text
	 * collected in the buffer.
	 * @param source The reader supplying the html
	 * @return The collected text, or null if no buffer was created
	 * @throws JasenException If reading or parsing fails
	 */
	private String extractText(Reader source) throws JasenException {

		strOutput = true;

		try {
			parse(source);
		}
		catch (IOException ex) {
			throw new JasenException(ex);
		}

		if (buffer == null) {
			return null;
		}

		// Line breaks are deliberately left in the returned text
		return buffer.toString();
	}

	/**
	 * Decides whether a space has to be emitted before the next piece of text.
	 * <p>
	 * A space is required when more than one tag has been counted since the
	 * last text and there is a gap between the recorded end of the previous
	 * tag and the start of this one, or when the tag is one of the block-like
	 * tags listed below.
	 * </p>
	 * @param t The tag being handled
	 * @param pos The position of the tag as reported by the parser
	 * @return true if a space should precede the next text
	 */
	protected boolean isSpaceRequired(Tag t, int pos) {

		// Two or more sequential tags with something (whitespace) between them
		if (tagCount > 1 && lastPosition < pos) {
			return true;
		}

		return t.equals(HTML.Tag.P)
			|| t.equals(HTML.Tag.TD)
			|| t.equals(HTML.Tag.TR)
			|| t.equals(HTML.Tag.TITLE)
			|| t.equals(HTML.Tag.LI)
			|| t.equals(HTML.Tag.BR)
			|| t.equals(HTML.Tag.H1)
			|| t.equals(HTML.Tag.H2)
			|| t.equals(HTML.Tag.H3)
			|| t.equals(HTML.Tag.H4)
			|| t.equals(HTML.Tag.H5)
			|| t.equals(HTML.Tag.H6)
			|| t.equals(HTML.Tag.IMG)
			|| t.equals(HTML.Tag.OBJECT)
			|| t.equals(HTML.Tag.HR)
			|| t.equals(HTML.Tag.UL);
	}

	/**
	 * Runs the html parser over the given reader with this object as callback.
	 * <p>
	 * Per-parse state is reset first so that a reused instance does not carry
	 * pending spaces, ignore flags, tag counters or an old I/O failure from a
	 * previous document into this one.
	 * </p>
	 * @param in The reader supplying the html
	 * @throws IOException If the parser fails to read from the reader
	 */
	protected void parse(Reader in) throws IOException {

		resetParseState();

		if (strOutput) {
			buffer = new StringBuffer();
		}

		ParserDelegator delegator = new ParserDelegator();

		// true: ignore any charset declared inside the document
		delegator.parse(in, this, true);
	}

	/**
	 * Restores the fields that track progress through a single document to
	 * their initial values. The quit flag is intentionally left untouched.
	 */
	private void resetParseState() {
		exception = null;
		comment = false;
		start = true;
		ignoreNext = false;
		spaceBefore = false;
		tagCount = 0;
		lastPosition = 0;
	}

	/*
	 * (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleComment(char[], int)
	 */
	public void handleComment(char[] text, int pos) {
		// Comment text is never emitted; the comment only counts as a tag
		// for the purposes of the spacing logic.
		tagCount++;
		lastPosition = pos;
		comment = true;
	}

	/* (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
	 */
	public void handleEndTag(Tag t, int pos) {

		// Note: the spacing check runs BEFORE this tag is counted, unlike
		// handleStartTag where the tag is counted first.
		if (!spaceBefore && isSpaceRequired(t, pos)) {
			spaceBefore = true;
		}

		// We are in a tag
		tagCount++;

		// Record the END position of the tag: name length plus 2 for <>.
		// This is an approximation; the '/' is not counted.
		lastPosition = pos + t.toString().length() + 2;

		// Any end tag stops the ignoring of text
		ignoreNext = false;
	}

	/* (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
	 */
	public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {

		// A simple tag carrying endtag="true" is treated as an end tag,
		// every other simple tag is treated as a start tag.
		Object end = a.getAttribute(HTML.Attribute.ENDTAG);

		// instanceof guards against a non-String attribute value, which the
		// former unchecked cast would have turned into a ClassCastException.
		if (end instanceof String && "true".equalsIgnoreCase((String) end)) {
			handleEndTag(t, pos);
		}
		else {
			handleStartTag(t, a, pos);
		}
	}

	/* (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
	 */
	public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {

		// We are in a tag
		tagCount++;

		// If we have a new line or spacer tag, add a space
		if (!spaceBefore && isSpaceRequired(t, pos)) {
			spaceBefore = true;
		}

		// Record the END position of the tag: name length plus 2 for <>.
		// This is an approximation; attributes are not counted.
		lastPosition = pos + t.toString().length() + 2;

		// Text following one of the ignored tags is skipped until the
		// next end tag is seen.
		if (!ignoreNext && isIgnoredTag(t)) {
			ignoreNext = true;
		}
	}

	/**
	 * Tells whether the given tag is one of {@link #IGNORED_TAGS}.
	 * @param t The tag to look up
	 * @return true if text inside the tag is to be ignored
	 */
	private boolean isIgnoredTag(Tag t) {
		for (int i = 0; i < IGNORED_TAGS.length; i++) {
			if (t.equals(IGNORED_TAGS[i])) {
				return true;
			}
		}
		return false;
	}

	/*
	 * (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleText(char[], int)
	 */
	public void handleText(char[] text, int pos) {

		if (quit) {
			return;
		}

		// We are not in a tag
		tagCount = 0;

		// The first text of the document has now been seen
		start = false;

		if (strOutput) {
			if (!ignoreNext) {
				if (spaceBefore) {
					buffer.append(' ');
					spaceBefore = false;
				}
				buffer.append(text);
			}
		}
		else {
			// We must be output stream
			try {
				if (!ignoreNext) {
					if (spaceBefore) {
						writer.write(" ");
						spaceBefore = false;
					}
					writer.write(text);
				}

				writer.flush(); // If we don't flush the writer we don't get any data!
			}
			catch (IOException ex) {
				// The callback signature has no throws clause, so the failure is
				// kept for extractText to report. Keep the first one: it is the
				// root cause, later ones are usually consequences of it.
				if (exception == null) {
					exception = ex;
				}
			}
		}

		comment = false;
	}

	/*
	 * (non-Javadoc)
	 * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
	 */
	public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
		String rawHtml = message.getHtmlPart();
		String rawText = message.getTextPart();
		StandardParserData parserData = new StandardParserData();

		if (rawHtml != null) {
			String htmlText = extractText(rawHtml);
			parserData.setHtmlAsText(htmlText);

			// Make sure the text part below is not skipped if the html
			// pass ended with the quit flag raised.
			quit = false;
		}

		if (rawText != null) {
			// The plain text part is run through the same html extraction
			String text = extractText(rawText);
			parserData.setTextParsed(text);
		}

		// Now, tokenize the html and text parts of the message
		parserData.setMessageTokens(tokenizer.tokenize(mm, message, parserData));

		return parserData;
	}
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?