spamhtmlparser.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 1,082 行 · 第 1/2 页

JAVA
1,082
字号
/*
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.parsers;

import java.util.Arrays;
import java.util.List;
import java.util.Stack;
import java.util.Vector;

import javax.mail.internet.MimeMessage;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;

import org.jasen.core.StandardParserData;
import org.jasen.core.engine.JasenEngineConfiguration;
import org.jasen.core.parsers.handlers.ImageTagHandler;
import org.jasen.core.parsers.handlers.SrcCgiTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandlerResult;
import org.jasen.error.JasenException;
import org.jasen.interfaces.HTMLTagHandler;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
import org.jasen.util.WebUtils;

/**
 * <p>Extracts plain text elements from an HTML document.</p>
 * <p>This implementation is specific to parsing the text out of spam emails</p>
 * @author Jason Polites
 */
public class SpamHTMLParser extends StandardHTMLParser {

    /**
     * The default numerical background color (white).
     * The value 765 is 255 x 3, i.e. the largest value the numerical color
     * representation used by this class can take.
     */
	public static final int DEFAULT_BGCOLOR = 765;
	
	/**
	 * The default numerical foreground color (black), the smallest value of
	 * the numerical color representation.
	 */
	public static final int DEFAULT_COLOR = 0;

	/**
	 * String (hex) value for the default background color (white)
	 */
	public static final String DEFAULT_STR_BGCOLOR = "FFFFFF";
	
	/**
	 * String (hex) value for the default foreground color (black)
	 */
	public static final String DEFAULT_STR_COLOR = "000000";

	/**
	 * The contrast threshold below which content is deemed concealed.
	 * Same value as the initial value of the contrastThreshold instance field.
	 * @deprecated Use getContrastThreshold
	 */
	public static final float COLOR_THRESHOLD = 0.075f;
	
	/**
	 * The font size threshold below which content is deemed concealed.
	 * Same value as the initial value of the microFontSize instance field.
	 * @deprecated Use getMicroFontSize
	 */
	public static final int FONTSIZE_THRESHOLD = 1;
	
	/**
	 * The size (in pixels, width or height) below which an element is
	 * considered concealed.
	 * Same value as the initial value of the microElementSize instance field.
	 * @deprecated Use getMicroElementSize
	 */
	public static final int ELEMENT_THRESHOLD = 5; // pixel width / height
	
	/**
	 * Retained for backward compatibility only.
	 * @deprecated Not used
	 */
	public static final double TOKEN_RECOGNITION_THRESHOLD = 0.1d;

	/**
	 * The CSS property name for background colors (background-color).
	 * handleStartTag passes this name to getColor together with the HTML
	 * bgcolor attribute when resolving a tag's background color.
	 * The previous literal was misspelled ("backgound-color") and so could
	 * never equal the real CSS property name.
	 */
	public static final String BGCOLOR_NAME = "background-color";
	
	/**
	 * The CSS property name for foreground (text) colors (color).
	 * handleStartTag passes this name to getColor together with the HTML
	 * text (BODY) or color (other tags) attribute.
	 */
	public static final String COLOR_NAME = "color";

	/**
	 * Empty placeholder retained for backward compatibility only.
	 * @deprecated Not used
	 */
	public static final String URL_REGEX = "";

	// Numerical background / text colors currently in effect. handleStartTag
	// overwrites them whenever a tag declares a bgcolor or color.
	private int currentBGColor = DEFAULT_BGCOLOR; // 255 x 3
	private int currentTextColor = DEFAULT_COLOR;

	// Lazily created stacks (null until first use in handleStartTag).
	// The color stacks hold the numerical color as a decimal String; the tag
	// stacks hold the HTML.Tag that declared the color, pushed in step.
	// Raw (non-generic) Stack types are used throughout this file.
	private Stack activeColorStack;
	private Stack activeBGColorStack;
	private Stack activeColorTagStack;
	private Stack activeBGColorTagStack;

	// NOTE(review): usage is outside the visible part of the file; presumably
	// these count color-bearing tags that are not tracked on the stacks - confirm.
	private int inertColorTagCount = 0;
	private int inertBGColorTagCount = 0;
	
	
	// Concealment thresholds. The initial values match the deprecated
	// COLOR_THRESHOLD / FONTSIZE_THRESHOLD / ELEMENT_THRESHOLD constants and are
	// replaced in the constructor by the JasenEngineConfiguration values.
	// Note these three fields are package-private, unlike the rest.
	float contrastThreshold = 0.075f;
	int microFontSize = 1;
	int microElementSize = 5;

	// NOTE(review): presumably the parsed style attribute entries of the tag
	// being processed; populated outside the visible part of the file.
	private String[] currentStyleAttributes = null;

	// Counters for findings made while parsing; they are incremented outside
	// the visible part of the file.
	private int concealedHtmlCount = 0;
	private int srcCgiCount = 0;
	private int imageCount = 0;
	private int srcPortCount = 0;
	private int falseAnchorCount = 0;

	// NOTE(review): element type not visible here; presumably the ports found
	// in URLs by the URLPortTagHandler - confirm.
	private List urlPorts;

	// Stores the value of an href from an anchor tag
	private String currentAnchorUrl = null;

	// Holds the value of a BASE tag if one is found
	private String urlBase = null;

	// Tag handlers; all three are instantiated in the constructor.
	private ImageTagHandler imageHandler = null;
	private SrcCgiTagHandler cgiHandler = null;
	private URLPortTagHandler portHandler = null;

	/**
	 * Creates a parser with its own tag handlers and with the concealment
	 * thresholds taken from the engine configuration singleton.
	 */
	public SpamHTMLParser() {
		// The no-argument superclass constructor runs implicitly at this point.

		// One handler instance of each kind per parser.
		this.imageHandler = new ImageTagHandler();
		this.cgiHandler = new SrcCgiTagHandler();
		this.portHandler = new URLPortTagHandler();

		// Replace the field initialisers with the configured thresholds.
		this.contrastThreshold =
			JasenEngineConfiguration.getInstance().getParserContrastThreshold();
		this.microFontSize =
			JasenEngineConfiguration.getInstance().getParserMicroFontSize();
		this.microElementSize =
			JasenEngineConfiguration.getInstance().getParserMicroElementSize();
	}

	// Lower-case HTML color names.
	// These color names MUST be in natural (ascending String) sort order, and MUST
	// also be in the same order as the corresponding hex values in
	// HTML_COLOR_VALUES below: entry i here maps to entry i there.
	// NOTE(review): the sort requirement suggests a binary-search lookup
	// (java.util.Arrays is imported); the lookup itself is not visible here - confirm.
	// NOTE(review): the field is public, static and non-final, so both the
	// reference and the elements can be modified by any caller.
	public static String[] HTML_COLOR_NAMES =
		{
			"aliceblue",
			"antiquewhite",
			"aqua",
			"aquamarine",
			"azure",
			"beige",
			"bisque",
			"black",
			"blanchedalmond",
			"blue",
			"blueviolet",
			"brown",
			"burlywood",
			"cadetblue",
			"chartreuse",
			"chocolate",
			"coral",
			"cornflowerblue",
			"cornsilk",
			"crimson",
			"cyan",
			"darkblue",
			"darkcyan",
			"darkgoldenrod",
			"darkgray",
			"darkgreen",
			"darkkhaki",
			"darkmagenta",
			"darkolivegreen",
			"darkorange",
			"darkorchid",
			"darkred",
			"darksalmon",
			"darkseagreen",
			"darkslateblue",
			"darkslategray",
			"darkturquoise",
			"darkviolet",
			"deeppink",
			"deepskyblue",
			"dimgray",
			"dodgerblue",
			"firebrick",
			"floralwhite",
			"forestgreen",
			"fuchsia",
			"gainsboro",
			"ghostwhite",
			"gold",
			"goldenrod",
			"gray",
			"green",
			"greenyellow",
			"honeydew",
			"hotpink",
			"indianred",
			"indigo",
			"ivory",
			"khaki",
			"lavender",
			"lavenderblush",
			"lawngreen",
			"lemonchiffon",
			"lightblue",
			"lightcoral",
			"lightcyan",
			"lightgoldenrodyellow",
			"lightgreen",
			"lightgrey",
			"lightpink",
			"lightsalmon",
			"lightseagreen",
			"lightskyblue",
			"lightslategray",
			"lightsteelblue",
			"lightyellow",
			"lime",
			"limegreen",
			"linen",
			"magenta",
			"maroon",
			"mediumaquamarine",
			"mediumblue",
			"mediumorchid",
			"mediumpurple",
			"mediumseagreen",
			"mediumslateblue",
			"mediumspringgreen",
			"mediumturquoise",
			"mediumvioletred",
			"midnightblue",
			"mintcream",
			"mistyrose",
			"moccasin",
			"navajowhite",
			"navy",
			"navyblue",
			"oldlace",
			"olive",
			"olivedrab",
			"orange",
			"orangered",
			"orchid",
			"palegoldenrod",
			"palegreen",
			"paleturquoise",
			"palevioletred",
			"papayawhip",
			"peachpuff",
			"peru",
			"pink",
			"plum",
			"powderblue",
			"purple",
			"red",
			"rosybrown",
			"royalblue",
			"saddlebrown",
			"salmon",
			"sandybrown",
			"seagreen",
			"seashell",
			"sienna",
			"silver",
			"skyblue",
			"slateblue",
			"slategray",
			"snow",
			"springgreen",
			"steelblue",
			"tan",
			"teal",
			"thistle",
			"tomato",
			"turquoise",
			"violet",
			"wheat",
			"white",
			"whitesmoke",
			"yellow",
			"yellowgreen" };

	// These are the six-digit RRGGBB hex values (no leading '#') corresponding to
	// the named values in HTML_COLOR_NAMES above: entry i here is the value of
	// name i there, so the two arrays must keep the same length and order.
	// NOTE(review): "gray" is stored as 7F7F7F whereas CSS defines gray as 808080,
	// and "navyblue" (9FAFDF) is not a CSS named color; both are kept as found.
	public static String[] HTML_COLOR_VALUES =
		{
			"F0F8FF",
			"FAEBD7",
			"00FFFF",
			"7FFFD4",
			"F0FFFF",
			"F5F5DC",
			"FFE4C4",
			"000000",
			"FFEBCD",
			"0000FF",
			"8A2BE2",
			"A52A2A",
			"DEB887",
			"5F9EA0",
			"7FFF00",
			"D2691E",
			"FF7F50",
			"6495ED",
			"FFF8DC",
			"DC143C",
			"00FFFF",
			"00008B",
			"008B8B",
			"B8860B",
			"A9A9A9",
			"006400",
			"BDB76B",
			"8B008B",
			"556B2F",
			"FF8C00",
			"9932CC",
			"8B0000",
			"E9967A",
			"8FBC8F",
			"483D8B",
			"2F4F4F",
			"00CED1",
			"9400D3",
			"FF1493",
			"00BFFF",
			"696969",
			"1E90FF",
			"B22222",
			"FFFAF0",
			"228B22",
			"FF00FF",
			"DCDCDC",
			"F8F8FF",
			"FFD700",
			"DAA520",
			"7F7F7F",
			"008000",
			"ADFF2F",
			"F0FFF0",
			"FF69B4",
			"CD5C5C",
			"4B0082",
			"FFFFF0",
			"F0E68C",
			"E6E6FA",
			"FFF0F5",
			"7CFC00",
			"FFFACD",
			"ADD8E6",
			"F08080",
			"E0FFFF",
			"FAFAD2",
			"90EE90",
			"D3D3D3",
			"FFB6C1",
			"FFA07A",
			"20B2AA",
			"87CEFA",
			"778899",
			"B0C4DE",
			"FFFFE0",
			"00FF00",
			"32CD32",
			"FAF0E6",
			"FF00FF",
			"800000",
			"66CDAA",
			"0000CD",
			"BA55D3",
			"9370DB",
			"3CB371",
			"7B68EE",
			"00FA9A",
			"48D1CC",
			"C71585",
			"191970",
			"F5FFFA",
			"FFE4E1",
			"FFE4B5",
			"FFDEAD",
			"000080",
			"9FAFDF",
			"FDF5E6",
			"808000",
			"6B8E23",
			"FFA500",
			"FF4500",
			"DA70D6",
			"EEE8AA",
			"98FB98",
			"AFEEEE",
			"DB7093",
			"FFEFD5",
			"FFDAB9",
			"CD853F",
			"FFC0CB",
			"DDA0DD",
			"B0E0E6",
			"800080",
			"FF0000",
			"BC8F8F",
			"4169E1",
			"8B4513",
			"FA8072",
			"F4A460",
			"2E8B57",
			"FFF5EE",
			"A0522D",
			"C0C0C0",
			"87CEEB",
			"6A5ACD",
			"708090",
			"FFFAFA",
			"00FF7F",
			"4682B4",
			"D2B48C",
			"008080",
			"D8BFD8",
			"FF6347",
			"40E0D0",
			"EE82EE",
			"F5DEB3",
			"FFFFFF",
			"F5F5F5",
			"FFFF00",
			"9ACD32" };

	// CSS font-size keywords regarded as invalid by this parser.
	// NOTE(review): the code that consults this array is outside the visible
	// part of the file; presumably text styled with these sizes is treated as
	// concealed (micro font) - confirm.
	private static String[] INVALID_FONT_SIZES = { "x-small", "xx-small" };

	/* (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
	 */
	public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {

		if (!quit) {

			// First, we need to find the default body text color
			String color = null;

			int defaultColor = DEFAULT_COLOR;

			if (t.equals(HTML.Tag.BODY)) {
				color = getColor(a, HTML.Attribute.TEXT, COLOR_NAME);

				if (color != null) {
					// Set this as the default text color
					defaultColor = getIntColor(color);
				}
			} else {
				color = getColor(a, HTML.Attribute.COLOR, COLOR_NAME);
			}

			// We need to determine the current BGColor or Text Color attributes
			String bgcolor = getColor(a, HTML.Attribute.BGCOLOR, BGCOLOR_NAME);

			int iBGColor = DEFAULT_BGCOLOR;
			int iTextColor = defaultColor;

			if (bgcolor != null) {
				iBGColor = getIntColor(bgcolor);

				// Set the current BG Color
				currentBGColor = iBGColor;

				// Add the color to the stack
				if (activeBGColorStack == null) {
					activeBGColorStack = new Stack();
				}

				activeBGColorStack.push(String.valueOf(iBGColor));

				// Add the tag to the active tag stack
				if (activeBGColorTagStack == null) {
					activeBGColorTagStack = new Stack();
				}

				activeBGColorTagStack.push(t);
			}

			if (color != null) {
				iTextColor = getIntColor(color);

				// Set the current BG Color
				currentTextColor = iTextColor;

				// Add the color to the stack
				if (activeColorStack == null) {
					activeColorStack = new Stack();
				}

				activeColorStack.push(String.valueOf(iTextColor));

				// Add the tag to the active tag stack
				if (activeColorTagStack == null) {
					activeColorTagStack = new Stack();
				}

				activeColorTagStack.push(t);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?