spamhtmlparser.java
来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 1,082 行 · 第 1/2 页
JAVA
1,082 行
/*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.parsers;
import java.util.Arrays;
import java.util.List;
import java.util.Stack;
import java.util.Vector;
import javax.mail.internet.MimeMessage;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import org.jasen.core.StandardParserData;
import org.jasen.core.engine.JasenEngineConfiguration;
import org.jasen.core.parsers.handlers.ImageTagHandler;
import org.jasen.core.parsers.handlers.SrcCgiTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandlerResult;
import org.jasen.error.JasenException;
import org.jasen.interfaces.HTMLTagHandler;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
import org.jasen.util.WebUtils;
/**
* <p>Extracts plain text elements from an HTML document.</p>
* <p>This implementation is specific to parsing the text out of spam emails</p>
* @author Jason Polites
*/
public class SpamHTMLParser extends StandardHTMLParser {
/**
 * Numeric value of the default background colour (white): the three
 * colour channels at full intensity added together (255 x 3).
 */
public static final int DEFAULT_BGCOLOR = 3 * 255;

/**
 * Numeric value of the default foreground (text) colour (black).
 */
public static final int DEFAULT_COLOR = 0;

/**
 * Hexadecimal string form of the default background colour (white).
 */
public static final String DEFAULT_STR_BGCOLOR = "FFFFFF";

/**
 * Hexadecimal string form of the default foreground colour (black).
 */
public static final String DEFAULT_STR_COLOR = "000000";

/**
 * Contrast level under which content counts as concealed.
 * @deprecated Use getContrastThreshold
 */
public static final float COLOR_THRESHOLD = 0.075f;

/**
 * Font size under which content counts as concealed.
 * @deprecated Use getMicroFontSize
 */
public static final int FONTSIZE_THRESHOLD = 1;

/**
 * Element width / height, in pixels, under which an element counts as
 * concealed.
 * @deprecated Use getMicroElementSize
 */
public static final int ELEMENT_THRESHOLD = 5;

/**
 * @deprecated Not used
 */
public static final double TOKEN_RECOGNITION_THRESHOLD = 0.1d;
/**
 * The CSS property name for background colors (background-color).
 * handleStartTag hands this to getColor together with
 * HTML.Attribute.BGCOLOR when it resolves a tag's background colour.
 * The value was previously misspelled ("backgound-color"), which is not a
 * CSS property name.
 */
public static final String BGCOLOR_NAME = "background-color";

/**
 * The CSS property name for foreground (text) colors (color).
 * handleStartTag hands this to getColor together with
 * HTML.Attribute.TEXT (for BODY) or HTML.Attribute.COLOR (other tags).
 */
public static final String COLOR_NAME = "color";

/**
 * @deprecated Not used
 */
public static final String URL_REGEX = "";
// Background colour currently in effect, as a number. Starts at the default
// (white, 255 x 3) and is overwritten by handleStartTag whenever a tag
// supplies a background colour.
private int currentBGColor = DEFAULT_BGCOLOR; // 255 x 3
// Text colour currently in effect, as a number. Starts at the default (black)
// and is overwritten by handleStartTag whenever a tag supplies a text colour.
private int currentTextColor = DEFAULT_COLOR;
// Stacks of the colours seen so far (pushed as decimal strings) and of the
// tags that introduced them. All four are created lazily by handleStartTag
// the first time they are needed, so they may be null.
private Stack activeColorStack;
private Stack activeBGColorStack;
private Stack activeColorTagStack;
private Stack activeBGColorTagStack;
// NOTE(review): these counters are not touched in the visible part of the
// file; presumably they track tags that carry no colour information - confirm.
private int inertColorTagCount = 0;
private int inertBGColorTagCount = 0;
// Concealment thresholds. The literals below are only initial values: the
// constructor replaces all three with the values held by
// JasenEngineConfiguration. Note these three fields are package-private.
float contrastThreshold = 0.075f;
int microFontSize = 1;
int microElementSize = 5;
// NOTE(review): presumably the parsed contents of the current tag's style
// attribute - confirm against the code that assigns it.
private String[] currentStyleAttributes = null;
// Running tallies gathered while parsing; the code that increments them is
// outside the visible part of the file.
private int concealedHtmlCount = 0;
private int srcCgiCount = 0;
private int imageCount = 0;
private int srcPortCount = 0;
private int falseAnchorCount = 0;
// NOTE(review): element type not visible here; presumably the ports found
// in URLs by the port handler - confirm.
private List urlPorts;
// Stores the value of an href from an anchor tag
private String currentAnchorUrl = null;
// Holds the value of a BASE tag if one is found
private String urlBase = null;
// Tag handlers; all three are instantiated in the constructor.
private ImageTagHandler imageHandler = null;
private SrcCgiTagHandler cgiHandler = null;
private URLPortTagHandler portHandler = null;
public SpamHTMLParser() {
super();
imageHandler = new ImageTagHandler();
cgiHandler = new SrcCgiTagHandler();
portHandler = new URLPortTagHandler();
// Set the default config
contrastThreshold = JasenEngineConfiguration.getInstance().getParserContrastThreshold();
microFontSize = JasenEngineConfiguration.getInstance().getParserMicroFontSize();
microElementSize = JasenEngineConfiguration.getInstance().getParserMicroElementSize();
}
// Named HTML colours, grouped here by initial letter for readability.
// Two invariants MUST hold for this array:
//   1. the names stay in natural (ascending) sort order, and
//   2. entry i here pairs with entry i of HTML_COLOR_VALUES.
// NOTE(review): the sort requirement presumably exists because lookups use a
// binary search - confirm against the lookup code.
public static String[] HTML_COLOR_NAMES = {
    // a
    "aliceblue", "antiquewhite", "aqua", "aquamarine", "azure",
    // b
    "beige", "bisque", "black", "blanchedalmond", "blue", "blueviolet",
    "brown", "burlywood",
    // c
    "cadetblue", "chartreuse", "chocolate", "coral", "cornflowerblue",
    "cornsilk", "crimson", "cyan",
    // d
    "darkblue", "darkcyan", "darkgoldenrod", "darkgray", "darkgreen",
    "darkkhaki", "darkmagenta", "darkolivegreen", "darkorange", "darkorchid",
    "darkred", "darksalmon", "darkseagreen", "darkslateblue", "darkslategray",
    "darkturquoise", "darkviolet", "deeppink", "deepskyblue", "dimgray",
    "dodgerblue",
    // f
    "firebrick", "floralwhite", "forestgreen", "fuchsia",
    // g
    "gainsboro", "ghostwhite", "gold", "goldenrod", "gray", "green",
    "greenyellow",
    // h
    "honeydew", "hotpink",
    // i
    "indianred", "indigo", "ivory",
    // k
    "khaki",
    // l
    "lavender", "lavenderblush", "lawngreen", "lemonchiffon", "lightblue",
    "lightcoral", "lightcyan", "lightgoldenrodyellow", "lightgreen",
    "lightgrey", "lightpink", "lightsalmon", "lightseagreen", "lightskyblue",
    "lightslategray", "lightsteelblue", "lightyellow", "lime", "limegreen",
    "linen",
    // m
    "magenta", "maroon", "mediumaquamarine", "mediumblue", "mediumorchid",
    "mediumpurple", "mediumseagreen", "mediumslateblue", "mediumspringgreen",
    "mediumturquoise", "mediumvioletred", "midnightblue", "mintcream",
    "mistyrose", "moccasin",
    // n
    "navajowhite", "navy", "navyblue",
    // o
    "oldlace", "olive", "olivedrab", "orange", "orangered", "orchid",
    // p
    "palegoldenrod", "palegreen", "paleturquoise", "palevioletred",
    "papayawhip", "peachpuff", "peru", "pink", "plum", "powderblue", "purple",
    // r
    "red", "rosybrown", "royalblue",
    // s
    "saddlebrown", "salmon", "sandybrown", "seagreen", "seashell", "sienna",
    "silver", "skyblue", "slateblue", "slategray", "snow", "springgreen",
    "steelblue",
    // t
    "tan", "teal", "thistle", "tomato", "turquoise",
    // v
    "violet",
    // w
    "wheat", "white", "whitesmoke",
    // y
    "yellow", "yellowgreen"
};
// Hex (RRGGBB) values for the named colours in HTML_COLOR_NAMES. Entry i of
// this array is the value of entry i of HTML_COLOR_NAMES, so the two arrays
// must always be edited together; the trailing comment on each line records
// the name the value belongs to.
// "gray" is 808080 as defined by HTML 4.01 / CSS (it was previously 7F7F7F).
// NOTE(review): "navyblue" is not an HTML/CSS standard colour name; its
// value is kept unchanged.
public static String[] HTML_COLOR_VALUES =
{
    "F0F8FF", // aliceblue
    "FAEBD7", // antiquewhite
    "00FFFF", // aqua
    "7FFFD4", // aquamarine
    "F0FFFF", // azure
    "F5F5DC", // beige
    "FFE4C4", // bisque
    "000000", // black
    "FFEBCD", // blanchedalmond
    "0000FF", // blue
    "8A2BE2", // blueviolet
    "A52A2A", // brown
    "DEB887", // burlywood
    "5F9EA0", // cadetblue
    "7FFF00", // chartreuse
    "D2691E", // chocolate
    "FF7F50", // coral
    "6495ED", // cornflowerblue
    "FFF8DC", // cornsilk
    "DC143C", // crimson
    "00FFFF", // cyan
    "00008B", // darkblue
    "008B8B", // darkcyan
    "B8860B", // darkgoldenrod
    "A9A9A9", // darkgray
    "006400", // darkgreen
    "BDB76B", // darkkhaki
    "8B008B", // darkmagenta
    "556B2F", // darkolivegreen
    "FF8C00", // darkorange
    "9932CC", // darkorchid
    "8B0000", // darkred
    "E9967A", // darksalmon
    "8FBC8F", // darkseagreen
    "483D8B", // darkslateblue
    "2F4F4F", // darkslategray
    "00CED1", // darkturquoise
    "9400D3", // darkviolet
    "FF1493", // deeppink
    "00BFFF", // deepskyblue
    "696969", // dimgray
    "1E90FF", // dodgerblue
    "B22222", // firebrick
    "FFFAF0", // floralwhite
    "228B22", // forestgreen
    "FF00FF", // fuchsia
    "DCDCDC", // gainsboro
    "F8F8FF", // ghostwhite
    "FFD700", // gold
    "DAA520", // goldenrod
    "808080", // gray
    "008000", // green
    "ADFF2F", // greenyellow
    "F0FFF0", // honeydew
    "FF69B4", // hotpink
    "CD5C5C", // indianred
    "4B0082", // indigo
    "FFFFF0", // ivory
    "F0E68C", // khaki
    "E6E6FA", // lavender
    "FFF0F5", // lavenderblush
    "7CFC00", // lawngreen
    "FFFACD", // lemonchiffon
    "ADD8E6", // lightblue
    "F08080", // lightcoral
    "E0FFFF", // lightcyan
    "FAFAD2", // lightgoldenrodyellow
    "90EE90", // lightgreen
    "D3D3D3", // lightgrey
    "FFB6C1", // lightpink
    "FFA07A", // lightsalmon
    "20B2AA", // lightseagreen
    "87CEFA", // lightskyblue
    "778899", // lightslategray
    "B0C4DE", // lightsteelblue
    "FFFFE0", // lightyellow
    "00FF00", // lime
    "32CD32", // limegreen
    "FAF0E6", // linen
    "FF00FF", // magenta
    "800000", // maroon
    "66CDAA", // mediumaquamarine
    "0000CD", // mediumblue
    "BA55D3", // mediumorchid
    "9370DB", // mediumpurple
    "3CB371", // mediumseagreen
    "7B68EE", // mediumslateblue
    "00FA9A", // mediumspringgreen
    "48D1CC", // mediumturquoise
    "C71585", // mediumvioletred
    "191970", // midnightblue
    "F5FFFA", // mintcream
    "FFE4E1", // mistyrose
    "FFE4B5", // moccasin
    "FFDEAD", // navajowhite
    "000080", // navy
    "9FAFDF", // navyblue
    "FDF5E6", // oldlace
    "808000", // olive
    "6B8E23", // olivedrab
    "FFA500", // orange
    "FF4500", // orangered
    "DA70D6", // orchid
    "EEE8AA", // palegoldenrod
    "98FB98", // palegreen
    "AFEEEE", // paleturquoise
    "DB7093", // palevioletred
    "FFEFD5", // papayawhip
    "FFDAB9", // peachpuff
    "CD853F", // peru
    "FFC0CB", // pink
    "DDA0DD", // plum
    "B0E0E6", // powderblue
    "800080", // purple
    "FF0000", // red
    "BC8F8F", // rosybrown
    "4169E1", // royalblue
    "8B4513", // saddlebrown
    "FA8072", // salmon
    "F4A460", // sandybrown
    "2E8B57", // seagreen
    "FFF5EE", // seashell
    "A0522D", // sienna
    "C0C0C0", // silver
    "87CEEB", // skyblue
    "6A5ACD", // slateblue
    "708090", // slategray
    "FFFAFA", // snow
    "00FF7F", // springgreen
    "4682B4", // steelblue
    "D2B48C", // tan
    "008080", // teal
    "D8BFD8", // thistle
    "FF6347", // tomato
    "40E0D0", // turquoise
    "EE82EE", // violet
    "F5DEB3", // wheat
    "FFFFFF", // white
    "F5F5F5", // whitesmoke
    "FFFF00", // yellow
    "9ACD32"  // yellowgreen
};
// Font-size keywords listed as invalid.
// NOTE(review): the code that consults this array is outside the visible
// part of the file; presumably text using these sizes is treated as
// concealed - confirm.
private static String[] INVALID_FONT_SIZES = new String[] {
    "x-small",
    "xx-small"
};
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
*/
public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
if (!quit) {
// First, we need to find the default body text color
String color = null;
int defaultColor = DEFAULT_COLOR;
if (t.equals(HTML.Tag.BODY)) {
color = getColor(a, HTML.Attribute.TEXT, COLOR_NAME);
if (color != null) {
// Set this as the default text color
defaultColor = getIntColor(color);
}
} else {
color = getColor(a, HTML.Attribute.COLOR, COLOR_NAME);
}
// We need to determine the current BGColor or Text Color attributes
String bgcolor = getColor(a, HTML.Attribute.BGCOLOR, BGCOLOR_NAME);
int iBGColor = DEFAULT_BGCOLOR;
int iTextColor = defaultColor;
if (bgcolor != null) {
iBGColor = getIntColor(bgcolor);
// Set the current BG Color
currentBGColor = iBGColor;
// Add the color to the stack
if (activeBGColorStack == null) {
activeBGColorStack = new Stack();
}
activeBGColorStack.push(String.valueOf(iBGColor));
// Add the tag to the active tag stack
if (activeBGColorTagStack == null) {
activeBGColorTagStack = new Stack();
}
activeBGColorTagStack.push(t);
}
if (color != null) {
iTextColor = getIntColor(color);
// Set the current BG Color
currentTextColor = iTextColor;
// Add the color to the stack
if (activeColorStack == null) {
activeColorStack = new Stack();
}
activeColorStack.push(String.valueOf(iTextColor));
// Add the tag to the active tag stack
if (activeColorTagStack == null) {
activeColorTagStack = new Stack();
}
activeColorTagStack.push(t);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?