spamhtmlparser.java
来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 1,082 行 · 第 1/2 页
JAVA
1,082 行
}
if (bgcolor == null && color == null) {
Tag current = null;
// neither were found, we need determine if we should increment our inert counter
if (activeBGColorTagStack != null && activeBGColorTagStack.size() > 0) {
current = (Tag) activeBGColorTagStack.peek();
if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
inertBGColorTagCount++;
}
}
if (activeColorTagStack != null && activeColorTagStack.size() > 0) {
current = (Tag) activeColorTagStack.peek();
if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
inertColorTagCount++;
}
}
}
// Now, we need to determine if we should ignore the next text element
if (!ignoreNext) {
ignoreNext = (calculateColorThreshold() <= contrastThreshold);
if(ignoreNext) {
concealedHtmlCount++;
}
else
{
// Now test for font size in a style tag
String fontSize = getStyleAttributeValue(a, "font-size");
if(fontSize == null) {
if(t.equals(HTML.Tag.FONT)) {
fontSize = (String)a.getAttribute(HTML.Attribute.SIZE);
}
}
if (fontSize != null) {
fontSize = fontSize.replaceAll("px", "");
fontSize = fontSize.replaceAll("pt", "");
fontSize = fontSize.trim();
try {
int iFontSize = (int) Float.parseFloat(fontSize);
ignoreNext = (iFontSize <= microFontSize);
if(iFontSize <= 0) {
concealedHtmlCount++;
}
} catch (NumberFormatException e) {
// We weren't able to treat the size as a number, it may be a valid CSS string
if (Arrays.binarySearch(INVALID_FONT_SIZES, fontSize) > -1) {
//concealedHtmlCount++;
ignoreNext = true;
}
}
}
}
}
// Now test element size
if(!ignoreNext) {
ignoreNext = ignoreElement(t, a);
if(ignoreNext) {
concealedHtmlCount++;
}
}
// reset
currentStyleAttributes = null;
// Now check for cgi urls and images
if(cgiHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
srcCgiCount++;
}
if(imageHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
imageCount++;
}
URLPortTagHandlerResult result = new URLPortTagHandlerResult();
if(portHandler.handleTag(t, a, result) == HTMLTagHandler.MATCH) {
srcPortCount++;
// Add the port to the list
if(urlPorts == null) {
urlPorts = new Vector(5);
}
urlPorts.add(result.getPort());
}
// Check for an anchor tag to get the href
if(t.equals(Tag.A)) {
currentAnchorUrl = (String)a.getAttribute(Attribute.HREF);
if(urlBase != null) {
// The anchor has a base..
currentAnchorUrl = urlBase + currentAnchorUrl;
}
}
else
{
currentAnchorUrl = null;
}
// Check for BASE href
if(t.equals(Tag.BASE)) {
urlBase = (String)a.getAttribute(Attribute.HREF);
if(urlBase != null && !urlBase.endsWith("/")) {
urlBase += "/";
}
}
// We MUST call the super class
super.handleStartTag(t, a, pos);
}
}
/**
 * Receives a run of document text and, while an anchor URL is being tracked,
 * checks whether the visible text poses as a different URL.
 * <p>
 * The text is trimmed and lower-cased. If it starts with "www" or
 * WebUtils.isUrl accepts it, and it differs (ignoring case) from
 * currentAnchorUrl, the false-anchor counter is incremented. The text is
 * always forwarded to the super class afterwards.
 * </p>
 * @param text The characters of the text run
 * @param pos The position value supplied by the parser; passed through unchanged
 */
public void handleText(char[] text, int pos) {
    if (currentAnchorUrl != null) {
        final String visibleText = new String(text).trim().toLowerCase();
        final boolean looksLikeUrl = visibleText.startsWith("www") || WebUtils.isUrl(visibleText);
        // Text that reads like a URL but is not the recorded href may be a deception.
        if (looksLikeUrl && !visibleText.equalsIgnoreCase(currentAnchorUrl)) {
            falseAnchorCount++;
        }
    }
    // The super class must still see the text.
    super.handleText(text, pos);
}
/**
 * Resolves a color (or background color) for a tag, letting the inline style
 * take precedence over the plain HTML attribute.
 *
 * @param a The attribute set of the tag being inspected
 * @param htmlTagAttribute The HTML attribute consulted when the style yields no value
 * @param styleTagAttribute The style property name that is looked up first
 * @return The color text with every '#' removed; the default background or
 * text color string when the value reads "DEFAULT" (in any case); or null
 * when neither the style nor the HTML attribute supplies a value
 */
private String getColor(AttributeSet a, HTML.Attribute htmlTagAttribute, String styleTagAttribute) {
    String found = getStyleAttributeValue(a, styleTagAttribute);
    if (found == null) {
        // Nothing in the style attribute, fall back to the HTML attribute
        found = (String) a.getAttribute(htmlTagAttribute);
    }
    if (found == null) {
        return null;
    }

    // Hashes are never part of the returned value
    found = found.replaceAll("#", "");
    if (!found.equalsIgnoreCase("DEFAULT")) {
        return found;
    }

    // The parser sometimes reports "DEFAULT" rather than null when it cannot
    // determine the color; substitute the matching default.
    return styleTagAttribute.equals(BGCOLOR_NAME) ? DEFAULT_STR_BGCOLOR : DEFAULT_STR_COLOR;
}
/**
 * Splits the tag's inline style attribute into its individual declarations.
 *
 * @param a The attribute set of the tag being inspected
 * @return The lower-cased style text split on ';', or null when the tag has
 * no style attribute or the attribute is blank
 */
private String[] getCurrentStyleAttributes(AttributeSet a) {
    final String rawStyle = (String) a.getAttribute(HTML.Attribute.STYLE);
    if (rawStyle == null || rawStyle.trim().length() == 0) {
        return null;
    }
    return rawStyle.toLowerCase().split(";");
}
/**
 * Looks up the value of a property in the tag's inline style attribute.
 * <p>
 * The split style declarations are cached in currentStyleAttributes; they are
 * computed from the given attribute set only when that cache is null.
 * The first declaration that starts with styleKey and contains a ':' wins;
 * its value is everything after the first ':', trimmed.
 * </p>
 * <p>
 * A declaration that starts with styleKey but has no ':' (for example
 * style="color") is skipped. Previously the whole declaration, i.e. the
 * property name itself, was returned as if it were the value.
 * </p>
 * NOTE(review): the match is a prefix match, so a key such as "font-size"
 * also accepts "font-size-adjust". Left unchanged because the style key
 * constants are not visible here; confirm whether that is intended.
 *
 * @param a The attribute set of the tag being inspected
 * @param styleKey The style property name to look for
 * @return The trimmed value of the first matching declaration, or null when
 * there is no style attribute or no usable matching declaration
 */
private String getStyleAttributeValue(AttributeSet a, String styleKey) {
    if (currentStyleAttributes == null) {
        currentStyleAttributes = getCurrentStyleAttributes(a);
    }
    if (currentStyleAttributes == null) {
        return null;
    }
    for (int i = 0; i < currentStyleAttributes.length; i++) {
        String declaration = currentStyleAttributes[i];
        if (declaration.trim().toLowerCase().indexOf(styleKey) != 0) {
            continue;
        }
        int colon = declaration.indexOf(':');
        if (colon < 0) {
            // Malformed declaration without a value; keep looking
            continue;
        }
        return declaration.substring(colon + 1).trim();
    }
    return null;
}
/**
 * Converts a color string into the integer used for contrast comparison.
 * <p>
 * Hashes are removed and the text is trimmed. If the result is one of
 * HTML_COLOR_NAMES, the corresponding entry of HTML_COLOR_VALUES is converted
 * instead. Otherwise the text is read as hexadecimal: characters outside
 * 0-9, A-F and a-f count as 0, and the values of successive two-digit pairs
 * are added together (so "ffffff" yields 765). A trailing unpaired digit is
 * ignored.
 * </p>
 * <p>
 * A three-character value is treated as shorthand and each digit is doubled
 * ("fff" is read as "ffffff"), so that shorthand and long forms of the same
 * color produce the same result. The leading '#' has already been removed by
 * this point, so shorthand is assumed for any three-character value that is
 * not a color name.
 * </p>
 *
 * @param strColor The color text; must not be null
 * @return The sum of the two-digit hexadecimal components of the color
 */
private int getIntColor(String strColor) {
    strColor = strColor.replaceAll("#", "").trim();

    // See if it's a named color first. A fixed locale keeps the lookup
    // independent of the platform default (e.g. Turkish dotless i).
    int index = Arrays.binarySearch(HTML_COLOR_NAMES, strColor.toLowerCase(java.util.Locale.ENGLISH));
    if (index > -1) {
        return getIntColor(HTML_COLOR_VALUES[index]);
    }

    char[] chars = strColor.toCharArray();
    if (chars.length == 3) {
        // Shorthand notation: every digit stands for a doubled pair
        chars = new char[] {chars[0], chars[0], chars[1], chars[1], chars[2], chars[2]};
    }

    int color = 0;
    for (int i = 0; i + 1 < chars.length; i += 2) {
        color += (hexDigitOrZero(chars[i]) << 4) + hexDigitOrZero(chars[i + 1]);
    }
    return color;
}

/**
 * Returns the numeric value of an ASCII hexadecimal digit, or 0 for any other character.
 */
private int hexDigitOrZero(char chr) {
    if (chr >= '0' && chr <= '9') {
        return chr - '0';
    }
    if (chr >= 'A' && chr <= 'F') {
        return chr - 'A' + 10;
    }
    if (chr >= 'a' && chr <= 'f') {
        return chr - 'a' + 10;
    }
    return 0;
}
/**
 * Computes the contrast between the current text color and the current
 * background color as the difference between the two values divided by the
 * larger of them.
 *
 * @return 0.0 when both values are equal, otherwise (larger - smaller) / larger
 */
private float calculateColorThreshold() {
    final float text = (float) currentTextColor;
    final float background = (float) currentBGColor;
    if (currentTextColor > currentBGColor) {
        return (text - background) / text;
    }
    if (currentBGColor > currentTextColor) {
        return (background - text) / background;
    }
    return 0.0f;
}
/**
 * Returns true if the text within this element should be ignored based on the element size.
 * <p>
 * Height and width are each taken from the inline style when present there,
 * otherwise from the HTML attribute. The element is ignored when either
 * dimension parses to an integer that is less than or equal to
 * microElementSize. The height is examined first.
 * </p>
 * <p>
 * The two dimensions are evaluated independently: a height that cannot be
 * parsed (for example "100%") no longer prevents the width from being checked.
 * </p>
 *
 * @param tag The tag being inspected (not used by the size check)
 * @param a The attribute set of the tag
 * @return true if the height or the width is at or below the micro element size
 */
private boolean ignoreElement(HTML.Tag tag, AttributeSet a) {
    String strWidth = getStyleAttributeValue(a, "width");
    String strHeight = getStyleAttributeValue(a, "height");
    if (strWidth == null) {
        strWidth = (String) a.getAttribute(Attribute.WIDTH);
    }
    if (strHeight == null) {
        strHeight = (String) a.getAttribute(Attribute.HEIGHT);
    }
    return isMicroDimension(strHeight) || isMicroDimension(strWidth);
}

/**
 * Returns true if the dimension, after removing "px" and surrounding whitespace,
 * is an integer no larger than microElementSize; null or non-integer text yields false.
 */
private boolean isMicroDimension(String dimension) {
    if (dimension == null) {
        return false;
    }
    try {
        return Integer.parseInt(dimension.replaceAll("px", "").trim()) <= microElementSize;
    } catch (NumberFormatException ignored) {
        // Not a plain integer (percentage, keyword, ...): cannot be judged by size
        return false;
    }
}
/*
 * (non-Javadoc)
 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
 *
 * Unwinds the background and text color tracking when a tag closes, clears
 * the tracked anchor URL and forwards the end tag to the super class.
 * Seeing the closing html tag sets the quit flag; while that flag is set
 * nothing here is processed and nothing is forwarded.
 */
public void handleEndTag(Tag t, int pos) {
    if (t.equals(HTML.Tag.HTML)) {
        quit = true;
    }
    if (quit) {
        return;
    }

    // Background color: nothing tracked means the default applies
    if (activeBGColorTagStack == null || activeBGColorTagStack.size() == 0) {
        currentBGColor = DEFAULT_BGCOLOR;
    } else {
        Tag bgOwner = (Tag) activeBGColorTagStack.peek();
        if (bgOwner != null && bgOwner.toString().equalsIgnoreCase(t.toString())) {
            if (inertBGColorTagCount > 0) {
                // A nested tag of the same kind that set no color is closing
                inertBGColorTagCount--;
            } else {
                // The tag that set the color is closing; restore the previous one
                activeBGColorTagStack.pop();
                activeBGColorStack.pop();
                if (activeBGColorTagStack.size() > 0) {
                    currentBGColor = Integer.parseInt((String) activeBGColorStack.peek());
                } else {
                    currentBGColor = DEFAULT_BGCOLOR;
                }
            }
        }
    }

    // Text color: same bookkeeping on the text color stacks
    if (activeColorTagStack == null || activeColorTagStack.size() == 0) {
        currentTextColor = DEFAULT_COLOR;
    } else {
        Tag colorOwner = (Tag) activeColorTagStack.peek();
        if (colorOwner != null && colorOwner.toString().equalsIgnoreCase(t.toString())) {
            if (inertColorTagCount > 0) {
                inertColorTagCount--;
            } else {
                activeColorTagStack.pop();
                activeColorStack.pop();
                if (activeColorTagStack.size() > 0) {
                    currentTextColor = Integer.parseInt((String) activeColorStack.peek());
                } else {
                    currentTextColor = DEFAULT_COLOR;
                }
            }
        }
    }

    // Any end tag clears the tracked href
    currentAnchorUrl = null;
    super.handleEndTag(t, pos);
}
/**
 * Gets the number of times concealed html was found.
 * <br/>
 * The counter is incremented while start tags are handled, when the color
 * contrast is at or below the contrast threshold, when a font size of zero
 * or less is found, or when the element size check reports a micro element.
 * @return An integer representing the number of times a concealment was discovered
 */
public int getConcealedHtmlCount() {
return concealedHtmlCount;
}
/**
 * Gets the number of times images were found.
 * <br/>
 * The counter is incremented for every start tag that the image handler reports as a match.
 * @return The number of images in the document
 */
public int getImageCount() {
return imageCount;
}
/**
 * Gets the number of times the source attribute of a tag referenced a remote CGI script.
 * <br/>
 * The counter is incremented for every start tag that the CGI handler reports as a match.
 * @return The number of start tags matched by the CGI handler
 */
public int getSrcCgiCount() {
return srcCgiCount;
}
/**
 * Gets the number of tags in which a url port was found.
 * <br/>
 * This is a count, not the list of ports; the ports themselves are
 * available from getUrlPorts(). The counter is incremented for every start
 * tag that the port handler reports as a match.
 * @return The number of start tags matched by the port handler
 */
public int getSrcPortCount() {
return srcPortCount;
}
/**
 * Gets the list of url ports found in tags in the message html part.
 * <br/>
 * One entry is added for every start tag that the port handler reports as a match.
 * @return The list of ports. NOTE(review): the list is only created when the
 * first port is recorded, so this is presumably null when no port was found;
 * confirm against the field's initial value.
 */
public List getUrlPorts() {
return urlPorts;
}
/**
 * Gets the number of occurrences of "false" anchor tags.
 * <p>
 * These exist where an anchor tag displays a url as the text component,
 * <br/>
 * but this url does not match the actual url of the href.
 * </p>
 * <p>
 * Only text that starts with "www" or is recognized as a url is compared;
 * the comparison ignores case.
 * </p>
 * @return The number of times a false anchor reference was discovered
 */
public int getFalseAnchorCount() {
return falseAnchorCount;
}
/**
 * Gets the threshold for contrast between foreground and background content elements.
 * <br/>
 * In HTML emails, and particularly spam, content is often obscured via the use of low
 * contrast colors or tones between background and foreground elements. For example,
 * the text of the email may be white, and the background white, indicating a contrast of 0.
 * <br/>
 * Content is treated as concealed when the calculated contrast is less than or
 * equal to this threshold.
 * @return A value between 0.0 and 1.0 such that 0.0 indicates no contrast, and 1.0 indicates
 * complete contrast (eg white on black)
 */
public float getContrastThreshold() {
return contrastThreshold;
}
/**
 * Sets the threshold for contrast between foreground and background content elements.
 * <br/>
 * The value is stored as given; no range check is performed here.
 * @see SpamHTMLParser#getContrastThreshold()
 * @param contrastThreshold A value between 0.0 and 1.0
 */
public void setContrastThreshold(float contrastThreshold) {
this.contrastThreshold = contrastThreshold;
}
/**
 * Gets the size (in pixels) of the minimum allowable element dimension (usually height).
 * <br/>
 * Content found inside elements whose height or width is less than or equal to
 * this size is deemed concealed.
 * @return The size in pixels of the smallest allowable element dimension
 */
public int getMicroElementSize() {
return microElementSize;
}
/**
 * Sets the size (in pixels) of the minimum allowable element dimension (usually height).
 * <br/>
 * The value is stored as given; no range check is performed here.
 * @param microElementSize The size in pixels. It is recommended that this be less than 10.
 * Default is 5.
 */
public void setMicroElementSize(int microElementSize) {
this.microElementSize = microElementSize;
}
/**
 * Gets the size (in points) of the minimum allowable font size.
 * <br/>
 * Content found inside tags whose font size is less than or equal to this size
 * is deemed concealed.
 * @return The size in points of the smallest allowable font. Default is 1
 */
public int getMicroFontSize() {
return microFontSize;
}
/**
 * Sets the size (in points) of the minimum allowable font size.
 * <br/>
 * The value is stored as given; no range check is performed here.
 * @param microFontSize A size in points. Default is 1
 */
public void setMicroFontSize(int microFontSize) {
this.microFontSize = microFontSize;
}
/*
 * (non-Javadoc)
 * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
 *
 * Delegates the parse to the super class, then copies the counters and the
 * port list gathered by this parser into the returned data object.
 */
public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
    final ParserData parsed = super.parse(mm, message, tokenizer);
    // The super class is expected to hand back a StandardParserData
    final StandardParserData stats = (StandardParserData) parsed;

    stats.setConcealedHtmlCount(getConcealedHtmlCount());
    stats.setImageCount(getImageCount());
    stats.setSrcCgiCount(getSrcCgiCount());
    stats.setSrcPortCount(getSrcPortCount());
    stats.setPorts(getUrlPorts());
    stats.setFalseAnchorCount(getFalseAnchorCount());

    return stats;
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?