spamhtmlparser.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 1,082 行 · 第 1/2 页

JAVA
1,082
字号
			}

			if (bgcolor == null && color == null) {

				Tag current = null;

				// neither were found, we need determine if we should increment our inert counter
				if (activeBGColorTagStack != null && activeBGColorTagStack.size() > 0) {

					current = (Tag) activeBGColorTagStack.peek();

					if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
						inertBGColorTagCount++;
					}
				}

				if (activeColorTagStack != null && activeColorTagStack.size() > 0) {

					current = (Tag) activeColorTagStack.peek();

					if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
						inertColorTagCount++;
					}
				}
			}

			// Now, we need to determine if we should ignore the next text element
			if (!ignoreNext) {
				ignoreNext = (calculateColorThreshold() <= contrastThreshold);

				if(ignoreNext) {
				    concealedHtmlCount++;
				}
				else
				{

				    // Now test for font size in a style tag

					String fontSize = getStyleAttributeValue(a, "font-size");

					if(fontSize == null) {
						if(t.equals(HTML.Tag.FONT)) {
							fontSize = (String)a.getAttribute(HTML.Attribute.SIZE);
						}
					}

					if (fontSize != null) {
						fontSize = fontSize.replaceAll("px", "");
						fontSize = fontSize.replaceAll("pt", "");
						fontSize = fontSize.trim();

						try {
							int iFontSize = (int) Float.parseFloat(fontSize);

							ignoreNext = (iFontSize <= microFontSize);

							if(iFontSize <= 0) {
							    concealedHtmlCount++;
							}

						} catch (NumberFormatException e) {
							// We weren't able to treat the size as a number, it may be a valid CSS string
							if (Arrays.binarySearch(INVALID_FONT_SIZES, fontSize) > -1) {
							    //concealedHtmlCount++;
								ignoreNext = true;
							}
						}
					}
				}
			}

			// Now test element size
			if(!ignoreNext) {
			    ignoreNext = ignoreElement(t, a);

			    if(ignoreNext) {
			        concealedHtmlCount++;
			    }
			}

			// reset
			currentStyleAttributes = null;

			// Now check for cgi urls and images
			if(cgiHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
			    srcCgiCount++;
			}

			if(imageHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
			    imageCount++;
			}

			URLPortTagHandlerResult result = new URLPortTagHandlerResult();

			if(portHandler.handleTag(t, a, result) == HTMLTagHandler.MATCH) {
			    srcPortCount++;

			    // Add the port to the list
			    if(urlPorts == null) {
			        urlPorts = new Vector(5);
			    }

			    urlPorts.add(result.getPort());
			}


			// Check for an anchor tag to get the href
			if(t.equals(Tag.A)) {
			    currentAnchorUrl = (String)a.getAttribute(Attribute.HREF);

                if(urlBase != null) {
                    // The anchor has a base..
                    currentAnchorUrl = urlBase + currentAnchorUrl;
                }
			}
			else
			{
			    currentAnchorUrl = null;
			}

			// Check for BASE href
			if(t.equals(Tag.BASE)) {
			    urlBase = (String)a.getAttribute(Attribute.HREF);

			    if(urlBase != null && !urlBase.endsWith("/")) {
			        urlBase += "/";
			    }
			}


			// We MUST call the super class
			super.handleStartTag(t, a, pos);
		}
	}



    /**
     * Receives a run of text from the parser and, while an anchor href is being
     * tracked, checks whether the visible text advertises a different URL than
     * the href itself.
     *
     * @param text The characters of the text run
     * @param pos The position reported by the parser; passed on to the super class unchanged
     */
    public void handleText(char[] text, int pos) {
        // Only text seen while an anchor url is recorded is of interest here
        if(currentAnchorUrl != null) {

            // Normalise the visible text before inspecting it
            String visibleText = new String(text).trim().toLowerCase();

            boolean looksLikeUrl = visibleText.startsWith("www") || WebUtils.isUrl(visibleText);

            if(looksLikeUrl && !visibleText.equalsIgnoreCase(currentAnchorUrl)) {
                // The text reads like a URL but is not the one the href points at.
                // This could be a deception
                falseAnchorCount++;
            }
        }

        // The super class always gets to see the text as well
        super.handleText(text, pos);
    }
	/**
	 * Attempts to find a color/bgcolor value for a tag.
	 * <p>
	 * The inline style declaration is consulted first; the plain HTML attribute
	 * is only used when the style does not supply a value.  Hash characters are
	 * removed from whatever is found.
	 * </p>
	 * @param a The attributes of the tag being inspected
	 * @param htmlTagAttribute The HTML attribute to fall back to
	 * @param styleTagAttribute The name of the style property to look for first
	 * @return The color string without hashes, or null if neither source supplied one
	 */
	private String getColor(AttributeSet a, HTML.Attribute htmlTagAttribute, String styleTagAttribute) {

		// style declarations override HTML attributes
		String resolved = getStyleAttributeValue(a, styleTagAttribute);

		if (resolved == null) {
			// Nothing in the style, fall back to the HTML attribute
			resolved = (String) a.getAttribute(htmlTagAttribute);
		}

		if (resolved == null) {
			return null;
		}

		// Ensure we have removed hashes
		resolved = resolved.replaceAll("#", "");

		if (!resolved.equalsIgnoreCase("DEFAULT")) {
			return resolved;
		}

		// The parser sometimes hands back the literal value "DEFAULT" instead of
		// null when it cannot determine the color; substitute our own default
		// for the kind of color that was asked for.
		return styleTagAttribute.equals(BGCOLOR_NAME) ? DEFAULT_STR_BGCOLOR : DEFAULT_STR_COLOR;
	}

	/**
	 * Splits the inline style attribute of a tag into its individual declarations.
	 * @param a The attributes of the tag being inspected
	 * @return The lower-cased style text split on ';', or null when the tag has
	 * no style attribute or the attribute contains only whitespace
	 */
	private String[] getCurrentStyleAttributes(AttributeSet a) {
		String rawStyle = (String) a.getAttribute(HTML.Attribute.STYLE);

		// Nothing to split when the style is absent or blank
		if (rawStyle == null || rawStyle.trim().length() == 0) {
			return null;
		}

		return rawStyle.toLowerCase().split(";");
	}

	/**
	 * Looks up the value of a single property in the inline style of a tag.
	 * <p>
	 * The style attribute is split into declarations on first use and kept in
	 * currentStyleAttributes, so repeated lookups for the same tag do not
	 * re-split the attribute.
	 * </p>
	 * <p>
	 * A declaration is selected when its trimmed, lower-cased text starts with
	 * styleKey.  Declarations that carry no ':' separator have no value and are
	 * skipped; they are never reported with the property name as the value.
	 * </p>
	 * @param a The attributes of the tag being inspected
	 * @param styleKey The (lower case) name of the style property to find
	 * @return The trimmed text after the first ':' of the first matching
	 * declaration, or null if there is no such declaration
	 */
	private String getStyleAttributeValue(AttributeSet a, String styleKey) {

		if (currentStyleAttributes == null) {
			currentStyleAttributes = getCurrentStyleAttributes(a);
		}

		if (currentStyleAttributes == null) {
			// The tag has no usable style attribute
			return null;
		}

		for (int i = 0; i < currentStyleAttributes.length; i++) {

			String declaration = currentStyleAttributes[i];

			// NOTE(review): this is a prefix match, so a key such as "font-size"
			// also selects "font-size-adjust".  Left as is because callers may
			// rely on it - confirm before tightening to an exact name match.
			if (declaration.trim().toLowerCase().indexOf(styleKey) != 0) {
				continue;
			}

			int separator = declaration.indexOf(':');

			if (separator < 0) {
				// A bare property name (e.g. style="color") carries no value;
				// keep looking for a well formed declaration instead of
				// handing back the name itself.
				continue;
			}

			return declaration.substring(separator + 1).trim();
		}

		return null;
	}



	/**
	 * Converts a color string into a single brightness-like number.
	 * <p>
	 * The result is NOT a packed RGB value: it is the sum of the two-digit hex
	 * channel values found in the string (so "ffffff" yields 255 + 255 + 255).
	 * </p>
	 * <ul>
	 * <li>Hash characters and surrounding whitespace are removed first.</li>
	 * <li>A name found in HTML_COLOR_NAMES is resolved through HTML_COLOR_VALUES.</li>
	 * <li>Any character that is not an ASCII hex digit is treated as '0'.</li>
	 * <li>A three digit value is the CSS shorthand form and is expanded by
	 * doubling each digit ("fff" is treated as "ffffff") so that both spellings
	 * of the same color produce the same number.</li>
	 * <li>For other lengths a trailing unpaired digit is not counted.</li>
	 * </ul>
	 * @param strColor The color string, with or without a leading hash; must not be null
	 * @return The sum of the hex channel values
	 */
	private int getIntColor(String strColor) {

		strColor = strColor.replaceAll("#", "").trim();

		// See if it's a named color first
		int index = Arrays.binarySearch(HTML_COLOR_NAMES, strColor.toLowerCase());

		if (index > -1) {
			return getIntColor(HTML_COLOR_VALUES[index]);
		}

		char[] digits = strColor.toCharArray();

		// Each character should be between 0 and F (hex); anything else counts as 0
		for (int i = 0; i < digits.length; i++) {
			char chr = digits[i];

			boolean hex = (chr >= '0' && chr <= '9')
					|| (chr >= 'A' && chr <= 'F')
					|| (chr >= 'a' && chr <= 'f');

			if (!hex) {
				digits[i] = '0';
			}
		}

		if (digits.length == 3) {
			// Shorthand notation: every digit stands for a doubled channel value
			digits = new char[] { digits[0], digits[0], digits[1], digits[1], digits[2], digits[2] };
		}

		int color = 0;

		// Add up the channels, two hex digits at a time
		for (int i = 0; i + 1 < digits.length; i += 2) {
			color += (Character.digit(digits[i], 16) << 4) + Character.digit(digits[i + 1], 16);
		}

		return color;
	}

	/**
	 * Calculates the relative difference between the current text color and the
	 * current background color.
	 * @return The difference between the two values divided by the larger of
	 * the two, or 0.0 when both values are equal
	 */
	private float calculateColorThreshold() {

		if (currentTextColor == currentBGColor) {
			// Identical values, no contrast at all
			return 0.0f;
		}

		boolean textIsLarger = currentTextColor > currentBGColor;

		float larger = textIsLarger ? (float) currentTextColor : (float) currentBGColor;
		float smaller = textIsLarger ? (float) currentBGColor : (float) currentTextColor;

		return (larger - smaller) / larger;
	}

	/**
	 * Returns true if the text within this element should be ignored based on the element size.
	 * <p>
	 * Width and height are taken from the inline style when present, otherwise
	 * from the HTML width/height attributes.  The element is ignored when either
	 * dimension is a whole number no larger than microElementSize.  Each
	 * dimension is evaluated on its own, so a height that is not a plain number
	 * (for example a percentage) does not prevent the width from being checked.
	 * </p>
	 * @param tag The tag being inspected (not used by the size check itself)
	 * @param a The attributes of the tag
	 * @return true if the height or the width marks the element as too small to be visible
	 */
	private boolean ignoreElement(HTML.Tag tag, AttributeSet a) {

		String strWidth = getStyleAttributeValue(a, "width");
		String strHeight = getStyleAttributeValue(a, "height");

		if (strWidth == null) {
			strWidth = (String) a.getAttribute(Attribute.WIDTH);
		}

		if (strHeight == null) {
			strHeight = (String) a.getAttribute(Attribute.HEIGHT);
		}

		// Height is tested first; width is only consulted when height does not decide
		return isMicroDimension(strHeight) || isMicroDimension(strWidth);
	}

	/**
	 * Tests a single width/height value against microElementSize.
	 * @param dimension The raw value, may be null and may carry a "px" suffix in any case
	 * @return true only if the value is a whole number no larger than microElementSize
	 */
	private boolean isMicroDimension(String dimension) {

		if (dimension == null) {
			return false;
		}

		String number = dimension.toLowerCase().replaceAll("px", "").trim();

		try {
			return Integer.parseInt(number) <= microElementSize;
		} catch (NumberFormatException ignored) {
			// Not a plain pixel count (percentage, em, empty, ...).
			// Such a value cannot be compared, so it never hides the element.
			return false;
		}
	}

	/* (non-Javadoc)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
	 *
	 * Unwinds the color and bgcolor tracking when a tag closes, clears the
	 * recorded anchor url and passes the event on to the super class.  Once a
	 * closing html tag has been seen, this and every later end tag is dropped.
	 */
	public void handleEndTag(Tag t, int pos) {

		// A closing /html means everything from here on is to be ignored
		if (t.equals(HTML.Tag.HTML)) {
			quit = true;
		}

		if (quit) {
			return;
		}

		String closingName = t.toString();

		// --- background color ---
		if (activeBGColorTagStack == null || activeBGColorTagStack.size() == 0) {
			// Nothing is being tracked, fall back to the default
			currentBGColor = DEFAULT_BGCOLOR;
		} else {
			Tag bgOwner = (Tag) activeBGColorTagStack.peek();

			// Only a tag of the same name as the top of the stack is of interest
			if (bgOwner != null && bgOwner.toString().equalsIgnoreCase(closingName)) {

				if (inertBGColorTagCount > 0) {
					// This closes a same-named tag that did not set a color itself
					inertBGColorTagCount--;
				} else {
					// This closes the tag that set the color, drop it from both stacks
					activeBGColorTagStack.pop();
					activeBGColorStack.pop();

					// Restore the enclosing color, or the default when none is left
					if (activeBGColorTagStack.size() > 0) {
						currentBGColor = Integer.parseInt((String) activeBGColorStack.peek());
					} else {
						currentBGColor = DEFAULT_BGCOLOR;
					}
				}
			}
		}

		// --- text color ---
		if (activeColorTagStack == null || activeColorTagStack.size() == 0) {
			// Nothing is being tracked, fall back to the default
			currentTextColor = DEFAULT_COLOR;
		} else {
			Tag colorOwner = (Tag) activeColorTagStack.peek();

			if (colorOwner != null && colorOwner.toString().equalsIgnoreCase(closingName)) {

				if (inertColorTagCount > 0) {
					// This closes a same-named tag that did not set a color itself
					inertColorTagCount--;
				} else {
					// This closes the tag that set the color, drop it from both stacks
					activeColorTagStack.pop();
					activeColorStack.pop();

					// Restore the enclosing color, or the default when none is left
					if (activeColorTagStack.size() > 0) {
						currentTextColor = Integer.parseInt((String) activeColorStack.peek());
					} else {
						currentTextColor = DEFAULT_COLOR;
					}
				}
			}
		}

		// Any end tag ends the tracking of the current href
		currentAnchorUrl = null;

		super.handleEndTag(t, pos);
	}

	/**
	 * Reports how often concealed html has been counted so far.
	 * @return The current value of the concealment counter
	 */
    public int getConcealedHtmlCount() {
        return this.concealedHtmlCount;
    }

    /**
     * Reports how many images have been counted so far.
     * @return The current value of the image counter
     */
    public int getImageCount() {
        return this.imageCount;
    }

    /**
     * Reports how many times the source attribute of a tag referenced a remote CGI script.
     * @return The current value of the CGI reference counter
     */
    public int getSrcCgiCount() {
        return this.srcCgiCount;
    }


    /**
     * Reports how many url ports have been counted so far.
     * This is the number of matches only; the ports themselves are available
     * from {@link #getUrlPorts()}.
     * @return The current value of the port counter
     */
    public int getSrcPortCount() {
        return this.srcPortCount;
    }
    
    /**
     * Gets the list of url ports collected while parsing the message html part.
     * @return The internal list of ports (not a copy).  NOTE(review): the list
     * appears to be created only when the first port is found, so this may be
     * null when none was - confirm against the field declaration.
     */
    public List getUrlPorts() {
        return this.urlPorts;
    }


    /**
     * Gets the number of occurrences of "false" anchor tags.
     * <p>
     * A false anchor is one whose visible text reads like a url,
     * <br/>
     * while that url is not the one recorded for the anchor's href.
     * </p>
     * @return The number of times a false anchor reference was discovered
     */
    public int getFalseAnchorCount() {
        return this.falseAnchorCount;
    }
    
    
    /**
     * Gets the threshold for contrast between foreground and background content elements.
     * <br/>
     * In HTML emails, and particularly spam, content is often hidden by giving
     * the text and the background the same or nearly the same color.  White text
     * on a white background, for example, has a contrast of 0.
     * @return A value between 0.0 and 1.0 such that 0.0 indicates no contrast, and 1.0 indicates
     * complete contrast (eg white on black)
     */
    public float getContrastThreshold() {
        return this.contrastThreshold;
    }
    
    /**
     * Sets the threshold for contrast between foreground and background content elements.
     * <br/>
     * The value is stored as given; no range check is performed here.
     * @see SpamHTMLParser#getContrastThreshold()
     * @param contrastThreshold A value between 0.0 and 1.0
     */
    public void setContrastThreshold(float contrastThreshold) {
        this.contrastThreshold = contrastThreshold;
    }
    
    /**
     * Gets the size (in pixels) of the minimum allowable element dimension (usually height).
     * <br/>
     * Content inside an element whose width or height is no larger than this
     * size is deemed concealed
     * @return The size in pixels of the smallest allowable element dimension
     */
    public int getMicroElementSize() {
        return this.microElementSize;
    }
    
    /**
     * Sets the size (in pixels) of the minimum allowable element dimension (usually height).
     * <br/>
     * The value is stored as given; no range check is performed here.
     * @param microElementSize The size in pixels.  It is recommended that this be less than 10.
     * Default is 5.
     */
    public void setMicroElementSize(int microElementSize) {
        this.microElementSize = microElementSize;
    }
    
    /**
     * Gets the size (in points) of the minimum allowable font size.
     * <br/>
     * Content whose font size is no larger than this size is deemed concealed
     * @return The size in points of the smallest allowable font.  Default is 1
     */
    public int getMicroFontSize() {
        return this.microFontSize;
    }
    
    /**
     * Sets the size (in points) of the minimum allowable font size.
     * <br/>
     * The value is stored as given; no range check is performed here.
     * @param microFontSize A size in points.  Default is 1
     */
    public void setMicroFontSize(int microFontSize) {
        this.microFontSize = microFontSize;
    }
    
    /*
     * (non-Javadoc)
     * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
     *
     * Delegates the parse to the super class and then copies the counters
     * gathered by this parser onto the returned data object.
     */
    public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
        // The super class performs the actual parse
        ParserData parsed = super.parse(mm, message, tokenizer);

        // The result is expected to be the standard implementation
        StandardParserData data = (StandardParserData) parsed;

        // Publish everything that was counted while the callbacks ran
        data.setConcealedHtmlCount(getConcealedHtmlCount());
        data.setImageCount(getImageCount());
        data.setSrcCgiCount(getSrcCgiCount());
        data.setSrcPortCount(getSrcPortCount());
        data.setPorts(getUrlPorts());
        data.setFalseAnchorCount(getFalseAnchorCount());

        return data;
    }


}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?