spamtokenizer.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 837 行 · 第 1/2 页

JAVA
837
字号
/*
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.token;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import org.jasen.core.linguistics.LinguisticAnalyzer;
import org.jasen.core.parsers.URLParser;
import org.jasen.interfaces.TokenErrorRecorder;
import org.jasen.util.DNSUtils;
import org.jasen.util.MimeUtils;


/**
 * <p>
 * This class is used exclusively by the EmailTokenizer.
 * </p>
 * @see org.jasen.core.token.EmailTokenizer
 * @author Jason Polites
 */
public class SpamTokenizer {

	/**
	 * Minimum token length, in characters.
	 * NOTE(review): the code that applies this limit is not in this part of the file;
	 * presumably shorter tokens are rejected with INVALID_TOKEN_TOO_SHORT - confirm.
	 */
	public static int MIN_TOKEN_LENGTH = 3;

	// The longest English word according to the Oxford Dictionary is 29 characters in length
	// SpamBayes recommends 12 as a length
	/**
	 * Maximum token length, in characters.
	 * NOTE(review): presumably longer tokens are rejected with INVALID_TOKEN_TOO_LONG - confirm.
	 */
	public static int MAX_TOKEN_LENGTH = 12;

	/**
	 * Threshold used for token recognition.
	 * NOTE(review): its consumer is not visible in this part of the file - confirm how it is applied.
	 */
	public static double TOKEN_RECOGNITION_THRESHOLD = 0.1d;
	//public static int TOKEN_LINGUISTIC_ERROR_THRESHOLD = 3;

	// Token validation result codes: VALID_TOKEN, or one code per rejection reason.
	// NOTE(review): the validation routine that produces these codes is outside this part of the file.
	private static final int VALID_TOKEN = 0;
	private static final int INVALID_TOKEN_TOO_LONG = 1;
	private static final int INVALID_TOKEN_TOO_SHORT = 2;
	private static final int INVALID_TOKEN_STOP_WORD = 3;
	private static final int INVALID_TOKEN_LINGUISTIC_ERROR = 4;
	private static final int INVALID_TOKEN_ONLY_NUMERIC = 5;
	private static final int INVALID_TOKEN_MAX_TOKENS_EXCEEDED = 6;

	// The number of tokens to capture before exiting (defaults to 50; protected, so subclasses may change it)
	protected int maxTokens = 50;

	 // The number of sequential linguistic errors that are allowed before tokenization is ceased (defaults to 3)
	protected int linguisticLimit = 3;

	/**
	 * Creates a tokenizer and calls {@link LinguisticAnalyzer#getInstance()} up front;
	 * the returned instance is not kept here.
	 * NOTE(review): presumably this forces the analyzer to be initialised before the
	 * first call to tokenize - confirm against LinguisticAnalyzer.
	 */
	public SpamTokenizer() {
		// Initialise the linguistics engine
		LinguisticAnalyzer.getInstance();
	}

	/**
	 * Common words (to be tested).
	 * <p>
	 * This list MUST be sorted alphabetically to facilitate a binary array search.
	 * The class's static initializer sorts the array again at load time, but the
	 * literal is kept in sorted order and free of duplicates so that it can be
	 * maintained (and checked by eye) reliably.
	 * </p>
	 */
	public static String[] STOP_WORDS =
		{
			"about",
			"after",
			"again",
			"all",
			"and",
			"another",
			"are",
			"arial",
			"because",
			"been",
			"but",
			"can",
			"did",
			"div",
			"does",
			"down",
			"each",
			"file",
			"find",
			"font",
			"for",
			"from",
			"ftp",
			"had",
			"has",
			"have",
			"helvetica",
			"her",
			"him",
			"his",
			"how",
			"href",
			"html",
			"http",
			"into",
			"its",
			"just",
			"know",
			"like",
			"made",
			"mailto",
			"make",
			"many",
			"may",
			"more",
			"most",
			"not",
			"one",
			"only",
			"other",
			"our",
			"out",
			"over",
			"said",
			"sans",
			"see",
			"serif",
			"she",
			"some",
			"such",
			"than",
			"that",
			"the",
			"their",
			"them",
			"then",
			"there",
			"these",
			"they",
			"this",
			"two",
			"use",
			"very",
			"was",
			"way",
			"we",
			"were",
			"what",
			"when",
			"where",
			"which",
			"who",
			"will",
			"with",
			"would",
			"you",
			"your" };

	/**
	 * Characters that receive special handling in the tokenize loop, where they are
	 * looked up with Arrays.binarySearch.
	 * This list does NOT contain "$,@,?,!" as we want to retain these.
	 * This array MUST be sorted to facilitate a binary search.
	 */
	public static char[] STOP_CHARS = { '"', '#', // Could indicate an HTML character ref
		'\'', '(', ')', '*', '+', ':', ';', '<', '>', '[', '\\', ']', '^', '`', '{', '|', '}' };

	/**
	 * These are characters which should always be treated as delimiters,
	 * except when within a URL (the tokenize loop keeps '-' and '_' while a URL is being read).
	 * This array MUST be sorted to facilitate a binary search.
	 */
	public static char[] DELIMITER_CHARS = {
		//'&', This can be a prefix for a character entity reference in HTML
		'-', '=', '?', '_', '~' };

	// Sort the lookup arrays once, when the class is loaded, so that
	// Arrays.binarySearch can be used on them whatever order the literals were written in
	static {
		Arrays.sort(STOP_WORDS);
		Arrays.sort(STOP_CHARS);
		Arrays.sort(DELIMITER_CHARS);
	}

	/**
	 * Tokenizes the given string by wrapping it in a {@link StringReader} and
	 * delegating to {@link #tokenize(Reader, boolean, TokenErrorRecorder)}.
	 * <BR><BR>
	 * This is used for mail headers specifically
	 * @param str The text to tokenize. May be null, in which case null is returned
	 * (the same contract as {@link #tokenize(String, TokenErrorRecorder)})
	 * @param onlyUrls Passed through unchanged to the reader-based overload.
	 * NOTE(review): the name suggests that only URL tokens are returned when true - confirm
	 * @param recorder Passed through unchanged to the reader-based overload
	 * @return The tokens produced by the reader-based overload, or null if str is null
	 * @throws IOException If the reader-based overload fails while reading
	 */
	public String[] tokenize(String str, boolean onlyUrls, TokenErrorRecorder recorder) throws IOException {
		// new StringReader(null) fails with a NullPointerException; mirror the
		// two-argument overload and report "nothing to tokenize" as null instead
		if (str == null) {
			return null;
		}

		return tokenize(new StringReader(str), onlyUrls, recorder);
	}

	/**
	 * Tokenizes the given string without the URL-only restriction: the text is
	 * wrapped in a {@link StringReader} and handed to
	 * {@link #tokenize(Reader, boolean, TokenErrorRecorder)} with onlyUrls set to false.
	 * @param str The text to tokenize; may be null
	 * @param recorder Passed through unchanged to the reader-based overload
	 * @return The tokens produced by the reader-based overload, or null if str is null
	 * @throws IOException If the reader-based overload fails while reading
	 */
	public String[] tokenize(String str, TokenErrorRecorder recorder) throws IOException {

		// Nothing to tokenize: hand null straight back
		if (str == null) {
			return null;
		}

		Reader source = new StringReader(str);
		return tokenize(source, false, recorder);
	}

	public String[] tokenize(Reader reader, boolean onlyUrls, TokenErrorRecorder recorder) throws IOException {

		int tokenCount = 0;

		List tokens = null;
		String[] tokenArray = null;
		String[] emailTokens = null;

		char[] chrs = new char[1];
		char[] peekBuffer = new char[1];

		char lastChar; // The previous character
		char thisChar; // The current character
		char nextChar; // The next character

		boolean dotFound = false; // Set to true when we hit a '.'
		boolean atFound = false; // Set to true when we hit a '@'
		boolean urlFound = false; // Set to true when we hit a URL word like HTTP, or MAILTO
		boolean urlProcessing = false; // Set to true when we are in the middle of processing a URL
		boolean ignoreToSpace = false; // Set to true when we want to ignore all chars until the next space
		boolean urlMatch = false;
		boolean htmlCharRef = false; // Indicates we have discovered a likely candidate for an HTML character reference
		String strHtmlCharRef = null; // Used to store the html character reference

		boolean tokenize = false; // Set to true if the current buffer should be tokenized

		boolean keepTokenizing = true; // Set to false to force an exit

		String token = null;

		int read = reader.read(chrs);
		int index; // The current index in the current token

		int linguisticErrors = 0;

		int charIntValue = -1;

		// Record char replacements
		int iTmpCharReplace = 0;
		int iCharReplace = 0;

		// Used to find the index into to URL_WORDS array for a matching token
		int urlKeyIndex = -1;

		if (read != -1) {
			thisChar = chrs[0];
			lastChar = thisChar;

			StringBuffer buffer = new StringBuffer();

			do {
				read = reader.read(chrs);
				nextChar = chrs[0];

				charIntValue = (int) thisChar;

				token = buffer.toString();
				index = token.length();

				if (!urlMatch)
					urlMatch = urlFound;

				// We need to see if we are inside a url
				urlKeyIndex = Arrays.binarySearch(URLParser.URL_WORDS, token);

				if (urlKeyIndex > -1) {
					// Our current token indicates a URL
					urlFound = true;
				}

				if (Arrays.binarySearch(STOP_CHARS, thisChar) > -1) {

					if (thisChar == '#') {
						// We could have a character entity reference
						if (lastChar == '&' && isInteger(nextChar)) {
							// This usually indicates an HTML character entity ref
							// We ultimately want to replace the number with the actual character
							// Ignore the character until we hit a ';'
							htmlCharRef = true;
						}
						//else It's just normal #, ignore it
					}
					else if (thisChar == ';' && htmlCharRef) {
						// We found a valid htmlRefChar, add the character equivalent to the buffer
						if (strHtmlCharRef != null) {
							try {
								int charRef = Integer.parseInt(strHtmlCharRef);

								// Append the character equivalent
								appendChar((char) charRef, buffer, ignoreToSpace);

							}
							catch (NumberFormatException e) {
								// Ignore this error
								e.printStackTrace();
							}

							strHtmlCharRef = null;
							htmlCharRef = false;
						}

					}
					// We have some abnormal character, check the next char
					else if (!isSpace(nextChar)) {
						// The next character is not a space, we may need to treat the char as a delimiter
						// If this char is an apostrophe (or equivalent) there are only a limited
						// set of valid next characters
						if (isApostrophe(thisChar)) {

							// The next char should only be one of:
							// s, t, r
							if (!isValidApostropheNextChar(nextChar)) {
								// We should treat this as a token
								tokenize = true;
							}
						}
					}
					else if (urlFound) {
						// Any of these characters is invalid in a url
						tokenize = true;

						// And we aren't urling any more
						urlFound = false;
						urlProcessing = false;
					}
				}
				else if (Arrays.binarySearch(DELIMITER_CHARS, thisChar) > -1) {

					// These may not be deimiters if we are in a url
					if (urlFound && (thisChar == '-' || thisChar == '_')) {
						//	We are in a url add the character
						appendChar(thisChar, buffer, ignoreToSpace);
					}
					else {
						// Treat as a delimiter
						tokenize = true;
					}
				}
				else {
					// The character is not a stop/delimiter character

					if (isSpace(thisChar)) { // Space

						// We are either at a delimiter, or we are just hitting a double space

						// A delmiter is indicated by the fact that the last character of the current
						// buffer is a "normal" character
						// Otherwise we just ignore it
						if (!isSpace(lastChar)) {
							// The space is a delimiter, add the buffer to the token list
							tokenize = true;
						}

						// If we are ignoring, stop
						ignoreToSpace = false;

						// And we aren't urling any more
						urlFound = false;
						urlProcessing = false;
					}
					else if (thisChar == ',') {
						// We could be in a currency amount
						if (isInteger(lastChar) && isInteger(nextChar)) {
							appendChar(thisChar, buffer, ignoreToSpace);
						}
						else if (!isSpace(nextChar)) {
							// The next character is not a space, we should treat this as a delimiter
							tokenize = true;
						}
					}
					else if (thisChar == '@') {
						// We could be in an email address
						urlFound = true;
						appendChar(thisChar, buffer, ignoreToSpace);
					}
					else if (thisChar == '!') {
						// We only want to include one !
						if (lastChar != '!') {
							appendChar(thisChar, buffer, ignoreToSpace);
						}
					}
					else if (thisChar == '$') {
						// We only want to include one $
						if (lastChar != '$') {
							appendChar(thisChar, buffer, ignoreToSpace);
						}
					}

					else if (thisChar == '/') {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?