spamtokenizer.java
来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 837 行 · 第 1/2 页
JAVA
837 行
/*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.token;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import org.jasen.core.linguistics.LinguisticAnalyzer;
import org.jasen.core.parsers.URLParser;
import org.jasen.interfaces.TokenErrorRecorder;
import org.jasen.util.DNSUtils;
import org.jasen.util.MimeUtils;
/**
* <p>
* This class is used exclusively by the EmailTokenizer.
* </p>
* @see org.jasen.core.token.EmailTokenizer
* @author Jason Polites
*/
public class SpamTokenizer {
// Minimum token length.
// NOTE(review): presumably tokens shorter than this are rejected (see INVALID_TOKEN_TOO_SHORT);
// the check itself is not visible in this part of the file - confirm.
public static int MIN_TOKEN_LENGTH = 3;
// Maximum token length.
// The longest English word according to the Oxford Dictionary is 29 characters in length;
// SpamBayes recommends 12 as a length.
public static int MAX_TOKEN_LENGTH = 12;
// NOTE(review): the use of this threshold is not visible in this part of the file;
// confirm its semantics before changing the value.
public static double TOKEN_RECOGNITION_THRESHOLD = 0.1d;
//public static int TOKEN_LINGUISTIC_ERROR_THRESHOLD = 3;
// Token validation status codes.
// NOTE(review): presumably produced by a validation routine defined further down the file - confirm.
private static final int VALID_TOKEN = 0;
private static final int INVALID_TOKEN_TOO_LONG = 1;
private static final int INVALID_TOKEN_TOO_SHORT = 2;
private static final int INVALID_TOKEN_STOP_WORD = 3;
private static final int INVALID_TOKEN_LINGUISTIC_ERROR = 4;
private static final int INVALID_TOKEN_ONLY_NUMERIC = 5;
private static final int INVALID_TOKEN_MAX_TOKENS_EXCEEDED = 6;
// The number of tokens to capture before exiting
protected int maxTokens = 50;
// The number of sequential linguistic errors that are allowed before tokenization is ceased
protected int linguisticLimit = 3;
/**
* Creates a tokenizer.
* The value returned by LinguisticAnalyzer.getInstance() is discarded; the call is made
* only so that the linguistics engine is initialised when the tokenizer is constructed.
*/
public SpamTokenizer() {
// Initialise the linguistics engine
LinguisticAnalyzer.getInstance();
}
/**
* Common words (to be tested).
* This list MUST be sorted alphabetically to facilitate a binary array search. The static
* initializer of this class calls Arrays.sort on it, so the literal order below is not relied upon.
* NOTE(review): the literal is not strictly sorted ("again" precedes "after") and "we" is listed
* twice. Neither matters once the array has been sorted, but the literal could be cleaned up.
*/
public static String[] STOP_WORDS =
{
"about",
"again",
"after",
"all",
"and",
"another",
"are",
"arial",
"because",
"been",
"but",
"can",
"did",
"div",
"does",
"down",
"each",
"file",
"find",
"font",
"for",
"from",
"ftp",
"had",
"has",
"have",
"helvetica",
"her",
"him",
"his",
"how",
"href",
"html",
"http",
"into",
"its",
"just",
"know",
"like",
"made",
"mailto",
"make",
"many",
"may",
"more",
"most",
"not",
"one",
"only",
"other",
"our",
"out",
"over",
"said",
"sans",
"see",
"serif",
"she",
"some",
"such",
"than",
"that",
"the",
"their",
"them",
"then",
"there",
"these",
"they",
"this",
"two",
"use",
"very",
"was",
"way",
"we",
"we",
"were",
"what",
"when",
"where",
"which",
"who",
"will",
"with",
"would",
"you",
"your" };
/**
* Characters that the tokenizer looks up as stop characters.
* This list does NOT contain "$,@,?,!" as we want to retain these.
* This array MUST be sorted to facilitate a binary search; the static initializer
* of this class sorts it with Arrays.sort.
*/
public static char[] STOP_CHARS = { '"', '#', // Could indicate an HTML character ref
'\'', '(', ')', '*', '+', ':', ';', '<', '>', '[', '\\', ']', '^', '`', '{', '|', '}' };
/**
* These are characters which should always be treated as delimiters,
* except when within a URL: while a URL is being read, '-' and '_' are
* appended to the current token instead of ending it.
* This array MUST be sorted to facilitate a binary search; the static initializer
* of this class sorts it with Arrays.sort.
*/
public static char[] DELIMITER_CHARS = {
//'&', This can be a prefix for a character entity reference in HTML
'-', '=', '?', '_', '~' };
// Sort the lookup arrays once, when the class is initialised, so that
// Arrays.binarySearch can be used on them regardless of the order of the literals.
// The arrays are public and mutable: code that modifies them afterwards must re-sort them.
static {
Arrays.sort(STOP_WORDS);
Arrays.sort(STOP_CHARS);
Arrays.sort(DELIMITER_CHARS);
}
/**
 * Tokenizes the given text by wrapping it in a StringReader and delegating to
 * {@link #tokenize(Reader, boolean, TokenErrorRecorder)}.
 * <BR><BR>
 * This is used for mail headers specifically
 * @param str The text to tokenize; may be null
 * @param onlyUrls Forwarded unchanged to the reader-based overload (presumably true
 *        restricts the returned tokens to URLs - confirm against that overload)
 * @param recorder Forwarded unchanged to the reader-based overload
 * @return The reduced set of tokens (words), or null if str is null
 * @throws IOException If the reader-based overload fails while reading
 */
public String[] tokenize(String str, boolean onlyUrls, TokenErrorRecorder recorder) throws IOException {
    // Behave like tokenize(String, TokenErrorRecorder): a null input yields null
    // instead of a NullPointerException from the StringReader constructor.
    if (str == null) {
        return null;
    }
    return tokenize(new StringReader(str), onlyUrls, recorder);
}
/**
 * Tokenizes the given text with URL-only mode switched off.
 * @param str The text to tokenize; may be null
 * @param recorder Forwarded unchanged to the reader-based overload
 * @return The tokens produced by the reader-based overload, or null if str is null
 * @throws IOException If the reader-based overload fails while reading
 */
public String[] tokenize(String str, TokenErrorRecorder recorder) throws IOException {
    // Nothing to read when no text was supplied
    if (str == null) {
        return null;
    }
    final Reader source = new StringReader(str);
    return tokenize(source, false, recorder);
}
public String[] tokenize(Reader reader, boolean onlyUrls, TokenErrorRecorder recorder) throws IOException {
int tokenCount = 0;
List tokens = null;
String[] tokenArray = null;
String[] emailTokens = null;
char[] chrs = new char[1];
char[] peekBuffer = new char[1];
char lastChar; // The previous character
char thisChar; // The current character
char nextChar; // The next character
boolean dotFound = false; // Set to true when we hit a '.'
boolean atFound = false; // Set to true when we hit a '@'
boolean urlFound = false; // Set to true when we hit a URL word like HTTP, or MAILTO
boolean urlProcessing = false; // Set to true when we are in the middle of processing a URL
boolean ignoreToSpace = false; // Set to true when we want to ignore all chars until the next space
boolean urlMatch = false;
boolean htmlCharRef = false; // Indicates we have discovered a likely candidate for an HTML character reference
String strHtmlCharRef = null; // Used to store the html character reference
boolean tokenize = false; // Set to true if the current buffer should be tokenized
boolean keepTokenizing = true; // Set to false to force an exit
String token = null;
int read = reader.read(chrs);
int index; // The current index in the current token
int linguisticErrors = 0;
int charIntValue = -1;
// Record char replacements
int iTmpCharReplace = 0;
int iCharReplace = 0;
// Used to find the index into to URL_WORDS array for a matching token
int urlKeyIndex = -1;
if (read != -1) {
thisChar = chrs[0];
lastChar = thisChar;
StringBuffer buffer = new StringBuffer();
do {
read = reader.read(chrs);
nextChar = chrs[0];
charIntValue = (int) thisChar;
token = buffer.toString();
index = token.length();
if (!urlMatch)
urlMatch = urlFound;
// We need to see if we are inside a url
urlKeyIndex = Arrays.binarySearch(URLParser.URL_WORDS, token);
if (urlKeyIndex > -1) {
// Our current token indicates a URL
urlFound = true;
}
if (Arrays.binarySearch(STOP_CHARS, thisChar) > -1) {
if (thisChar == '#') {
// We could have a character entity reference
if (lastChar == '&' && isInteger(nextChar)) {
// This usually indicates an HTML character entity ref
// We ultimately want to replace the number with the actual character
// Ignore the character until we hit a ';'
htmlCharRef = true;
}
//else It's just normal #, ignore it
}
else if (thisChar == ';' && htmlCharRef) {
// We found a valid htmlRefChar, add the character equivalent to the buffer
if (strHtmlCharRef != null) {
try {
int charRef = Integer.parseInt(strHtmlCharRef);
// Append the character equivalent
appendChar((char) charRef, buffer, ignoreToSpace);
}
catch (NumberFormatException e) {
// Ignore this error
e.printStackTrace();
}
strHtmlCharRef = null;
htmlCharRef = false;
}
}
// We have some abnormal character, check the next char
else if (!isSpace(nextChar)) {
// The next character is not a space, we may need to treat the char as a delimiter
// If this char is an apostrophe (or equivalent) there are only a limited
// set of valid next characters
if (isApostrophe(thisChar)) {
// The next char should only be one of:
// s, t, r
if (!isValidApostropheNextChar(nextChar)) {
// We should treat this as a token
tokenize = true;
}
}
}
else if (urlFound) {
// Any of these characters is invalid in a url
tokenize = true;
// And we aren't urling any more
urlFound = false;
urlProcessing = false;
}
}
else if (Arrays.binarySearch(DELIMITER_CHARS, thisChar) > -1) {
// These may not be deimiters if we are in a url
if (urlFound && (thisChar == '-' || thisChar == '_')) {
// We are in a url add the character
appendChar(thisChar, buffer, ignoreToSpace);
}
else {
// Treat as a delimiter
tokenize = true;
}
}
else {
// The character is not a stop/delimiter character
if (isSpace(thisChar)) { // Space
// We are either at a delimiter, or we are just hitting a double space
// A delmiter is indicated by the fact that the last character of the current
// buffer is a "normal" character
// Otherwise we just ignore it
if (!isSpace(lastChar)) {
// The space is a delimiter, add the buffer to the token list
tokenize = true;
}
// If we are ignoring, stop
ignoreToSpace = false;
// And we aren't urling any more
urlFound = false;
urlProcessing = false;
}
else if (thisChar == ',') {
// We could be in a currency amount
if (isInteger(lastChar) && isInteger(nextChar)) {
appendChar(thisChar, buffer, ignoreToSpace);
}
else if (!isSpace(nextChar)) {
// The next character is not a space, we should treat this as a delimiter
tokenize = true;
}
}
else if (thisChar == '@') {
// We could be in an email address
urlFound = true;
appendChar(thisChar, buffer, ignoreToSpace);
}
else if (thisChar == '!') {
// We only want to include one !
if (lastChar != '!') {
appendChar(thisChar, buffer, ignoreToSpace);
}
}
else if (thisChar == '$') {
// We only want to include one $
if (lastChar != '$') {
appendChar(thisChar, buffer, ignoreToSpace);
}
}
else if (thisChar == '/') {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?