spamtokenizer.java
来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 837 行 · 第 1/2 页
JAVA
837 行
// Treat as a delimiter if we are not in a url
// If we are in a url, and we are processing, ignore everything
// until the next space
if (urlProcessing) {
ignoreToSpace = true;
}
else {
// Otherwise treat as a delimiter
tokenize = true;
}
}
else if (thisChar == '.') {
// We could be at the end of a sentence,
// or in a domain name/IP address
// or in a currency amount
// If the dot is the end of a sentence, it should be treated as a delimiter
// Otherwise it is just added as normal
// We didn't get a dot last time
// If the last character was a number, we might be in an IP address or currency
if (isInteger(lastChar)) {
// The last character was a number, we could be an IP or currency
// If the next char is a number, we are
if (isInteger(nextChar) || urlFound) {
// Add the dot
appendChar(thisChar, buffer, ignoreToSpace);
}
} // We may be in an email address or URL
else if ((atFound || urlFound)) {
// We still may be a delimiter
if (isSpace(nextChar)) {
// We are at a delimiter
tokenize = true;
}
else {
// Add the dot
appendChar(thisChar, buffer, ignoreToSpace);
}
} // We may be a delimiter
else {
// We are at a delimiter, don't add the dot, but add the current buffer
tokenize = true;
}
}
else if (thisChar == '\n' || thisChar == '\r') {
// End of line, do nothing
}
else if (thisChar == '%') {
// Only include if the preceding character was an integer
if (isInteger(lastChar)) {
appendChar(thisChar, buffer, ignoreToSpace);
}
}
else if (isExtendedAscii(thisChar)) {
// The character is an "extended" ascii character
// Get the replacement
char replace = getExtendedReplacement(thisChar);
if (replace != 0x0000) {
urlProcessing = urlFound;
appendChar(replace, buffer, ignoreToSpace);
}
// If the char was replaced, record the replacement
if(replace != thisChar) {
// Increment the temp var. We will reset this at the end
iTmpCharReplace++;
}
}
else if (isNormalAscii(thisChar)) {
// Valid ASCII char
urlProcessing = urlFound;
if (htmlCharRef && isInteger((thisChar))) {
// Just add the character to the buffer for the htmlref
if (strHtmlCharRef == null) {
strHtmlCharRef = String.valueOf(thisChar);
}
else {
strHtmlCharRef += thisChar;
}
}
else {
appendChar(thisChar, buffer, ignoreToSpace);
}
}
else {
// We have some abnormal character, treat as a delimiter
tokenize = true;
}
}
// Are we at the end of the stream?
if(read == -1) {
token = buffer.toString();
tokenize = true;
}
if (tokenize) {
// One last check, is the token valid?
switch (isValidToken(token, urlMatch, maxTokens, tokenCount)) {
case VALID_TOKEN :
tokens = addToken(buffer, token, tokens, urlMatch, onlyUrls);
// Record any char replacements
if(iTmpCharReplace > 0) {
iCharReplace += iTmpCharReplace;
}
// Reset the linguistic errors
linguisticErrors = 0;
break;
case INVALID_TOKEN_LINGUISTIC_ERROR :
// We got a linguistic error.. should we quit?
linguisticErrors++;
if (linguisticErrors > linguisticLimit) {
// We have too many linguistic errors, stop tokenizing...
keepTokenizing = false;
// and remove the last THRESHOLD number of tokens
if (tokens != null) {
if (tokens.size() >= linguisticLimit) {
tokens = tokens.subList(0, tokens.size() - linguisticLimit);
}
else {
tokens = null;
}
}
}
else {
// Add the token
tokens = addToken(buffer, token, tokens, urlMatch, onlyUrls);
}
break;
case INVALID_TOKEN_MAX_TOKENS_EXCEEDED :
// We have exceeded the maximum token threshold, just exit
keepTokenizing = false;
break;
}
if(tokens != null) {
tokenCount = tokens.size();
}
// And empty the buffer
buffer.delete(0, buffer.length());
// Reset
tokenize = false;
atFound = false;
dotFound = false;
urlMatch = false;
// Reset char replacement counter
iTmpCharReplace = 0;
}
lastChar = thisChar;
thisChar = nextChar;
}
while (read != -1 && keepTokenizing);
}
if (tokens != null) {
tokenArray = (String[]) tokens.toArray(new String[tokens.size()]);
}
// Now, record any errors...
if(recorder != null) {
CountTokenErrorReport report = new CountTokenErrorReport();
report.increment(iCharReplace);
recorder.record(report);
}
return tokenArray;
}
/**
 * Classifies a candidate token, returning either {@code VALID_TOKEN} or the
 * code naming the first reason it is rejected.
 * <p>
 * Checks run in this order: minimum (trimmed) length, stop-word lookup,
 * all-digit test, linguistic check (skipped for URL tokens), and finally the
 * token-count ceiling.
 *
 * @param token the candidate token; {@code null} is reported as too short
 * @param urlMatch {@code true} if the token was matched as a URL, in which case
 *        the linguistic check is skipped
 * @param maxTokens the maximum number of tokens that may be collected
 * @param tokenCount the number of tokens collected so far
 * @return {@code VALID_TOKEN}, or one of the {@code INVALID_TOKEN_*} codes
 */
private int isValidToken(String token, boolean urlMatch, int maxTokens, int tokenCount) {
    // A missing token can never satisfy the minimum length; reject it instead of
    // failing with a NullPointerException.
    if (token == null || token.trim().length() < MIN_TOKEN_LENGTH) {
        return INVALID_TOKEN_TOO_SHORT;
    }
    // Lower-case with a fixed locale: the no-argument toLowerCase() uses the JVM
    // default locale, under which (e.g. Turkish) "I" does not map to "i" and the
    // stop-word lookup would silently miss.
    // NOTE(review): binarySearch requires STOP_WORDS to be sorted - confirm at its declaration.
    if (Arrays.binarySearch(STOP_WORDS, token.toLowerCase(java.util.Locale.ENGLISH)) > -1) {
        return INVALID_TOKEN_STOP_WORD;
    }
    if (isOnlyNumeric(token)) {
        return INVALID_TOKEN_ONLY_NUMERIC;
    }
    // Don't bother trying to understand a URL
    if (!urlMatch && !LinguisticAnalyzer.getInstance().isWord(token)) {
        return INVALID_TOKEN_LINGUISTIC_ERROR;
    }
    // Equivalent to (tokenCount + 1) > maxTokens, but cannot overflow.
    if (tokenCount >= maxTokens) {
        return INVALID_TOKEN_MAX_TOKENS_EXCEEDED;
    }
    return VALID_TOKEN;
}
/**
 * Appends a token to the token list, creating the list on first use.
 * <p>
 * In URL-only mode the token is recorded only when it matched a URL or is an
 * IP address; a non-IP token is first reduced with
 * {@code DNSUtils.getValidDomainOnly}. Otherwise every token is recorded, and
 * valid e-mail addresses (reduced to the part after the '@'), URL matches
 * (reduced to their domain) and IP addresses are prefixed with
 * {@code URLParser.URL_PREFIX}. A token that is reduced to {@code null} is
 * not added.
 *
 * @param buffer the current character buffer (not used by this method)
 * @param token the token to add
 * @param tokens the list collected so far, or {@code null} if none yet
 * @param urlMatch {@code true} if the token was matched as a URL
 * @param onlyUrls {@code true} if only URL / IP address tokens are wanted
 * @return the list to continue with; {@code tokens} itself when it was non-null
 */
private List addToken(StringBuffer buffer, String token, List tokens, boolean urlMatch, boolean onlyUrls) {
    final boolean isIp = DNSUtils.isIPAddress(token);

    // URL-only mode and the token is neither a URL nor an IP address: nothing to record.
    if (onlyUrls && !urlMatch && !isIp) {
        return tokens;
    }

    final List result = (tokens != null) ? tokens : new LinkedList();
    String entry = token;

    if (onlyUrls) {
        if (!isIp) {
            entry = DNSUtils.getValidDomainOnly(entry);
        }
    }
    else {
        final int at = entry.indexOf('@');
        if (at > -1 && MimeUtils.isValidAddress(entry)) {
            // If we are in an email address, just add the domain
            entry = URLParser.URL_PREFIX + entry.substring(at + 1);
        }
        else if (isIp) {
            entry = URLParser.URL_PREFIX + entry;
        }
        else if (urlMatch) {
            // Just add the last part of the domain
            final String domain = DNSUtils.getValidDomainOnly(entry);
            entry = (domain == null) ? null : URLParser.URL_PREFIX + domain;
        }
    }

    if (entry != null) {
        result.add(entry.trim());
    }
    return result;
}
/**
 * Tells whether every character of the token is a digit 0-9.
 * An empty token has no non-digit character and therefore yields {@code true}.
 *
 * @param token the token to inspect
 * @return {@code false} as soon as a non-digit character is found, otherwise {@code true}
 */
private boolean isOnlyNumeric(String token) {
    final int length = token.length();
    for (int pos = 0; pos < length; pos++) {
        if (!isInteger(token.charAt(pos))) {
            return false;
        }
    }
    return true;
}
/**
 * Returns the next character of the reader without consuming it.
 * <p>
 * The reader is marked, one character is read, and the reader is reset to the
 * mark. Exactly one character is read so that the read-ahead limit of 1 passed
 * to {@code mark} is never exceeded, whatever the size of {@code peekBuffer}.
 *
 * @param reader the reader to peek into
 * @param peekBuffer scratch buffer; when it has room, element 0 receives the
 *        peeked character
 * @return the next character, or {@code (char) -1} (U+FFFF) at end of stream
 * @throws IOException if marking, reading or resetting the reader fails
 */
private char peek(Reader reader, char[] peekBuffer) throws IOException {
    reader.mark(1);
    // Read a single character only: filling the whole buffer could read past the
    // read-ahead limit of 1 and invalidate the mark before reset().
    final int next = reader.read();
    reader.reset();
    if (next < 0) {
        // End of stream sentinel
        return (char) -1;
    }
    final char peeked = (char) next;
    // Keep mirroring the character into the caller's scratch buffer, but do not
    // fail when that buffer is missing or empty.
    if (peekBuffer != null && peekBuffer.length > 0) {
        peekBuffer[0] = peeked;
    }
    return peeked;
}
/**
 * Tells whether the character is one of the four code points treated as a
 * space here: 0x0020, 0x00A0, 0x2007 or 0x202F.
 *
 * @param chr the character to test
 * @return {@code true} if the character is one of those code points
 */
private boolean isSpace(char chr) {
    switch (chr) {
        case 0x0020:
        case 0x00A0:
        case 0x2007:
        case 0x202F:
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether the character is one of the three code points treated as an
 * apostrophe here: 0x0027, 0x0060 or 0x00B4.
 *
 * @param chr the character to test
 * @return {@code true} if the character is one of those code points
 */
private boolean isApostrophe(char chr) {
    switch (chr) {
        case 0x0027:
        case 0x0060:
        case 0x00B4:
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether the character is accepted directly after an apostrophe:
 * one of s, S, t, T, r, R, or the code point 0x00AE.
 *
 * @param chr the character following the apostrophe
 * @return {@code true} if the character is one of the accepted followers
 */
private boolean isValidApostropheNextChar(char chr) {
    switch (chr) {
        case 's':
        case 'S':
        case 't':
        case 'T':
        case 'r':
        case 'R':
        case 0x00AE:
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether the character lies in the range treated as "standard" ASCII
 * here: code 33 ('!') up to and including code 122 ('z').
 *
 * @param chr the character to test
 * @return {@code true} if the character is within that range
 */
private boolean isNormalAscii(char chr) {
    return !(chr < '!' || chr > 'z');
}
/**
 * Tells whether the character lies in the "extended" range, code 128 up to
 * and including code 566: characters outside the normal range which have
 * "likely" replacements.
 *
 * @param chr the character to test
 * @return {@code true} if the character code is between 128 and 566 inclusive
 */
private boolean isExtendedAscii(char chr) {
    final int code = chr;
    return code > 127 && code < 567;
}
/**
 * Tells whether the character is a decimal digit, '0' (code 48) through
 * '9' (code 57).
 *
 * @param chr the character to test
 * @return {@code true} if the character is 0-9
 */
private boolean isInteger(char chr) {
    return '0' <= chr && chr <= '9';
}
/**
 * Appends the character to the buffer unless characters are currently being
 * ignored.
 *
 * @param chr the character to append
 * @param buffer the buffer receiving the character
 * @param ignoreToSpace when {@code true} the character is dropped and the
 *        buffer is left untouched
 */
private void appendChar(char chr, StringBuffer buffer, boolean ignoreToSpace) {
    if (ignoreToSpace) {
        return;
    }
    buffer.append(chr);
}
/**
 * Looks up the replacement for an extended character.
 * <p>
 * The character is searched in {@code LinguisticAnalyzer.EXTENDED_UNICODE_SEARCH};
 * on a hit, the entry at the same index of
 * {@code LinguisticAnalyzer.EXTENDED_UNICODE_REPLACE} is returned.
 * NOTE(review): this relies on the two arrays being parallel and the search
 * array being sorted - confirm at their declaration.
 *
 * @param chr the character to replace
 * @return the replacement character, or {@code chr} itself when it is not found
 */
private char getExtendedReplacement(char chr) {
    final int position = Arrays.binarySearch(LinguisticAnalyzer.EXTENDED_UNICODE_SEARCH, chr);
    if (position < 0) {
        // No replacement known: hand the character back unchanged
        return chr;
    }
    return (char) LinguisticAnalyzer.EXTENDED_UNICODE_REPLACE[position];
}
/**
 * Returns the maximum number of tokens to extract before the tokenization
 * process is aborted.
 *
 * @return the max number of tokens
 */
public int getMaxTokens() {
    return this.maxTokens;
}
/**
 * Sets the maximum number of tokens to extract before the tokenization
 * process is aborted.
 *
 * @param i the new maximum number of tokens
 */
public void setMaxTokens(int i) {
    this.maxTokens = i;
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?