spamtokenizer.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 837 行 · 第 1/2 页

JAVA
837
字号
						// Treat as a delimiter if we are not in a url

						// If we are in a url, and we are processing, ignore everything
						// until the next space
						if (urlProcessing) {
							ignoreToSpace = true;
						}
						else {
							// Otherwise, treat as a delimiter
							tokenize = true;
						}

					}
					else if (thisChar == '.') {

						// We could be at the end of a sentence,
						//	or in a domain name/IP address
						//	or in a currency amount

						// If the dot is the end of a sentence, it should be treated as a delimiter
						// Otherwise it is just added as normal

						// We didn't get a dot last time
						// If the last character was a number, we might be in an IP address or currency
						if (isInteger(lastChar)) {
							// The last character was a number, we could be an IP or currency
							// If the next char is a number, we are
							if (isInteger(nextChar) || urlFound) {
								// Add the dot
								appendChar(thisChar, buffer, ignoreToSpace);
							}
						} // We may be in an email address or URL
						else if ((atFound || urlFound)) {

							// We still may be a delimiter
							if (isSpace(nextChar)) {
								// We are at a delimiter
								tokenize = true;
							}
							else {
								// Add the dot
								appendChar(thisChar, buffer, ignoreToSpace);
							}

						} // We may be a delimiter
						else {
							// We are at a delimiter, don't add the dot, but add the current buffer
							tokenize = true;
						}

					}
					else if (thisChar == '\n' || thisChar == '\r') {
						// End of line, do nothing
					}
					else if (thisChar == '%') {
						// Only include if the preceding character was an integer
						if (isInteger(lastChar)) {
							appendChar(thisChar, buffer, ignoreToSpace);
						}
					}
					else if (isExtendedAscii(thisChar)) {

						// The character is an "extended" ascii character
						// Get the replacement
						char replace = getExtendedReplacement(thisChar);

						if (replace != 0x0000) {

							urlProcessing = urlFound;
							appendChar(replace, buffer, ignoreToSpace);
						}

						// If the char was replaced, record the replacement
						if(replace != thisChar) {
						    // Increment the temp var.  We will reset this at the end
						    iTmpCharReplace++;
						}

					}
					else if (isNormalAscii(thisChar)) {
						// Valid ASCII char
						urlProcessing = urlFound;

						if (htmlCharRef && isInteger((thisChar))) {
							// Just add the character to the buffer for the htmlref
							if (strHtmlCharRef == null) {
								strHtmlCharRef = String.valueOf(thisChar);
							}
							else {
								strHtmlCharRef += thisChar;
							}
						}
						else {
							appendChar(thisChar, buffer, ignoreToSpace);
						}
					}
					else {
						// We have some abnormal character, treat as a delimiter
						tokenize = true;
					}
				}

				// Are we at the end of the stream?
				if(read == -1) {
				    token = buffer.toString();
				    tokenize = true;
				}

				if (tokenize) {

					// One last check, is the token valid?

					switch (isValidToken(token, urlMatch, maxTokens, tokenCount)) {
						case VALID_TOKEN :
							tokens = addToken(buffer, token, tokens, urlMatch, onlyUrls);

							// Record any char replacements
							if(iTmpCharReplace > 0) {
							    iCharReplace += iTmpCharReplace;
							}

							// Reset the linguistic errors
							linguisticErrors = 0;
							break;

						case INVALID_TOKEN_LINGUISTIC_ERROR :
							// We got a linguistic error.. should we quit?
							linguisticErrors++;

							if (linguisticErrors > linguisticLimit) {
								// We have too many linguistic errors, stop tokenizing...
								keepTokenizing = false;

								// and remove the last THRESHOLD number of tokens
								if (tokens != null) {
									if (tokens.size() >= linguisticLimit) {
										tokens = tokens.subList(0, tokens.size() - linguisticLimit);
									}
									else {
										tokens = null;
									}
								}
							}
							else {
								// Add the token
								tokens = addToken(buffer, token, tokens, urlMatch, onlyUrls);
							}

							break;

						case INVALID_TOKEN_MAX_TOKENS_EXCEEDED :

							// We have exceeded the maximum token threshold, just exit
							keepTokenizing = false;
							break;
					}

					if(tokens != null) {
						tokenCount = tokens.size();
					}

					// And empty the buffer
					buffer.delete(0, buffer.length());

					// Reset
					tokenize = false;
					atFound = false;
					dotFound = false;
					urlMatch = false;

					// Reset char replacement counter
					iTmpCharReplace = 0;
				}

				lastChar = thisChar;
				thisChar = nextChar;
			}
			while (read != -1 && keepTokenizing);
		}

		if (tokens != null) {
			tokenArray = (String[]) tokens.toArray(new String[tokens.size()]);
		}

		// Now, record any errors...
		if(recorder != null) {
		    CountTokenErrorReport report = new CountTokenErrorReport();
		    report.increment(iCharReplace);
		    recorder.record(report);
		}

		return tokenArray;
	}

	/**
	 * Classifies a candidate token, returning <code>VALID_TOKEN</code> or one of
	 * the <code>INVALID_TOKEN_*</code> codes.
	 * <p>
	 * The checks run in this order: minimum (trimmed) length, stop-word lookup,
	 * digits-only tokens, linguistic analysis (skipped for URL matches) and
	 * finally the token budget.
	 *
	 * @param token the candidate token; <code>null</code> is reported as too short
	 * @param urlMatch true if the token was matched as a URL, in which case no linguistic analysis is performed
	 * @param maxTokens the maximum number of tokens that may be collected
	 * @param tokenCount the number of tokens collected so far
	 * @return <code>VALID_TOKEN</code>, or the <code>INVALID_TOKEN_*</code> code of the first check that failed
	 */
	private int isValidToken(String token, boolean urlMatch, int maxTokens, int tokenCount) {

		// A missing token can never be valid; treat it like an empty one rather than failing
		if (token == null || token.trim().length() < MIN_TOKEN_LENGTH) {
			return INVALID_TOKEN_TOO_SHORT;
		}

		// Lower-case with a fixed locale so the lookup does not depend on the
		// platform default (e.g. the Turkish dotless i would break "I..." words).
		// NOTE(review): binarySearch presumes STOP_WORDS is sorted and lower-case -- confirm
		if (Arrays.binarySearch(STOP_WORDS, token.toLowerCase(java.util.Locale.ENGLISH)) >= 0) {
			return INVALID_TOKEN_STOP_WORD;
		}

		if (isOnlyNumeric(token)) {
			return INVALID_TOKEN_ONLY_NUMERIC;
		}

		// Don't bother trying to understand a URL
		if (!urlMatch && !LinguisticAnalyzer.getInstance().isWord(token)) {
			return INVALID_TOKEN_LINGUISTIC_ERROR;
		}

		// The token itself is fine; is there still room for it?
		// (written as >= so that tokenCount + 1 can never overflow)
		if (tokenCount >= maxTokens) {
			return INVALID_TOKEN_MAX_TOKENS_EXCEEDED;
		}

		return VALID_TOKEN;
	}

	/**
	 * Adds a token to the token list, creating the list on first use.
	 * <p>
	 * In URL-only mode just URL matches and IP addresses are kept; URL matches
	 * are reduced to their valid domain. Otherwise every token is kept, and
	 * email domains, URL domains and IP addresses are stored with
	 * <code>URLParser.URL_PREFIX</code> prepended. A token for which no valid
	 * domain can be derived is not added.
	 *
	 * @param buffer the current character buffer (not used by this method)
	 * @param token the token text to add
	 * @param tokens the list collected so far, or <code>null</code> if none has been created yet
	 * @param urlMatch true if the token was matched as a URL
	 * @param onlyUrls true if only URLs and IP addresses should be collected
	 * @return the list holding the tokens; <code>tokens</code> itself is returned unchanged
	 *         (possibly <code>null</code>) when the token is rejected in URL-only mode
	 */
	private List addToken(StringBuffer buffer, String token, List tokens, boolean urlMatch, boolean onlyUrls) {

		final boolean isIp = DNSUtils.isIPAddress(token);

		// URL-only mode ignores anything that is neither a URL nor an IP address
		if (onlyUrls && !urlMatch && !isIp) {
			return tokens;
		}

		final List target = (tokens == null) ? new LinkedList() : tokens;
		String entry = token;

		if (onlyUrls) {
			// IP addresses are stored verbatim, URLs are cut down to their domain
			if (!isIp) {
				entry = DNSUtils.getValidDomainOnly(token);
			}
		}
		else {
			final int at = token.indexOf('@');

			if (at >= 0 && MimeUtils.isValidAddress(token)) {
				// Email address: keep only the domain part
				entry = URLParser.URL_PREFIX + token.substring(at + 1);
			}
			else if (isIp) {
				entry = URLParser.URL_PREFIX + token;
			}
			else if (urlMatch) {
				// URL: keep only the valid domain, if there is one
				final String domain = DNSUtils.getValidDomainOnly(token);
				entry = (domain == null) ? null : URLParser.URL_PREFIX + domain;
			}
		}

		if (entry != null) {
			target.add(entry.trim());
		}

		return target;
	}

	/**
	 * Tests whether a token consists solely of the digits 0-9.
	 * An empty token contains no non-digit and is therefore reported as numeric.
	 *
	 * @param token the token to inspect
	 * @return true if no character of the token is a non-digit
	 */
	private boolean isOnlyNumeric(String token) {

		final int length = token.length();
		int pos = 0;

		// Skip over the leading run of digits
		while (pos < length && isInteger(token.charAt(pos))) {
			pos++;
		}

		// Numeric only if that run covers the whole token
		return pos == length;
	}

	/**
	 * Returns the next character of the reader without consuming it.
	 * <p>
	 * The reader is marked, a single character is read into the first slot of
	 * <code>peekBuffer</code>, and the reader is then reset to the mark.
	 *
	 * @param reader the reader to look ahead in; it must support mark/reset
	 * @param peekBuffer scratch buffer with room for at least one character
	 * @return the next character, or <code>(char) -1</code> if the end of the stream has been reached
	 * @throws IOException if reading, marking or resetting the reader fails
	 */
	private char peek(Reader reader, char[] peekBuffer) throws IOException {

		reader.mark(1);

		// Read exactly one character: the mark only guarantees a read-ahead of one,
		// so filling a larger buffer could invalidate it and make reset() fail
		final int count = reader.read(peekBuffer, 0, 1);
		final char peeked = (count > 0) ? peekBuffer[0] : (char) -1;

		reader.reset();

		return peeked;
	}

	/**
	 * Returns true if the character is one of the space characters recognised
	 * by the tokenizer.
	 *
	 * @param chr the character to test
	 * @return true for U+0020, U+00A0, U+2007 and U+202F, false otherwise
	 */
	private boolean isSpace(char chr) {
		switch (chr) {
			case 0x0020 : // space
			case 0x00A0 : // no-break space
			case 0x2007 : // figure space
			case 0x202F : // narrow no-break space
				return true;
			default :
				return false;
		}
	}

	/**
	 * Returns true if the character is an apostrophe or one of the accent
	 * characters treated as an apostrophe.
	 *
	 * @param chr the character to test
	 * @return true for U+0027, U+0060 and U+00B4, false otherwise
	 */
	private boolean isApostrophe(char chr) {
		switch (chr) {
			case 0x0027 : // apostrophe
			case 0x0060 : // grave accent
			case 0x00B4 : // acute accent
				return true;
			default :
				return false;
		}
	}

	/**
	 * Returns true if the character is accepted directly after an apostrophe.
	 * The accepted characters are s, t and r in either case, plus U+00AE.
	 *
	 * @param chr the character following the apostrophe
	 * @return true if the character is one of the accepted followers
	 */
	private boolean isValidApostropheNextChar(char chr) {
		// s, S, t, T, r, R and U+00AE (registered sign)
		return "sStTrR\u00AE".indexOf(chr) >= 0;
	}

	/**
	 * Returns true if the character is a "standard" ascii character, i.e. it
	 * lies between '!' (33) and 'z' (122) inclusive.
	 *
	 * @param chr the character to test
	 * @return true if the character is within that range
	 */
	private boolean isNormalAscii(char chr) {
		return '!' <= chr && chr <= 'z';
	}

	/**
	 * Returns true if the character is an "extended" character: one outside
	 * the normal range, with a code between 128 and 566 inclusive. These are
	 * the characters for which a "likely" replacement is looked up.
	 *
	 * @param chr the character to test
	 * @return true if the character code is within 128..566
	 */
	private boolean isExtendedAscii(char chr) {
		if (chr < 128) {
			return false;
		}
		return chr <= 566;
	}

	/**
	 * Returns true if the character is one of the digits 0-9.
	 *
	 * @param chr the character to test
	 * @return true if the character is between '0' and '9' inclusive
	 */
	private boolean isInteger(char chr) {
		return '0' <= chr && chr <= '9';
	}

	/**
	 * Appends a character to the buffer unless characters are currently being
	 * ignored.
	 *
	 * @param chr the character to append
	 * @param buffer the buffer receiving the character
	 * @param ignoreToSpace true if the character must be dropped instead of appended
	 */
	private void appendChar(char chr, StringBuffer buffer, boolean ignoreToSpace) {
		if (ignoreToSpace) {
			// Currently skipping input: leave the buffer untouched
			return;
		}

		buffer.append(chr);
	}

	/**
	 * Looks up the replacement for an "extended" character.
	 * <p>
	 * The character is binary-searched in
	 * <code>LinguisticAnalyzer.EXTENDED_UNICODE_SEARCH</code>; on a hit the
	 * entry at the same index of
	 * <code>LinguisticAnalyzer.EXTENDED_UNICODE_REPLACE</code> is returned.
	 *
	 * @param chr the character to replace
	 * @return the replacement character, or <code>chr</code> itself if no replacement is registered
	 */
	private char getExtendedReplacement(char chr) {

		final int pos = Arrays.binarySearch(LinguisticAnalyzer.EXTENDED_UNICODE_SEARCH, chr);

		// A negative position means the character has no registered replacement
		return (pos >= 0) ? (char) LinguisticAnalyzer.EXTENDED_UNICODE_REPLACE[pos] : chr;
	}

	/**
	 * Returns the maximum number of tokens that will be extracted before the
	 * tokenization process is aborted.
	 *
	 * @return the current token limit
	 */
	public int getMaxTokens() {
		return this.maxTokens;
	}

	/**
	 * Sets the maximum number of tokens to be extracted before the
	 * tokenization process is aborted.
	 *
	 * @param i the new token limit
	 */
	public void setMaxTokens(int i) {
		this.maxTokens = i;
	}



}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?