📄 linguisticanalyzer.java

📁 spam source codejasen-0.9jASEN - java Anti Spam ENgine.zip 如标题所示
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
		0x0055, // U
		0x0059, // Y
		0x0070, // p
		0x0042, // B
		0x0061, // a
		0x0061, // a
		0x0061, // a
		0x0061, // a
		0x0061, // a
		0x0061, // a
		0x0061, // a
		0x0063, // c
		0x0065, // e
		0x0065, // e
		0x0065, // e
		0x0065, // e
		0x0069, // i
		0x0069, // i
		0x0069, // i
		0x0069, // i
		0x006F, // o
		0x006E, // n
		0x006F, // o
		0x006F, // o
		0x006F, // o
		0x006F, // o
		0x006F, // o
		0x006F, // o
		0x0075, // u
		0x0075, // u
		0x0075, // u
		0x0075, // u
		0x0079, // y
		0x0070, // p
		0x0079,  // y

		0x0041,
		0x0061,
		0x0041,
		0x0061,
		0x0041,
		0x0061,
		0x0043,
		0x0063,
		0x0043,
		0x0063,
		0x0043,
		0x0063,
		0x0043,
		0x0063,
		0x0044,
		0x0064,
		0x0044,
		0x0064,
		0x0045,
		0x0065,
		0x0045,
		0x0065,
		0x0045,
		0x0065,
		0x0045,
		0x0065,
		0x0045,
		0x0065,
		0x0047,
		0x0067,
		0x0047,
		0x0067,
		0x0047,
		0x0067,
		0x0047,
		0x0067,
		0x0048,
		0x0068,
		0x0048,
		0x0068,
		0x0049,
		0x0069,
		0x0049,
		0x0069,
		0x0049,
		0x0069,
		0x0049,
		0x0069,
		0x0049,
		0x0069,
		0x004A,
		0x006A,
		0x004B,
		0x006B,
		0x004B,
		0x004C,
		0x006C,
		0x004C,
		0x006C,
		0x004C,
		0x006C,
		0x004C,
		0x006C,
		0x004C,
		0x006C,
		0x004E,
		0x006E,
		0x004E,
		0x006E,
		0x004E,
		0x006E,
		0x006E,
		0x004E,
		0x006E,
		0x004F,
		0x006F,
		0x004F,
		0x006F,
		0x004F,
		0x006F,
		0x0052,
		0x0072,
		0x0052,
		0x0072,
		0x0052,
		0x0072,
		0x0053,
		0x0073,
		0x0053,
		0x0073,
		0x0053,
		0x0073,
		0x0053,
		0x0073,
		0x0054,
		0x0074,
		0x0054,
		0x0074,
		0x0054,
		0x0074,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0057,
		0x0077,
		0x0059,
		0x0079,
		0x0059,
		0x005A,
		0x007A,
		0x005A,
		0x007A,
		0x005A,
		0x007A,
		0x0066,
		0x0062,
		0x0042,
		0x0062,
		0x0062,
		0x0062,
		0x0062,
		0x0043,
		0x0043,
		0x0063,
		0x0044,
		0x0044,
		0x0064,
		0x0064,
		0x0071,
		0x0045,
		0x0065,
		0x0045,
		0x0046,
		0x0066,
		0x0047,
		0x0056,
		0x0068,
		0x0069,
		0x0049,
		0x004B,
		0x006B,
		0x0069,
		0x0061,
		0x0077,
		0x004E,
		0x006E,
		0x004F,
		0x004F,
		0x006F,
		0x0071,
		0x0071,
		0x0050,
		0x0070,
		0x0052,
		0x0053,
		0x0073,
		0x0045,
		0x006C,
		0x0074,
		0x0054,
		0x0066,
		0x0054,
		0x0055,
		0x0075,
		0x0055,
		0x0056,
		0x0059,
		0x0079,
		0x005A,
		0x007A,
		0x0033,
		0x0045,
		0x0065,
		0x0033,
		0x0032,
		0x0035,
		0x0035,
		0x0074,
		0x0070,
		0x0069,
		0x0069,
		0x0074,
		0x0069,
		0x0041,
		0x0061,
		0x0049,
		0x0069,
		0x004F,
		0x006F,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0065,
		0x0041,
		0x0061,
		0x0041,
		0x0061,
		0x0041,
		0x0061,
		0x0047,
		0x0067,
		0x0047,
		0x0067,
		0x004B,
		0x006B,
		0x0051,
		0x0071,
		0x0051,
		0x0071,
		0x0033,
		0x0033,
		0x006A,
		0x0047,
		0x0067,
		0x0048,
		0x0050,
		0x004E,
		0x006E,
		0x0041,
		0x0061,
		0x0041,
		0x0061,
		0x004F,
		0x006F,
		0x0041,
		0x0061,
		0x0041,
		0x0061,
		0x0045,
		0x0065,
		0x0045,
		0x0065,
		0x0049,
		0x0069,
		0x0049,
		0x0069,
		0x004F,
		0x006F,
		0x004F,
		0x006F,
		0x0052,
		0x0072,
		0x0052,
		0x0072,
		0x0055,
		0x0075,
		0x0055,
		0x0075,
		0x0053,
		0x0073,
		0x0054,
		0x0074,
		0x0033,
		0x0033,
		0x0048,
		0x0068,
		0x006E,
		0x0064,
		0x0038,
		0x0038,
		0x005A,
		0x007A,
		0x0041,
		0x0061,
		0x0045,
		0x0065,
		0x004F,
		0x006F,
		0x004F,
		0x006F,
		0x004F,
		0x006F,
		0x004F,
		0x006F,
		0x0059,
		0x0079,
		0x006C,
		0x006E,
		0x0074

	};



	/**
	 * Returns the shared {@code LinguisticAnalyzer}, creating and initialising it on first use.
	 * <p>
	 * The instance is (re)built whenever {@code instance} or its {@code analyzer} field is
	 * {@code null}.  That condition is tested once without holding {@code lock} and again
	 * while holding it, so creation is skipped when the second test finds a complete instance.
	 * </p>
	 * @return The single analyzer instance
	 * @throws RuntimeException wrapping the {@link IOException} if initialising the internal analyzer fails
	 */
	public static final LinguisticAnalyzer getInstance()  {
		// Unlocked test: lets callers skip the synchronized block once a complete instance exists.
		// NOTE(review): the declaration of "instance" is not visible in this part of the file;
		// confirm it is volatile so that this unlocked read is safe.
		if(instance == null || instance.analyzer == null) {

		    synchronized(lock) {

		        // Same test again, now under the lock
		        if(instance == null || instance.analyzer == null) {

					try {
			            // Create and initialise the analyzer first; if this throws,
			            // "instance" is left untouched.
			            LexicalTreeAnalyzer analyzer = new LexicalTreeAnalyzer();
			            analyzer.initialize();
			            instance = new LinguisticAnalyzer();
			            instance.analyzer = analyzer;
					}
					catch (IOException e) {
						// Not expected to happen; rethrown unchecked with the original exception as its cause
						throw new RuntimeException(e);
					}
					finally {
					    // NOTE(review): no wait() on "lock" is visible in this part of the file;
					    // confirm whether these notifyAll() calls are still needed.
					    lock.notifyAll();
					}
		        }
		        else
		        {
		            lock.notifyAll();
		        }
		    }
		}
		return instance;
	}

	/**
	 * Scores how likely the given word is to be a "real" word.
	 * @param word The word to score
	 * @param clean If true, the word is first passed through {@link #clean(String)} so that
	 *              aberrant characters are replaced before scoring
	 * @return The value computed by the internal analyzer for the (optionally cleaned) word,
	 *         expected to lie between 0.0 and 1.0
	 */
	public double getWordScore(String word, boolean clean) {
		final String candidate = clean ? clean(word) : word;
		return analyzer.computeWordValue(candidate);
	}

	/**
	 * Scores how likely the given word is to be a "real" word, cleaning it first.
	 * <p>
	 * Equivalent to calling {@link #getWordScore(String, boolean)} with {@code clean} set to true.
	 * </p>
	 * @param word The word to test
	 * @return The score for the cleaned word, expected to lie between 0.0 and 1.0
	 */
	public double getWordScore(String word) {
		final boolean cleanFirst = true;
		return getWordScore(word, cleanFirst);
	}

	/**
	 * Decides whether the given word looks like a real word for the given threshold.
	 * <BR><BR>
	 * The word is accepted when its score is &gt;= threshold.
	 * @param word The word to test
	 * @param threshold The minimum score to accept; should be a value between 0.0 and 1.0
	 * @param clean If true, the word is cleaned before it is scored
	 * @return True if the score of the word reaches the threshold, false otherwise
	 */
	public boolean isWord(String word, double threshold, boolean clean) {
		final double score = getWordScore(word, clean);
		return score >= threshold;
	}

	/**
	 * Decides whether the given word looks like a real word, using {@code DEFAULT_THRESHOLD}
	 * as the minimum score.
	 * @param word The word to test
	 * @param clean If true, the word is cleaned before it is scored
	 * @return True if the score of the word reaches the default threshold, false otherwise
	 */
	public boolean isWord(String word, boolean clean) {
		final double threshold = DEFAULT_THRESHOLD;
		return isWord(word, threshold, clean);
	}

	/**
	 * Decides whether the given word looks like a real word, cleaning it first and using
	 * {@code DEFAULT_THRESHOLD} as the minimum score.
	 * @param word The word to test
	 * @return True if the score of the cleaned word reaches the default threshold, false otherwise
	 */
	public boolean isWord(String word) {
		final boolean cleanFirst = true;
		return isWord(word, DEFAULT_THRESHOLD, cleanFirst);
	}

	/**
	 * Cleans a word by replacing every character with its "best guess" replacement.
	 * <p>
	 * Each character is passed through {@link #getFullReplacement(char)}; characters without a
	 * replacement are kept as they are, so the result always has the same length as the input.
	 * </p>
	 * @param word The word to investigate.  May be null.
	 * @return A new String with aberrant characters replaced, or null if null was passed.
	 */
	public static String clean(String word) {
		if (word == null) {
			return null;
		}

		final int length = word.length();
		final char[] replaced = new char[length];

		for (int pos = 0; pos < length; pos++) {
			replaced[pos] = getFullReplacement(word.charAt(pos));
		}

		return new String(replaced);
	}

	/**
	 * Finds the replacement for an extended (usually non ASCII) character.
	 * <p>
	 * The character is looked up in {@code EXTENDED_UNICODE_SEARCH}; when found, the entry at
	 * the same index of {@code EXTENDED_UNICODE_REPLACE} is returned.
	 * </p>
	 * @param chr The character to replace.  Usually non ASCII
	 * @return The replacement character, or chr itself when the table has no entry for it
	 */
	public static char getExtendedReplacement(char chr) {
		final char[] from = EXTENDED_UNICODE_SEARCH;
		final char[] to = EXTENDED_UNICODE_REPLACE;
		return getReplacement(chr, from, to);
	}

	/**
	 * Does a standard replacement of ASCII characters with ASCII characters.
	 * <P>
	 * This is meant for words that have been deliberately obfuscated by swapping
	 * characters for similar looking ones.
	 * </P>
	 * <P>
	 * For example:  The word: "he||0 w0r|d" should be interpreted as "hello world".
	 * The actual mappings are those held in {@code STANDARD_UNICODE_SEARCH} and
	 * {@code STANDARD_UNICODE_REPLACE}.
	 * </P>
	 * @param chr The standard ASCII character to replace
	 * @return The replacement character, or chr itself when the table has no entry for it
	 */
	public static char getStandardReplacement(char chr) {
		final char[] from = STANDARD_UNICODE_SEARCH;
		final char[] to = STANDARD_UNICODE_REPLACE;
		return getReplacement(chr, from, to);
	}

	/**
	 * Looks for a standard replacement first and, when that leaves the character
	 * unchanged, for an extended replacement.
	 * @param chr The character to replace
	 * @return The replaced character, or chr itself when neither lookup changes it
	 * @see LinguisticAnalyzer#getExtendedReplacement(char)
	 * @see LinguisticAnalyzer#getStandardReplacement(char)
	 */
	public static char getFullReplacement(char chr) {
		final char standard = getStandardReplacement(chr);

		if (standard != chr) {
			return standard;
		}

		// The standard lookup did not change the character, so try the extended one
		return getExtendedReplacement(chr);
	}

	/**
	 * Looks chr up in the search array and returns the entry at the same index of the replace array.
	 * <p>
	 * The lookup is a binary search, so the search array must be sorted in ascending order.
	 * </p>
	 * @param chr The character to look up
	 * @param search The sorted characters that have a replacement
	 * @param replace The replacements, index-aligned with search
	 * @return The replacement for chr, or chr itself when it is not in search
	 */
	private static char getReplacement(char chr, char[] search, char[] replace) {
		final int pos = Arrays.binarySearch(search, chr);
		// binarySearch returns a negative value when the key is absent
		return (pos >= 0) ? replace[pos] : chr;
	}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -