📄 linguisticanalyzer.java
字号:
0x0055, // U
0x0059, // Y
0x0070, // p
0x0042, // B
0x0061, // a
0x0061, // a
0x0061, // a
0x0061, // a
0x0061, // a
0x0061, // a
0x0061, // a
0x0063, // c
0x0065, // e
0x0065, // e
0x0065, // e
0x0065, // e
0x0069, // i
0x0069, // i
0x0069, // i
0x0069, // i
0x006F, // o
0x006E, // n
0x006F, // o
0x006F, // o
0x006F, // o
0x006F, // o
0x006F, // o
0x006F, // o
0x0075, // u
0x0075, // u
0x0075, // u
0x0075, // u
0x0079, // y
0x0070, // p
0x0079, // y
0x0041,
0x0061,
0x0041,
0x0061,
0x0041,
0x0061,
0x0043,
0x0063,
0x0043,
0x0063,
0x0043,
0x0063,
0x0043,
0x0063,
0x0044,
0x0064,
0x0044,
0x0064,
0x0045,
0x0065,
0x0045,
0x0065,
0x0045,
0x0065,
0x0045,
0x0065,
0x0045,
0x0065,
0x0047,
0x0067,
0x0047,
0x0067,
0x0047,
0x0067,
0x0047,
0x0067,
0x0048,
0x0068,
0x0048,
0x0068,
0x0049,
0x0069,
0x0049,
0x0069,
0x0049,
0x0069,
0x0049,
0x0069,
0x0049,
0x0069,
0x004A,
0x006A,
0x004B,
0x006B,
0x004B,
0x004C,
0x006C,
0x004C,
0x006C,
0x004C,
0x006C,
0x004C,
0x006C,
0x004C,
0x006C,
0x004E,
0x006E,
0x004E,
0x006E,
0x004E,
0x006E,
0x006E,
0x004E,
0x006E,
0x004F,
0x006F,
0x004F,
0x006F,
0x004F,
0x006F,
0x0052,
0x0072,
0x0052,
0x0072,
0x0052,
0x0072,
0x0053,
0x0073,
0x0053,
0x0073,
0x0053,
0x0073,
0x0053,
0x0073,
0x0054,
0x0074,
0x0054,
0x0074,
0x0054,
0x0074,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0057,
0x0077,
0x0059,
0x0079,
0x0059,
0x005A,
0x007A,
0x005A,
0x007A,
0x005A,
0x007A,
0x0066,
0x0062,
0x0042,
0x0062,
0x0062,
0x0062,
0x0062,
0x0043,
0x0043,
0x0063,
0x0044,
0x0044,
0x0064,
0x0064,
0x0071,
0x0045,
0x0065,
0x0045,
0x0046,
0x0066,
0x0047,
0x0056,
0x0068,
0x0069,
0x0049,
0x004B,
0x006B,
0x0069,
0x0061,
0x0077,
0x004E,
0x006E,
0x004F,
0x004F,
0x006F,
0x0071,
0x0071,
0x0050,
0x0070,
0x0052,
0x0053,
0x0073,
0x0045,
0x006C,
0x0074,
0x0054,
0x0066,
0x0054,
0x0055,
0x0075,
0x0055,
0x0056,
0x0059,
0x0079,
0x005A,
0x007A,
0x0033,
0x0045,
0x0065,
0x0033,
0x0032,
0x0035,
0x0035,
0x0074,
0x0070,
0x0069,
0x0069,
0x0074,
0x0069,
0x0041,
0x0061,
0x0049,
0x0069,
0x004F,
0x006F,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0055,
0x0075,
0x0065,
0x0041,
0x0061,
0x0041,
0x0061,
0x0041,
0x0061,
0x0047,
0x0067,
0x0047,
0x0067,
0x004B,
0x006B,
0x0051,
0x0071,
0x0051,
0x0071,
0x0033,
0x0033,
0x006A,
0x0047,
0x0067,
0x0048,
0x0050,
0x004E,
0x006E,
0x0041,
0x0061,
0x0041,
0x0061,
0x004F,
0x006F,
0x0041,
0x0061,
0x0041,
0x0061,
0x0045,
0x0065,
0x0045,
0x0065,
0x0049,
0x0069,
0x0049,
0x0069,
0x004F,
0x006F,
0x004F,
0x006F,
0x0052,
0x0072,
0x0052,
0x0072,
0x0055,
0x0075,
0x0055,
0x0075,
0x0053,
0x0073,
0x0054,
0x0074,
0x0033,
0x0033,
0x0048,
0x0068,
0x006E,
0x0064,
0x0038,
0x0038,
0x005A,
0x007A,
0x0041,
0x0061,
0x0045,
0x0065,
0x004F,
0x006F,
0x004F,
0x006F,
0x004F,
0x006F,
0x004F,
0x006F,
0x0059,
0x0079,
0x006C,
0x006E,
0x0074
};
/**
* Returns the shared analyzer instance, creating it on first use.
* <p>
* When no instance exists yet, or the existing instance has no internal analyzer,
* a new LexicalTreeAnalyzer is created and initialised and a new instance wrapping
* it is stored. The condition is checked once without holding the lock and again
* while holding it.
* </p>
* @return The single analyzer instance
* @throws RuntimeException wrapping the IOException if one is raised while the
* internal analyzer is being created or initialised
*/
public static final LinguisticAnalyzer getInstance() {
// First check, made without holding the lock.
// NOTE(review): this unlocked read is only safe if the instance field is declared
// volatile; the declaration is not visible in this part of the file - confirm.
if(instance == null || instance.analyzer == null) {
synchronized(lock) {
// Second check under the lock: another thread may have completed the setup
// between the first check and the lock being acquired.
if(instance == null || instance.analyzer == null) {
try {
// Create and initialise the analyzer before touching the instance field, so a
// failure in either step leaves the previous value of the field in place.
LexicalTreeAnalyzer analyzer = new LexicalTreeAnalyzer();
analyzer.initialize();
instance = new LinguisticAnalyzer();
instance.analyzer = analyzer;
}
catch (IOException e) {
// Not expected in practice; rethrown unchecked so callers need no throws clause.
throw new RuntimeException(e);
}
finally {
// NOTE(review): no wait() on the lock is visible in this part of the file;
// confirm a waiter exists, otherwise this call has no effect.
lock.notifyAll();
}
}
else
{
// Nothing to create; see the review note on the other notifyAll() call.
lock.notifyAll();
}
}
}
return instance;
}
/**
 * Scores how likely the supplied text is to be a "real" word.
 *
 * @param word  The word to score
 * @param clean If true, the word is first passed through {@code clean(String)}, which
 *              replaces aberrant characters, before it is scored
 * @return The value computed by the internal analyzer; intended to lie between 0.0 and 1.0
 */
public double getWordScore(String word, boolean clean) {
    // Pick the text to score up front so the analyzer call appears exactly once.
    final String candidate = clean ? clean(word) : word;
    return analyzer.computeWordValue(candidate);
}
/**
 * Scores how likely the supplied text is to be a "real" word, cleaning it first.
 * <p>
 * Equivalent to calling {@code getWordScore(word, true)}.
 * </p>
 * @param word The word to test
 * @return The value computed by the internal analyzer; intended to lie between 0.0 and 1.0
 */
public double getWordScore(String word) {
    // This overload always cleans the word before scoring.
    final boolean cleanFirst = true;
    return getWordScore(word, cleanFirst);
}
/**
 * Decides whether the given text passes as a word at the supplied threshold.
 * <p>
 * The text is scored with {@code getWordScore(String, boolean)}; the result is
 * {@code true} when that score is greater than or equal to {@code threshold}.
 * </p>
 * @param word      The word to test
 * @param threshold Should be a value between 0.0 and 1.0
 * @param clean     Forwarded to the scoring call; if true the word is cleaned before scoring
 * @return True if the score reaches the threshold, false otherwise
 */
public boolean isWord(String word, double threshold, boolean clean) {
    final double score = getWordScore(word, clean);
    return score >= threshold;
}
/**
 * Decides whether the given text passes as a word at the class default threshold,
 * {@code DEFAULT_THRESHOLD} (documented as 0.1).
 *
 * @param word  The word to test
 * @param clean If true, the word is cleaned (aberrant characters replaced) before scoring
 * @return True if the String passed looks like a word, false otherwise
 */
public boolean isWord(String word, boolean clean) {
    final double threshold = DEFAULT_THRESHOLD;
    return isWord(word, threshold, clean);
}
/**
 * Decides whether the given text passes as a word at the class default threshold,
 * {@code DEFAULT_THRESHOLD} (documented as 0.1), cleaning the word first.
 *
 * @param word The word to test
 * @return True if the String passed looks like a word, false otherwise
 */
public boolean isWord(String word) {
    // This overload always cleans the word before scoring.
    final boolean cleanFirst = true;
    return isWord(word, DEFAULT_THRESHOLD, cleanFirst);
}
/**
 * Cleans a word by substituting every character with its replacement from the
 * analyzer's replacement tables.
 * <p>
 * Each character is passed through {@code getFullReplacement(char)}; characters with
 * no replacement are kept as they are, so the result has the same length as the input.
 * </p>
 * @param word The word to investigate; may be null
 * @return The same word with aberrant characters replaced, or null if {@code word} was null
 */
public static String clean(String word) {
    if (word == null) {
        return null;
    }
    final int length = word.length();
    final StringBuilder cleaned = new StringBuilder(length);
    for (int pos = 0; pos < length; pos++) {
        cleaned.append(getFullReplacement(word.charAt(pos)));
    }
    return cleaned.toString();
}
/**
 * Looks up the replacement for a character in the extended replacement table.
 * <p>
 * The character is searched for in {@code EXTENDED_UNICODE_SEARCH}; when found, the
 * entry at the same position in {@code EXTENDED_UNICODE_REPLACE} is returned.
 * </p>
 * @param chr The character to replace. Usually non ASCII
 * @return The replacement character, or {@code chr} itself when the table has no entry for it
 */
public static char getExtendedReplacement(char chr) {
    final char replacement = getReplacement(chr, EXTENDED_UNICODE_SEARCH, EXTENDED_UNICODE_REPLACE);
    return replacement;
}
/**
 * Performs a standard replacement of ASCII characters with ASCII characters.
 * <p>
 * This is meant for words that have been deliberately obfuscated by swapping
 * letters for similar looking characters.
 * </p>
 * <p>
 * For example: The word: "he||0 w0r|d" should be interpreted as "hello world"
 * </p>
 * <p>
 * The character is searched for in {@code STANDARD_UNICODE_SEARCH}; when found, the
 * entry at the same position in {@code STANDARD_UNICODE_REPLACE} is returned.
 * </p>
 * @param chr The standard ASCII character to replace
 * @return The replacement character, or {@code chr} itself when the table has no entry for it
 */
public static char getStandardReplacement(char chr) {
    final char replacement = getReplacement(chr, STANDARD_UNICODE_SEARCH, STANDARD_UNICODE_REPLACE);
    return replacement;
}
/**
 * Looks for a standard replacement first and falls back to the extended
 * replacement when the standard lookup leaves the character unchanged.
 *
 * @param chr The character to replace
 * @return The replaced character, or {@code chr} itself when neither lookup changes it
 * @see LinguisticAnalyzer#getExtendedReplacement(char)
 * @see LinguisticAnalyzer#getStandardReplacement(char)
 */
public static char getFullReplacement(char chr) {
    final char standard = getStandardReplacement(chr);
    // An unchanged character means the standard lookup had nothing to offer.
    return (standard != chr) ? standard : getExtendedReplacement(chr);
}
/**
 * Looks a character up in a search table and returns the entry at the same
 * position in a parallel replacement table.
 *
 * @param chr     The character to look up
 * @param search  The characters that have a replacement; binary search is used, so
 *                this array must be sorted in ascending order
 * @param replace The replacements, indexed in parallel with {@code search}
 * @return The replacement for {@code chr}, or {@code chr} itself when it is not in {@code search}
 */
private static char getReplacement(char chr, char[] search, char[] replace) {
    // binarySearch yields a negative value when the key is absent.
    final int position = Arrays.binarySearch(search, chr);
    return (position >= 0) ? replace[position] : chr;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -