📄 bayesianclassifier.java
字号:
double matchProbability = classify(category, input);
return (matchProbability >= cutoff);
}
/**
 * Computes the probability that the given words belong to the category.
 * The per-word probabilities are looked up, combined into a single
 * probability and finally clamped by {@link #normaliseSignificance(double)}.
 *
 * @param category the category to classify against
 * @param words the tokens to classify
 * @return the normalised overall match probability
 * @throws WordsDataSourceException if the word probability lookup fails
 */
protected double classify(String category, String words[]) throws WordsDataSourceException {
    WordProbability[] wordProbabilities = calcWordsProbability(category, words);
    double combined = calculateOverallProbability(wordProbabilities);
    return normaliseSignificance(combined);
}
/**
 * Records every classifiable word as a match in the words data source.
 * If the data source implements {@link ICategorisedWordsDataSource} the
 * word is recorded against the category; otherwise the category is not
 * passed on.
 *
 * @param category the category the words are a match for
 * @param words the tokens to learn from
 * @throws WordsDataSourceException if the data source rejects an update
 */
protected void teachMatch(String category, String words[]) throws WordsDataSourceException {
    final boolean supportsCategories = wordsData instanceof ICategorisedWordsDataSource;
    for (int idx = 0; idx < words.length; idx++) {
        String candidate = words[idx];
        // Null, empty and stop words carry no signal and are skipped.
        if (!isClassifiableWord(candidate)) {
            continue;
        }
        String transformed = transformWord(candidate);
        if (supportsCategories) {
            ((ICategorisedWordsDataSource) wordsData).addMatch(category, transformed);
        } else {
            wordsData.addMatch(transformed);
        }
    }
}
/**
 * Records every classifiable word as a non-match in the words data source.
 * If the data source implements {@link ICategorisedWordsDataSource} the
 * word is recorded against the category; otherwise the category is not
 * passed on.
 *
 * @param category the category the words are NOT a match for
 * @param words the tokens to learn from
 * @throws WordsDataSourceException if the data source rejects an update
 */
protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {
    final boolean supportsCategories = wordsData instanceof ICategorisedWordsDataSource;
    for (int idx = 0; idx < words.length; idx++) {
        String candidate = words[idx];
        // Null, empty and stop words carry no signal and are skipped.
        if (!isClassifiableWord(candidate)) {
            continue;
        }
        String transformed = transformWord(candidate);
        if (supportsCategories) {
            ((ICategorisedWordsDataSource) wordsData).addNonMatch(category, transformed);
        } else {
            wordsData.addNonMatch(transformed);
        }
    }
}
/**
 * Allows transformations to be done to a word before it is stored in, or
 * looked up from, the words data source.
 * This implementation transforms the word to lowercase if the classifier
 * is in case-insensitive mode, and returns it unchanged otherwise.
 *
 * @param word the word to transform; must not be null
 * @return the transformed word
 * @throws IllegalArgumentException if a null is passed
 */
protected String transformWord(String word) {
    if (word == null) {
        throw new IllegalArgumentException("Null cannot be passed");
    }
    if (isCaseSensitive) {
        return word;
    }
    // Use an explicit locale: the no-arg toLowerCase() follows the JVM default
    // locale, so the same word could be stored under different keys on
    // differently-configured machines (e.g. "I" lowercases to a dotless i
    // under a Turkish default locale). Locale.ENGLISH applies no
    // locale-specific case-mapping rules.
    return word.toLowerCase(java.util.Locale.ENGLISH);
}
/**
 * Combines the individual word probabilities into one overall probability
 * using <code>xy / (xy + z)</code>, where <code>xy</code> is the product of
 * all word probabilities and <code>z</code> is the product of their
 * complements <code>(1 - p)</code>.
 *
 * NOTE: Override this method with care. There is a good chance it will be removed
 * or have signature changes in later versions.
 *
 * <br />
 * @todo need an option to only use the "X" most "important" words when calculating overall probability
 * "important" is defined as being most distant from NEUTRAL_PROBABILITY
 *
 * @param wps the word probabilities to combine; may be null or empty
 * @return {@link IClassifier#NEUTRAL_PROBABILITY} if there is nothing to
 *         combine or the combination is undefined (0/0), otherwise the
 *         combined probability
 */
protected double calculateOverallProbability(WordProbability[] wps) {
    if (wps == null || wps.length == 0) {
        return IClassifier.NEUTRAL_PROBABILITY;
    }

    // we need to calculate xy/(xy + z)
    // where z = (1-x)(1-y)
    //
    // Both running products start at the multiplicative identity. Starting
    // at 0 and treating "== 0" as "not yet initialised" would restart a
    // product that legitimately reached 0 (a probability of exactly 0 or 1,
    // or floating point underflow on long inputs) from the next word on.
    double z = 1d;
    double xy = 1d;
    for (int i = 0; i < wps.length; i++) {
        double probability = wps[i].getProbability();
        z *= (1 - probability);
        xy *= probability;
    }

    double denominator = xy + z;
    if (denominator == 0d) {
        // Both products collapsed to zero; 0/0 would be NaN, so report
        // "no opinion" instead.
        return IClassifier.NEUTRAL_PROBABILITY;
    }
    return xy / denominator;
}
/**
 * Looks up the probability of each classifiable word in the words data
 * source. Words for which the data source returns null are left out of
 * the result.
 *
 * @param category the category to look the words up in; must not be null
 * @param words the tokens to look up; null yields an empty array
 * @return the probabilities found, in the order of the input words
 * @throws IllegalArgumentException if category is null, or is not the
 *         default category while the data source does not support categories
 * @throws WordsDataSourceException if the data source lookup fails
 */
private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {
    if (category == null) {
        throw new IllegalArgumentException("category cannont be null");
    }
    checkCategoriesSupported(category);

    if (words == null) {
        return new WordProbability[0];
    }

    final boolean supportsCategories = wordsData instanceof ICategorisedWordsDataSource;
    List found = new ArrayList();
    for (int idx = 0; idx < words.length; idx++) {
        String candidate = words[idx];
        if (!isClassifiableWord(candidate)) {
            continue;
        }
        String transformed = transformWord(candidate);
        WordProbability probability = supportsCategories
                ? ((ICategorisedWordsDataSource) wordsData).getWordProbability(category, transformed)
                : wordsData.getWordProbability(transformed);
        if (probability != null) {
            found.add(probability);
        }
    }
    return (WordProbability[]) found.toArray(new WordProbability[found.size()]);
}
/**
 * Verifies that the given category can be handled by the configured words
 * data source. Any category other than the default one requires a data
 * source that implements {@link ICategorisedWordsDataSource}.
 *
 * @param category the category to check
 * @throws IllegalArgumentException if a non-default category is used with
 *         a data source that does not support categories
 */
private void checkCategoriesSupported(String category) {
    boolean isDefaultCategory = ICategorisedClassifier.DEFAULT_CATEGORY.equals(category);
    boolean supportsCategories = wordsData instanceof ICategorisedWordsDataSource;
    if (!isDefaultCategory && !supportsCategories) {
        throw new IllegalArgumentException("Word Data Source does not support non-default categories.");
    }
}
/**
 * Decides whether a word should take part in teaching and classification.
 *
 * @param word the word to check
 * @return false for null, the empty string, or a word the stop word
 *         provider reports as a stop word; true otherwise
 */
private boolean isClassifiableWord(String word) {
    return word != null
            && !"".equals(word)
            && !stopWordProvider.isStopWord(word);
}
/**
 * Clamps a significance value to the range bounded by
 * {@link IClassifier#LOWER_BOUND} and {@link IClassifier#UPPER_BOUND}.
 * Ordering is decided by {@link Double#compare(double, double)}.
 *
 * @param sig the value to clamp
 * @return the upper bound if sig orders above it, the lower bound if sig
 *         orders below it, otherwise sig itself
 */
protected static double normaliseSignificance(double sig) {
    if (Double.compare(sig, IClassifier.UPPER_BOUND) > 0) {
        return IClassifier.UPPER_BOUND;
    }
    if (Double.compare(sig, IClassifier.LOWER_BOUND) < 0) {
        return IClassifier.LOWER_BOUND;
    }
    return sig;
}
/**
 * Reports whether this classifier distinguishes upper and lower case.
 *
 * @return true if the classifier is case sensitive, false otherwise
 * (false by default)
 */
public boolean isCaseSensitive() {
    return this.isCaseSensitive;
}
/**
 * Switches case sensitivity of this classifier on or off.
 *
 * @param b true if the classifier should be case sensitive, false otherwise
 */
public void setCaseSensitive(boolean b) {
    this.isCaseSensitive = b;
}
/**
 * Gives access to the words data source backing this classifier.
 *
 * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used
 * by this classifier
 */
public IWordsDataSource getWordsDataSource() {
    return this.wordsData;
}
/**
 * Gives access to the tokenizer configured for this classifier.
 *
 * @return the {@link net.sf.classifier4J.ITokenizer} used
 * by this classifier
 */
public ITokenizer getTokenizer() {
    return this.tokenizer;
}
/**
 * Gives access to the stop word provider configured for this classifier.
 *
 * @return the {@link net.sf.classifier4J.IStopWordProvider} used
 * by this classifier
 */
public IStopWordProvider getStopWordProvider() {
    return this.stopWordProvider;
}
/**
 * Builds a string representation listing the words data source, the
 * tokenizer and the stop word provider of this classifier.
 *
 * @return the string produced by the {@code ToStringBuilder}
 */
public String toString() {
    ToStringBuilder description = new ToStringBuilder(this);
    description.append("IWordsDataSource", wordsData);
    description.append("ITokenizer", tokenizer);
    description.append("IStopWordProvider", stopWordProvider);
    return description.toString();
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -