📄 bayesianclassifier.java

📁 Classifier4J是一个很好的基于Java的分类器,里面有Naive Bayes(朴素贝叶斯)和KNN等方法的文本分类。另外还提供了分词和自动摘要等功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12

        double matchProbability = classify(category, input);

        return (matchProbability >= cutoff);
    }

    /**
     * Scores a set of words against a category.
     * Looks up the per-word probabilities, combines them into one overall
     * probability and then normalises that value.
     *
     * @param category the category to score against
     * @param words the words to score
     * @return the normalised overall probability
     * @throws WordsDataSourceException if the word probabilities cannot be looked up
     */
    protected double classify(String category, String words[]) throws WordsDataSourceException {
        final WordProbability[] wordProbabilities = calcWordsProbability(category, words);
        final double overallProbability = calculateOverallProbability(wordProbabilities);
        return normaliseSignificance(overallProbability);
    }

    /**
     * Records every classifiable word as a match in the words data source.
     * When the data source is an {@link ICategorisedWordsDataSource} the match is
     * recorded against the given category; otherwise the category is not passed on.
     *
     * @param category the category the words match
     * @param words the words to record; non-classifiable entries are skipped
     * @throws WordsDataSourceException if the data source fails to record a match
     */
    protected void teachMatch(String category, String words[]) throws WordsDataSourceException {
        // Resolve the category-aware view of the data source once; null means "not categorised".
        final ICategorisedWordsDataSource categorisedSource =
            (wordsData instanceof ICategorisedWordsDataSource) ? (ICategorisedWordsDataSource) wordsData : null;

        for (int index = 0; index < words.length; index++) {
            final String candidate = words[index];
            if (!isClassifiableWord(candidate)) {
                continue;
            }

            final String transformed = transformWord(candidate);
            if (categorisedSource != null) {
                categorisedSource.addMatch(category, transformed);
            } else {
                wordsData.addMatch(transformed);
            }
        }
    }

    /**
     * Records every classifiable word as a non-match in the words data source.
     * When the data source is an {@link ICategorisedWordsDataSource} the non-match
     * is recorded against the given category; otherwise the category is not passed on.
     *
     * @param category the category the words do not match
     * @param words the words to record; non-classifiable entries are skipped
     * @throws WordsDataSourceException if the data source fails to record a non-match
     */
    protected void teachNonMatch(String category, String words[]) throws WordsDataSourceException {
        // Resolve the category-aware view of the data source once; null means "not categorised".
        final ICategorisedWordsDataSource categorisedSource =
            (wordsData instanceof ICategorisedWordsDataSource) ? (ICategorisedWordsDataSource) wordsData : null;

        for (int index = 0; index < words.length; index++) {
            final String candidate = words[index];
            if (!isClassifiableWord(candidate)) {
                continue;
            }

            final String transformed = transformWord(candidate);
            if (categorisedSource != null) {
                categorisedSource.addNonMatch(category, transformed);
            } else {
                wordsData.addNonMatch(transformed);
            }
        }
    }

    /**
     * Hook for transforming a word before it is stored or looked up.
     * This implementation lower-cases the word when the classifier is in
     * case-insensitive mode and returns it unchanged otherwise.
     *
     * @param word the word to transform; must not be null
     * @return the transformed word
     * @throws IllegalArgumentException if a null is passed
     */
    protected String transformWord(String word) {
        // Reject null up front so the happy path below reads straight through.
        if (word == null) {
            throw new IllegalArgumentException("Null cannot be passed");
        }

        return isCaseSensitive ? word : word.toLowerCase();
    }

    /**
     * Combines the individual word probabilities into one overall probability
     * using <code>xy / (xy + z)</code>, where <code>xy</code> is the product of
     * all word probabilities and <code>z</code> is the product of their
     * complements <code>(1 - p)</code>.
     *
     * NOTE: Override this method with care. There is a good chance it will be removed
     * or have signature changes in later versions.
     *
     * <br />
     * @todo need an option to only use the "X" most "important" words when calculating overall probability
     * "important" is defined as being most distant from NEUTRAL_PROBABILITY
     *
     * @param wps the word probabilities to combine; may be null or empty
     * @return the combined probability, or {@link IClassifier#NEUTRAL_PROBABILITY}
     * when there is nothing to combine or when both products are zero
     */
    protected double calculateOverallProbability(WordProbability[] wps) {
        if (wps == null || wps.length == 0) {
            return IClassifier.NEUTRAL_PROBABILITY;
        }

        // we need to calculate xy/(xy + z)
        // where z = (1-x)(1-y)

        // Both products start at the multiplicative identity. Using 0 as a
        // "not yet initialised" marker would make a genuine zero product (a word
        // probability of exactly 0 or 1, or floating-point underflow) look
        // uninitialised and silently restart the accumulation part-way through.
        double z = 1d;
        double xy = 1d;
        for (int i = 0; i < wps.length; i++) {
            // assumes getProbability() is a plain accessor, so reading it once is equivalent
            double probability = wps[i].getProbability();
            z = z * (1 - probability);
            xy = xy * probability;
        }

        double denominator = xy + z;
        if (denominator == 0d) {
            // Both products are zero: the evidence is contradictory or has underflowed.
            // 0/0 would be NaN, so report "no opinion" instead.
            return IClassifier.NEUTRAL_PROBABILITY;
        }

        return xy / denominator;
    }

    /**
     * Looks up the probability of each classifiable word in the words data source.
     * Words that are not classifiable, and words for which the data source
     * returns null, are left out of the result.
     *
     * @param category the category to look the words up in; must not be null
     * @param words the words to look up; null yields an empty array
     * @return the probabilities that were found, in input order
     * @throws IllegalArgumentException if category is null, or is a non-default
     * category and the data source does not support categories
     * @throws WordsDataSourceException if the data source lookup fails
     */
    private WordProbability[] calcWordsProbability(String category, String[] words) throws WordsDataSourceException {
        if (category == null) {
            throw new IllegalArgumentException("category cannont be null");
        }

        // Resolve the category-aware view of the data source once; null means "not categorised".
        final ICategorisedWordsDataSource categorisedSource =
            (wordsData instanceof ICategorisedWordsDataSource) ? (ICategorisedWordsDataSource) wordsData : null;

        checkCategoriesSupported(category);

        if (words == null) {
            return new WordProbability[0];
        }

        final List found = new ArrayList();
        for (int index = 0; index < words.length; index++) {
            final String candidate = words[index];
            if (!isClassifiableWord(candidate)) {
                continue;
            }

            final String transformed = transformWord(candidate);
            final WordProbability probability = (categorisedSource != null)
                ? categorisedSource.getWordProbability(category, transformed)
                : wordsData.getWordProbability(transformed);

            if (probability != null) {
                found.add(probability);
            }
        }

        return (WordProbability[]) found.toArray(new WordProbability[found.size()]);
    }

    /**
     * Verifies that the words data source can handle the given category.
     * The default category is always accepted; any other category requires an
     * {@link ICategorisedWordsDataSource}.
     *
     * @param category the category to check
     * @throws IllegalArgumentException if the category is not the default one and
     * the data source does not support categories
     */
    private void checkCategoriesSupported(String category) {
        // The default category works with every data source.
        if (ICategorisedClassifier.DEFAULT_CATEGORY.equals(category)) {
            return;
        }

        // Any other category is fine as long as the data source is category-aware.
        if (wordsData instanceof ICategorisedWordsDataSource) {
            return;
        }

        throw new IllegalArgumentException("Word Data Source does not support non-default categories.");
    }

    /**
     * Decides whether a word should take part in teaching and classification.
     *
     * @param word the word to test
     * @return false for null, the empty string and stop words; true otherwise
     */
    private boolean isClassifiableWord(String word) {
        // Evaluation order matters: the stop word provider is only consulted for non-null, non-empty words.
        return word != null
            && !"".equals(word)
            && !stopWordProvider.isStopWord(word);
    }

    /**
     * Clamps a significance value into the range
     * [{@link IClassifier#LOWER_BOUND}, {@link IClassifier#UPPER_BOUND}].
     * The comparison uses {@link Double#compare(double, double)}, which orders
     * NaN above every other value, so NaN is reported as the upper bound.
     *
     * @param sig the value to normalise
     * @return the upper bound if sig is above it, the lower bound if sig is
     * below it, otherwise sig unchanged
     */
    protected static double normaliseSignificance(double sig) {
        final boolean aboveUpperBound = Double.compare(sig, IClassifier.UPPER_BOUND) > 0;
        if (aboveUpperBound) {
            return IClassifier.UPPER_BOUND;
        }

        final boolean belowLowerBound = Double.compare(sig, IClassifier.LOWER_BOUND) < 0;
        if (belowLowerBound) {
            return IClassifier.LOWER_BOUND;
        }

        return sig;
    }
    /**
     * Reports whether this classifier distinguishes upper and lower case.
     *
     * @return true if the classifier is case sensitive, false otherwise
     * (false by default)
     */
    public boolean isCaseSensitive() {
        return this.isCaseSensitive;
    }

    /**
     * Switches case sensitivity on or off.
     *
     * @param b true if the classifier should be case sensitive, false otherwise
     */
    public void setCaseSensitive(boolean b) {
        this.isCaseSensitive = b;
    }

    /**
     * Gives access to the words data source backing this classifier.
     *
     * @return the {@link net.sf.classifier4J.bayesian.IWordsDataSource} used
     * by this classifier
     */
    public IWordsDataSource getWordsDataSource() {
        return this.wordsData;
    }

    /**
     * Gives access to the tokenizer configured on this classifier.
     *
     * @return the {@link net.sf.classifier4J.ITokenizer} used
     * by this classifier
     */
    public ITokenizer getTokenizer() {
        return this.tokenizer;
    }

    /**
     * Gives access to the stop word provider configured on this classifier.
     *
     * @return the {@link net.sf.classifier4J.IStopWordProvider} used
     * by this classifier
     */
    public IStopWordProvider getStopWordProvider() {
        return this.stopWordProvider;
    }

    /**
     * Builds a description of this classifier listing its words data source,
     * tokenizer and stop word provider.
     *
     * @return the string produced by the ToStringBuilder
     */
    public String toString() {
        ToStringBuilder description = new ToStringBuilder(this);
        description = description.append("IWordsDataSource", wordsData);
        description = description.append("ITokenizer", tokenizer);
        description = description.append("IStopWordProvider", stopWordProvider);
        return description.toString();
    }

}
上一页 12
💿 文件大小 710 K
👤 上传用户 sunny_02
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#Classifier4J #Native #bayes #java
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -