📄 bayesianclassifier.java
字号:
/*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2003 Nick Lothian. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* developers of Classifier4J (http://classifier4j.sf.net/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The name "Classifier4J" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* http://sourceforge.net/users/nicklothian/.
*
* 5. Products derived from this software may not be called
* "Classifier4J", nor may "Classifier4J" appear in their names
* without prior written permission. For written permission, please
* contact http://sourceforge.net/users/nicklothian/.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*/
package net.sf.classifier4J.bayesian;
import java.util.ArrayList;
import java.util.List;
import net.sf.classifier4J.AbstractCategorizedTrainableClassifier;
import net.sf.classifier4J.DefaultStopWordsProvider;
import net.sf.classifier4J.DefaultTokenizer;
import net.sf.classifier4J.ICategorisedClassifier;
import net.sf.classifier4J.IClassifier;
import net.sf.classifier4J.IStopWordProvider;
import net.sf.classifier4J.ITokenizer;
import net.sf.classifier4J.util.ToStringBuilder;
/**
*
* <p>An implementation of {@link net.sf.classifier4J.IClassifier} based on Bayes'
* theorem (see http://www.wikipedia.org/wiki/Bayes_theorem).</p>
*
* <p>The basic usage pattern for this class is:
* <ol>
* <li>Create an instance of {@link net.sf.classifier4J.bayesian.IWordsDataSource}</li>
* <li>Create a new instance of BayesianClassifier, passing the IWordsDataSource
* to the constructor</li>
* <li>Call {@link net.sf.classifier4J.IClassifier#classify(java.lang.String) }
* or {@link net.sf.classifier4J.IClassifier#isMatch(java.lang.String) }</li>
* </ol>
* </p>
*
* <p>For example:<br>
* <tt>
* IWordsDataSource wds = new SimpleWordsDataSource();<br>
* IClassifier classifier = new BayesianClassifier(wds);<br>
* System.out.println( "Matches = " + classifier.classify("This is a sentence") );
* </tt>
* </p>
*
* @author Nick Lothian
* @author Peter Leschev
*
*/
public class BayesianClassifier extends AbstractCategorizedTrainableClassifier {
// Word data source handed to the constructor, which rejects null.
IWordsDataSource wordsData;
// Tokenizer used by the String-based public methods to turn input text into
// the token arrays passed to the String[] overloads; the constructor rejects null.
ITokenizer tokenizer;
// Stop word provider handed to the constructor, which rejects null.
IStopWordProvider stopWordProvider;
// Case-sensitivity flag; defaults to false.
private boolean isCaseSensitive = false;
/**
 * Creates a classifier backed by a new {@link SimpleWordsDataSource}.
 * Tokenizing is done by a {@link net.sf.classifier4J.DefaultTokenizer} set to
 * BREAK_ON_WORD_BREAKS.
 */
public BayesianClassifier() {
    // The single-argument constructor supplies the default tokenizer.
    this(new SimpleWordsDataSource());
}
/**
 * Creates a classifier that reads its word data from the given data source.
 * A {@link net.sf.classifier4J.DefaultTokenizer} set to BREAK_ON_WORD_BREAKS
 * and a new {@link net.sf.classifier4J.DefaultStopWordsProvider} are used.
 *
 * @param wd the {@link net.sf.classifier4J.bayesian.IWordsDataSource} to use
 */
public BayesianClassifier(IWordsDataSource wd) {
    // Go straight to the full constructor with both defaults spelled out.
    this(
        wd,
        new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS),
        new DefaultStopWordsProvider());
}
/**
 * Creates a classifier from the given data source and tokenizer, pairing
 * them with a new {@link net.sf.classifier4J.DefaultStopWordsProvider}.
 *
 * @param wd the {@link net.sf.classifier4J.bayesian.IWordsDataSource} to use
 * @param tokenizer the {@link net.sf.classifier4J.ITokenizer} to use
 */
public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer) {
    // Only the stop word provider is defaulted here; the three-argument
    // constructor performs the null checks.
    this(
        wd,
        tokenizer,
        new DefaultStopWordsProvider());
}
/**
 * Creates a classifier from an explicit data source, tokenizer and stop word
 * provider. The arguments are checked in declaration order and the first
 * <code>null</code> one is reported.
 *
 * @param wd the {@link net.sf.classifier4J.bayesian.IWordsDataSource} to use
 * @param tokenizer the {@link net.sf.classifier4J.ITokenizer} to use
 * @param swp the {@link net.sf.classifier4J.IStopWordProvider} to use
 * @throws IllegalArgumentException if any argument is <code>null</code>
 */
public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp) {
    // Reject bad arguments up front, in the same order as the parameters.
    if (null == wd) {
        throw new IllegalArgumentException("IWordsDataSource can't be null");
    }
    if (null == tokenizer) {
        throw new IllegalArgumentException("ITokenizer can't be null");
    }
    if (null == swp) {
        throw new IllegalArgumentException("IStopWordProvider can't be null");
    }

    // All three collaborators are valid; store them.
    wordsData = wd;
    this.tokenizer = tokenizer;
    stopWordProvider = swp;
}
/**
 * Tokenizes <code>input</code> and delegates to the <code>String[]</code>
 * overload of <code>isMatch</code> for the given category.
 *
 * @param category the category to test against; must not be <code>null</code>
 * @param input the text to test; must not be <code>null</code>
 * @return the result of the <code>String[]</code> overload for the tokenized input
 * @throws IllegalArgumentException if <code>category</code> or <code>input</code>
 *         is <code>null</code>
 * @throws WordsDataSourceException if raised by the delegated call
 * @see net.sf.classifier4J.ICategorisedClassifier#isMatch(java.lang.String, java.lang.String)
 */
public boolean isMatch(String category, String input) throws WordsDataSourceException {
    // Validate before tokenizing, exactly as classify/teachMatch/teachNonMatch
    // do, so a null input is reported here rather than being handed to the
    // tokenizer, whose handling of null is implementation-specific.
    if (category == null) {
        throw new IllegalArgumentException("category cannot be null");
    }
    if (input == null) {
        throw new IllegalArgumentException("input cannot be null");
    }
    return isMatch(category, tokenizer.tokenize(input));
}
/**
 * Validates the arguments, tokenizes <code>input</code> and delegates to the
 * <code>String[]</code> overload of <code>classify</code>.
 *
 * @param category the category to classify against; must not be <code>null</code>
 * @param input the text to classify; must not be <code>null</code>
 * @return the value returned by the <code>String[]</code> overload
 * @throws IllegalArgumentException if <code>category</code> or <code>input</code>
 *         is <code>null</code>
 * @throws WordsDataSourceException if raised by the calls this method makes
 * @see net.sf.classifier4J.ICategorisedClassifier#classify(java.lang.String, java.lang.String)
 */
public double classify(String category, String input) throws WordsDataSourceException {
    if (category == null) {
        throw new IllegalArgumentException("category cannot be null");
    } else if (input == null) {
        throw new IllegalArgumentException("input cannot be null");
    }

    // The category check runs before any tokenizing work is done.
    checkCategoriesSupported(category);

    final String[] tokens = tokenizer.tokenize(input);
    return classify(category, tokens);
}
/**
 * Validates the arguments, tokenizes <code>input</code> and forwards the
 * tokens to the <code>String[]</code> overload of <code>teachMatch</code>.
 *
 * @param category the category being taught; must not be <code>null</code>
 * @param input the text to tokenize; must not be <code>null</code>
 * @throws IllegalArgumentException if <code>category</code> or <code>input</code>
 *         is <code>null</code>
 * @throws WordsDataSourceException if raised by the calls this method makes
 */
public void teachMatch(String category, String input) throws WordsDataSourceException {
    if (null == category) {
        throw new IllegalArgumentException("category cannot be null");
    }
    if (null == input) {
        throw new IllegalArgumentException("input cannot be null");
    }

    // The category check runs before any tokenizing work is done.
    checkCategoriesSupported(category);

    final String[] tokens = tokenizer.tokenize(input);
    teachMatch(category, tokens);
}
/**
 * Validates the arguments, tokenizes <code>input</code> and forwards the
 * tokens to the <code>String[]</code> overload of <code>teachNonMatch</code>.
 *
 * @param category the category being taught; must not be <code>null</code>
 * @param input the text to tokenize; must not be <code>null</code>
 * @throws IllegalArgumentException if <code>category</code> or <code>input</code>
 *         is <code>null</code>
 * @throws WordsDataSourceException if raised by the calls this method makes
 */
public void teachNonMatch(String category, String input) throws WordsDataSourceException {
    if (null == category) {
        throw new IllegalArgumentException("category cannot be null");
    }
    if (null == input) {
        throw new IllegalArgumentException("input cannot be null");
    }

    // The category check runs before any tokenizing work is done.
    checkCategoriesSupported(category);

    final String[] tokens = tokenizer.tokenize(input);
    teachNonMatch(category, tokens);
}
protected boolean isMatch(String category, String input[]) throws WordsDataSourceException {
if (category == null) {
throw new IllegalArgumentException("category cannot be null");
}
if (input == null) {
throw new IllegalArgumentException("input cannot be null");
}
checkCategoriesSupported(category);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -