📄 binarylmclassifier.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.classify;import com.aliasi.lm.LanguageModel;import com.aliasi.lm.UniformBoundaryLM;import com.aliasi.lm.UniformProcessLM;import com.aliasi.stats.MultivariateEstimator;/** * A <code>BinaryLMClassifier</code> is a boolean dynamic language * model classifier based on a single language model and cross-entropy * threshold. It defines two categories, accept and reject, with * acceptance determined by measuring sample cross-entropy rate in a * language model against a threshold. As a language model * classifier, the multivariate category estimator is uniform, the * accepting language model is dynamic, and the rejecting language * model is constant. * * <P>As an instance of language model classifier, this class provides * scores that are adjusted per-character average log probabilities, * which are roughly negative sample cross-entropy rates (see {@link * LMClassifier}). The accepting language model behaves in the usual * way. The rejecting language model provides a constant * per-character log estimate. The uniform rejecting model is defined * to be a boundary uniform lanuage model if the specified model is a * sequence language model and a process uniform language model * otherwise. * * <P>Training events may be supplied in the same way as for the * superclass {@link DynamicLMClassifier}, with two caveats. First, * the multivariate category model remains uniform and thus does not * contribute to classification. Second, training events for the * rejection category are ignored. Thus only the language model for * the accepting category is trained. The broader interface is * implemented without exceptions in order to allow binary classifiers * to be plugged in for ones with explicit rejection models. * * <P>Instances of this class are compilable as instances of their * superclass. The resulting object read back in will be an instance * of {@link LMClassifier}, not of this class, but its classification * behavior will be identical. * * <P>Resetting category language models is not allowed for binary * language model classifiers, because they only contain one model and * all else is constant. * * <P>Binary langauge model classifiers are concurrent-read and * single-write thread safe. The only write operation is training the * accepting category. Classification and compilation are reads. If * the language model underlying this classifier is not thread safe, * then reads may not be called concurrently. * * @author Bob Carpenter * @version 3.0 * @since LingPipe2.0 */public class BinaryLMClassifier extends DynamicLMClassifier<LanguageModel.Dynamic> { private final String mAcceptCategory; private final String mRejectCategory; /** * Construct a binary character sequence classifier that accepts * or rejects inputs based on their cross-entropy being above or * below a fixed cross-entropy threshold. If an input is accepted * the best category will be {@link #DEFAULT_ACCEPT_CATEGORY}, * otherwise it will be {@link #DEFAULT_REJECT_CATEGORY}. The * labels of the categories can be reversed in order to build a * rejector or changed altogether with the four-argument * constructor. See the class documentation for more information * on training, classification and compilation. * * @param acceptingLM The language model that determines * whether an input is accepted or rejected. * @param crossEntropyThreshold Maximum cross-entropy against a * model to accept the input. */ public BinaryLMClassifier(LanguageModel.Dynamic acceptingLM, double crossEntropyThreshold) { this(acceptingLM,crossEntropyThreshold, DEFAULT_ACCEPT_CATEGORY, DEFAULT_REJECT_CATEGORY); } /** * Construct a binary character sequence classifier that accepts * or rejects inputs based on their cross-entropy being above or * below a fixed cross-entropy threshold. If an input is accepted * the best category will be the specified accept category, * otherwise it will be the specified reject category. See the * class documentation for more information on training, * classification and compilation. * * @param acceptingLM The language model that determines * whether an input is accepted or rejected. * @param crossEntropyThreshold Maximum cross-entropy against a * model to accept the input. * @param acceptCategory Category label for matching input. * @param rejectCategory Category label for rejecting input. */ public BinaryLMClassifier(LanguageModel.Dynamic acceptingLM, double crossEntropyThreshold, String acceptCategory, String rejectCategory) { super(new String[] { rejectCategory, acceptCategory }, new LanguageModel.Dynamic[] { createRejectLM(crossEntropyThreshold, acceptingLM), acceptingLM }); mAcceptCategory = acceptCategory; mRejectCategory = rejectCategory; // set up to uniform distribution categoryEstimator().train(acceptCategory,1); categoryEstimator().train(rejectCategory,1); } /** * Returns the category assigned to matching/accepted cases. * * @return The acceptance category. */ public String acceptCategory() { return mAcceptCategory; } /** * Returns the category assigned to non-matching/rejected cases. * * @return The rejection category. */ public String rejectCategory() { return mRejectCategory; } /** * If the specified category is the accept catgory, train the * underlying language model. If the category is the reject * category, only the category distribution is trained. Either way, the multivariate * category estimate is not updated. * * @throws IllegalArgumentException If the category is unknown. */ public void train(String category, char[] cs, int start, int end) { super.train(category,cs,start,end); } /** * If the specified category is the accept catgory, train the * underlying language model. If the category is the reject * category, ignore the call. Either way, the multivariate * category estimate is not updated. * * @param category Category of this training sample. * @param cSeq Char sequence for this training sample. * @throws IllegalArgumentException If the category is unknown. */ public void train(String category, CharSequence cSeq) { lmForCategory(mAcceptCategory).train(cSeq); } /** * Throws an {@link UnsupportedOperationException}. * * @param category Ignored. * @param lm Ignored. * @param newCount Ignored. * @throws UnsupportedOperationException Always. */ public void resetCategory(String category, LanguageModel.Dynamic lm, int newCount) { String msg = "Resets not allowed for Binary LM classifier."; throw new UnsupportedOperationException(msg); } static LanguageModel.Dynamic createRejectLM(double crossEntropyThreshold, LanguageModel acceptingLM) { if (acceptingLM instanceof LanguageModel.Sequence) return new UniformBoundaryLM(crossEntropyThreshold); else return new UniformProcessLM(crossEntropyThreshold); } /** * The default value of the category for accepting * input, "true". */ public static final String DEFAULT_ACCEPT_CATEGORY = Boolean.TRUE.toString(); /** * The default value of the category for rejecting input, * "false". */ public static final String DEFAULT_REJECT_CATEGORY = Boolean.FALSE.toString(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -