📄 dynamiclmclassifier.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.classify;import com.aliasi.corpus.ClassificationHandler;import com.aliasi.corpus.Corpus;import com.aliasi.corpus.ObjectHandler;import com.aliasi.stats.MultivariateDistribution;import com.aliasi.stats.MultivariateEstimator;import com.aliasi.lm.LanguageModel;import com.aliasi.lm.NGramProcessLM;import com.aliasi.lm.NGramBoundaryLM;import com.aliasi.lm.TokenizedLM;import com.aliasi.tokenizer.TokenizerFactory;import com.aliasi.util.AbstractExternalizable;import com.aliasi.util.Compilable;import com.aliasi.util.Factory;import java.io.ObjectInput;import java.io.ObjectOutput;import java.io.IOException;/** * A <code>DynamicLMClassifier</code> is a language model classifier * that accepts training events of categorized character sequences. * Training is based on a multivariate estimator for the category * distribution and dynamic language models for the per-category * character sequence estimators. These models also form the basis of * the superclass's implementation of classification. * * <P>Because this class implements training and classification, it * may be used in tag-a-little, learn-a-little supervised learning * without retraining epochs. This makes it ideal for active * learning applications, for instance. * * <P>At any point after adding training events, the classfier may be * compiled to an object output. 
The classifier read back in will be * a non-dynamic instance of {@link LMClassifier}. It will be based * on the compiled version of the multivariate estimator and the * compiled version of the dynamic language models for the categories. * * <P>Instances of this class allow concurrent read operations but * require writes to run exclusively. Reads in this context are * either calculating estimates or compiling; writes are training. * Extensions to LingPipe's classes may impose tighter restrictions. * For instance, a subclass of <code>MultivariateEstimator</code> * might be used that does not allow concurrent estimates; in that * case, its restrictions are passed on to this classifier. The same * goes for the language models and in the case of token language * models, the tokenizer factories. * * @author Bob Carpenter * @version 3.3.1 * @since LingPipe2.0 */public class DynamicLMClassifier<L extends LanguageModel.Dynamic> extends LMClassifier<L,MultivariateEstimator> implements ClassificationHandler<CharSequence,Classification>, Compilable { /** * Construct a dynamic language model classifier over the * specified categories with specified language * models per category and an overall category estimator. * * <P>The multivariate estimator over categories is initialized * with one count for each category. Technically, initializing * counts involves a uniform Dirichlet prior with * <code>α=1</code>, which is often called Laplace * smoothing. * * @param categories Categories used for classification. * @param languageModels Dynamic language models for categories. * @throws IllegalArgumentException If there are not at least two * categories, or if the length of the category and language model * arrays is not the same. 
*/
    public DynamicLMClassifier(String[] categories,
                               L[] languageModels) {
        // Superclass validates the arrays; the category estimator is
        // seeded with one count per category (add-one / Laplace prior,
        // as documented above).
        super(categories,
              languageModels,
              createCategoryEstimator(categories));
    }

    /**
     * Provide a training instance for the specified category
     * consisting of the sequence of characters in the specified
     * character slice.  A call to this method increments the count of
     * the category in the maximum likelihood estimator and also
     * trains the language model for the specified category.  Thus the
     * balance of categories reflected in calls to this method for
     * training should reflect the balance of categories in the test
     * set.
     *
     * <P>No modeling of the begin or end of the sequence is carried
     * out.  If such a behavior is desired, it should be reflected in
     * the training instances supplied to this method.
     *
     * <P>The component models for this classifier may be accessed and
     * trained independently using {@link #categoryEstimator()} and
     * {@link #lmForCategory(String)}.
     *
     * @param category Category of this training sequence.
     * @param cs Characters used for training.
     * @param start Index of first character to use for training.
     * @param end Index of one past the last character to use for
     * training.
     * @throws IllegalArgumentException If the category is not known.
     */
    public void train(String category, char[] cs, int start, int end) {
        // Delegates to the CharSequence overload; copies the slice into
        // a String, so the caller's array is not retained.
        train(category,new String(cs,start,end-start));
    }

    /**
     * Provide a training instance for the specified category
     * consisting of the specified sample character sequence.
     * Training behavior is as described in {@link
     * #train(String,char[],int,int)}.
     *
     * @param category Category of this training sequence.
     * @param sampleCSeq Category sequence for training.
     * @throws IllegalArgumentException If the category is not known.
     */
    public void train(String category, CharSequence sampleCSeq) {
        // A plain training event counts exactly once.
        train(category,sampleCSeq,1);
    }

    /**
     * Provide a training instance for the specified category
     * consisting of the specified sample character sequence with the
     * specified count.  Training behavior is as described in {@link
     * #train(String,char[],int,int)}.
     *
     * <p>Counts of zero are ignored, whereas counts less than
     * zero raise an exception.
     *
     * @param category Category of this training sequence.
     * @param sampleCSeq Category sequence for training.
     * @param count Number of training instances.
     * @throws IllegalArgumentException If the category is not known
     * or if the count is negative.
     */
    public void train(String category, CharSequence sampleCSeq, int count) {
        if (count < 0) {
            String msg = "Counts must be non-negative."
                + " Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        // Zero counts are a documented no-op; EmHandler relies on this
        // when quantization truncates a small probability to 0.
        if (count == 0) return;
        // Train the per-category character LM first, then bump the
        // category count in the multivariate estimator by the same amount.
        // lmForCategory throws IllegalArgumentException for unknown
        // categories before any state is modified.
        lmForCategory(category).train(sampleCSeq,count);
        categoryEstimator().train(category,count);
    }

    // Helper used only by the static EM training method: re-labels each
    // unlabeled datum with quantized soft counts from the previous
    // epoch's classifier and feeds them to the new classifier.
    private static class EmHandler implements ObjectHandler<CharSequence> {
        // NOTE(review): raw DynamicLMClassifier types here — the
        // ConditionalClassification assignment in handle() compiles via
        // erasure; confirm superclass classify() signature before
        // generifying these fields.
        private final DynamicLMClassifier mClassifier;       // classifier being trained this epoch
        private final DynamicLMClassifier mLastClassifier;   // classifier from the previous epoch
        private final double mMultiple;                      // scale converting probabilities to int counts
        EmHandler(DynamicLMClassifier classifier,
                  DynamicLMClassifier lastClassifier,
                  double multiple) {
            mClassifier = classifier;
            mLastClassifier = lastClassifier;
            mMultiple = multiple;
        }
        public void handle(CharSequence cs) {
            ConditionalClassification classification
                = mLastClassifier.classify(cs);
            // Train on every output category, weighted by its conditional
            // probability scaled by the multiple and truncated to an int;
            // categories whose scaled probability is < 1 contribute a
            // count of 0, which train() silently ignores.
            for (int rank = 0; rank < classification.size(); ++rank) {
                String category = classification.category(rank);
                double pCatGivenCs = classification.conditionalProbability(rank);
                int count = (int) (pCatGivenCs * mMultiple);
                mClassifier.train(category,cs,count);
            }
        }
    }

    /**
     * Train a dynamic language model classifier using the specified
     * labeled and unlabeled corpora with the expectation maximization
     * (EM) algorithm run for the specified number of epochs with the
     * specified instance multiple, creating a dynamic classifier for
     * each epoch using the specified factory.
     *
     * <p>The training instance multiple parameter specifies the
     * quantization of conditional probabilities into integer counts.
* The higher the value, the more outcomes are used for each * unlabeled instance. * * <p>The exact form of the EM algorithm as used by this method * is: * * <blockquote><pre> * 1. create classifier using factory * 2. train on labeled data * 3. for each epoch: * A. create a new classifier * B. train the new classifier on labeled data * C. for each unlabeled datum * i. classify using last classifier * ii. for each output category in result * a. multiply conditional prob by multiple, cast to int * b. train new classifier on datum using category plus count * </pre></blockquote> * * @param classifierFactory Factory for creating the dynamic * language model classifiers needed by EM. * @param labeledData A corpus of labeled data. * @param unlabeledData A corpus of unlabeled data. * @param numEpochs Number of epochs to run EM. * @param trainingInstanceMultiple Amount to multiply each * conditional probability by to generate an integer count * for training. */ public static <L extends LanguageModel.Dynamic> DynamicLMClassifier<L> trainEm(Factory<DynamicLMClassifier<L>> classifierFactory,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -