📄 dynamiclmclassifier.java
字号:
Corpus<ClassificationHandler<CharSequence,Classification>> labeledData, Corpus<ObjectHandler<CharSequence>> unlabeledData, int numEpochs, double trainingInstanceMultiple) throws IOException { DynamicLMClassifier<L> lastClassifier = classifierFactory.create(); labeledData.visitCorpus(lastClassifier); for (int epoch = 0; epoch < numEpochs; ++epoch) { DynamicLMClassifier<L> classifier = classifierFactory.create(); labeledData.visitCorpus(classifier); ObjectHandler<CharSequence> emHandler = new EmHandler(classifier,lastClassifier,trainingInstanceMultiple); unlabeledData.visitCorpus(emHandler); lastClassifier = classifier; } return lastClassifier; } /** * Provides a training instance for the specified character * sequence using the best category from the specified * classification. Only the first-best category from the * classification is used. The object is cast to {@link CharSequence}, * and the result passed along with the first-best category * to {@link #train(String,CharSequence)}. * * @param charSequence Character sequence for training. * @param classification Classification to use for training. * @throws ClassCastException If the specified object does not * implement <code>CharSequence</code>. */ public void handle(CharSequence charSequence, Classification classification) { train(classification.bestCategory(),(CharSequence) charSequence); } /** * Returns the maximum likelihood estimator for categories in this * classifier. Changes to the returned model will be reflected in * this classifier; thus it may be used to train the category * estimator without affecting the language models for any * category. * * @return The maximum likelihood estimator for categories in this * classifier. * @deprecated As of 3.0, use general method {@link * #categoryDistribution()}. */ public MultivariateEstimator categoryEstimator() { return (MultivariateEstimator) mCategoryDistribution; } /** * Returns the language model for the specified category. Changes * to the returned model will be reflected in this classifier; thus * it may be used to train a language model without affecting * the category estimates. * * @return The language model for the specified category. * @throws IllegalArgumentException If the category is not known. * @deprecated As of 3.0, use general {@link #languageModel(String)}. */ public L lmForCategory(String category) { L result = mCategoryToModel.get(category); if (result == null) { String msg = "Unknown category=" + category; throw new IllegalArgumentException(msg); } return result; } /** * Writes a compiled version of this classifier to the specified * object output. The object returned will be an instance * of {@link LMClassifier}. * * @param objOut Object output to which this classifier is * written. * @throws IOException If there is an I/O exception writing to * the output stream. */ public void compileTo(ObjectOutput objOut) throws IOException { objOut.writeObject(new Externalizer(this)); } /** * Resets the specified category to the specified language model. * This also resets the count in the multivariate estimator of * categories to zero. * * @param category Category to reset. * @param lm New dynamic language model for category. * @param newCount New count for category. * @throws IllegalArgumentException If the category is not known. */ public void resetCategory(String category, L lm, int newCount) { if (newCount < 0) { String msg = "Count must be non-negative." + " Found new count=" + newCount; throw new IllegalArgumentException(msg); } categoryEstimator().resetCount(category); // resets to zero categoryEstimator().train(category,newCount); L currentLM = lmForCategory(category); for (int i = 0; i < mLanguageModels.length; ++i) { if (currentLM == mLanguageModels[i]) { mLanguageModels[i] = lm; break; } } mCategoryToModel.put(category,lm); } /** * Construct a dynamic classifier over the specified categories, * using process character n-gram models of the specified order. * * <P>See the documentation for the constructor {@link * #DynamicLMClassifier(String[], LanguageModel.Dynamic[])} for * information on the category multivariate estimate for priors. * * @param categories Categories used for classification. * @param maxCharNGram Maximum length of character sequence * counted in model. * @throws IllegalArgumentException If there are not at least two * categories. */ public static DynamicLMClassifier<NGramProcessLM> createNGramProcess(String[] categories, int maxCharNGram) { NGramProcessLM[] lms = new NGramProcessLM[categories.length]; for (int i = 0; i < lms.length; ++i) lms[i] = new NGramProcessLM(maxCharNGram); return new DynamicLMClassifier<NGramProcessLM>(categories,lms); } /** * Construct a dynamic classifier over the specified cateogries, * using boundary character n-gram models of the specified order. * * <P>See the documentation for the constructor {@link * #DynamicLMClassifier(String[], LanguageModel.Dynamic[])} for * information on the category multivariate estimate for priors. * * @param categories Categories used for classification. * @param maxCharNGram Maximum length of character sequence * counted in model. * @throws IllegalArgumentException If there are not at least two * categories. */ public static DynamicLMClassifier<NGramBoundaryLM> createNGramBoundary(String[] categories, int maxCharNGram) { NGramBoundaryLM[] lms = new NGramBoundaryLM[categories.length]; for (int i = 0; i < lms.length; ++i) lms[i] = new NGramBoundaryLM(maxCharNGram); return new DynamicLMClassifier<NGramBoundaryLM>(categories,lms); } /** * Construct a dynamic language model classifier over the * specified categories using token n-gram language models of the * specified order and the specified tokenizer factory for * tokenization. * * <P>The multivariate estimator over categories is initialized * with one count for each category. * * <P>The unknown token and whitespace models are uniform sequence * models. * * @param categories Categories used for classification. * @param maxTokenNGram Maximum length of token n-grams used. * @param tokenizerFactory Tokenizer factory for tokenization. * @throws IllegalArgumentException If there are not at least two * categories. */ public static DynamicLMClassifier<TokenizedLM> createTokenized(String[] categories, TokenizerFactory tokenizerFactory, int maxTokenNGram) { TokenizedLM[] lms = new TokenizedLM[categories.length]; for (int i = 0; i < lms.length; ++i) lms[i] = new TokenizedLM(tokenizerFactory,maxTokenNGram); return new DynamicLMClassifier<TokenizedLM>(categories,lms); } // used in init and by other classes to create a smoothed estimator static MultivariateEstimator createCategoryEstimator(String[] categories) { MultivariateEstimator estimator = new MultivariateEstimator(); for (int i = 0; i < categories.length; ++i) estimator.train(categories[i],1); return estimator; } private static class Externalizer extends AbstractExternalizable { static final long serialVersionUID = -5411956637253735953L; final DynamicLMClassifier mClassifier; public Externalizer() { mClassifier = null; } public Externalizer(DynamicLMClassifier classifier) { mClassifier = classifier; } public void writeExternal(ObjectOutput objOut) throws IOException { objOut.writeObject(mClassifier.categories()); mClassifier.categoryEstimator().compileTo(objOut); int numCategories = mClassifier.mCategories.length; for (int i = 0; i < numCategories; ++i) ((LanguageModel.Dynamic) mClassifier.mLanguageModels[i]).compileTo(objOut); } public Object read(ObjectInput objIn) throws ClassNotFoundException, IOException { String[] categories = (String[]) objIn.readObject(); MultivariateDistribution categoryEstimator = (MultivariateDistribution) objIn.readObject(); LanguageModel[] models = new LanguageModel[categories.length]; for (int i = 0; i < models.length; ++i) models[i] = (LanguageModel) objIn.readObject(); return new LMClassifier(categories,models,categoryEstimator); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -