📄 trainspellchecker.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.spell;import com.aliasi.corpus.TextHandler;import com.aliasi.lm.CompiledNGramProcessLM;import com.aliasi.lm.NGramProcessLM;import com.aliasi.tokenizer.Tokenizer;import com.aliasi.tokenizer.TokenizerFactory;import com.aliasi.util.AbstractExternalizable;import com.aliasi.util.Compilable;import com.aliasi.util.ObjectToCounterMap;import com.aliasi.util.Strings;import java.io.Externalizable;import java.io.ObjectInput;import java.io.ObjectOutput;import java.io.IOException;import java.io.Serializable;import java.util.HashSet;import java.util.Set;/** * A <code>TrainSpellChecker</code> instance provides a mechanism for * collecting training data for a compiled spell checker. Training * instances are nothing more than character sequences which represent * likely user queries. * * <h3>Data Normalization</h3> * * <P>In training the source language model, all training data is * whitespace normalized with an initial whitespace, final whitespace, * and all internal whitespace sequences converted to a single space * character. * * <h3>Token Sensitivity</h3> * * <P>A tokenization factory may be optionally specified for training * token-sensitive spell checkers. With tokenization, input is * further normalized to insert a single whitespace between all * tokens not already separated by a space in the input. The tokens * are then output during compilation and read back into the compiled * spell checker. The set of tokens output may be pruned to remove * any below a given count threshold. The resulting set of tokens * is used to constrain the set of alternative spellings suggested * during spelling correction to include only tokens in the observed * token set. * * <h3>Direct Training</h3> * * <P>As an alternative to using the spell checker trainer, a language * model may be trained directly and supplied in compiled form along * with a weighted edit distance to the public constructors for * compiled spell checkers. It's critical that the normalization happens * the same way as for the spell checker trainer. * * <h3>Weighted Edit Distance</h3> * * <P>In constructing a spell checker trainer, a compilable weighted * edit distance must be specified. This edit distance model will be * compiled along with the language model and token set and used as * the channel model in the compiled spell checker. The * * <h3>Compilation</h3> * * <P>After training, a model is written out through the * <code>Compilable</code> interface using {@link * #compileTo(ObjectOutput)}. When this model is read back in, it * will be an instance of {@link CompiledSpellChecker}. The compiled * spell checkers allow many runtime parameters to be tuned; see the * class documentation for full details. * * <h3>Serialization</h3> * * A spell checker trainer may be serialized in the usual way: * * <blockquote><pre> * TrainSpellChecker trainer = ...; * ObjectOutput out = ...; * out.writeObject(trainer);</pre></blockquote> * * And then read back in by reversing this operation: * * <blockquote><pre> * ObjectInput in = ...; * TrainSpellChecker trainer * = (TrainSpellChecker) in.readObject();</pre></blockquote> * * <p>The resulting round trip produces a trainer that is functionally * identical to the original one. Serialization is useufl for * storing models for which more training data will be available * later. * * <p><b>Warning:</b> The object input and output used for * serialization must extend {@link java.io.InputStream} and {@link * java.io.OutputStream}. The only implementations of {@link ObjectInput} and * {@link ObjectOutput} as of the 1.6 JDK do extend the streams, so * this will only be a problem with customized object input or output * objects. If you need this method to work with custom input and * output objects that do not extend the corresponding streams, drop * us a line and we can perhaps refactor the output methods to remove * this restriction. [Note: This warning was inherited from {@link * NGramProcessLM}.] * * * @author Bob Carpenter * @version 3.6 * @since LingPipe2.0 */public class TrainSpellChecker implements Compilable, TextHandler, Serializable { private final WeightedEditDistance mEditDistance; private final NGramProcessLM mLM; private final TokenizerFactory mTokenizerFactory; private final ObjectToCounterMap<String> mTokenCounter; private long mNumTrainingChars = 0L; private TrainSpellChecker(long numTrainingChars, WeightedEditDistance editDistance, NGramProcessLM lm, TokenizerFactory tokenizerFactory, ObjectToCounterMap<String> tokenCounter) { mNumTrainingChars = numTrainingChars; mEditDistance = editDistance; mLM = lm; mTokenizerFactory = tokenizerFactory; mTokenCounter = tokenCounter; } /** * Construct a non-tokenizing spell checker trainer from the * specified language model and edit distance. See {@link * SpellChecker} for more information on the language model and * edit distance models in the compiled spell checker. * * @param lm Compilable language model. * @param editDistance Compilable weighted edit distance. * @throws IllegalArgumentException If the edit distance is not * compilable. */ public TrainSpellChecker(NGramProcessLM lm, WeightedEditDistance editDistance) { this(lm,editDistance,null); } /** * Construct a spell checker trainer from the specified n-gram * process language model, tokenizer factory and edit distance. * The language model must be an instance of the character-level * n-gram process language model class. The edit distance must be * compilable. The tokenizer factory may be <code>null</code>, in * which case tokens are not saved as part of training and the * compiled spell checker is not token sensitive. If the * tokenizer factory is specified, it must be compilable. * * @param lm Compilable language model. * @param editDistance Compilable weighted edit distance. * @param tokenizerFactory Optional tokenizer factory. * @throws IllegalArgumentException If the edit distance is not * compilable or if the tokenizer factory is non-null and not compilable. */ public TrainSpellChecker(NGramProcessLM lm, WeightedEditDistance editDistance, TokenizerFactory tokenizerFactory) { assertCompilable("Edit distance",editDistance); if (tokenizerFactory != null) assertCompilable("Tokenizer factory",tokenizerFactory); mLM = lm; mTokenizerFactory = tokenizerFactory; mEditDistance = editDistance; mTokenCounter = new ObjectToCounterMap<String>(); } /** * Returns the n-gram process language model (source model) * underlying this spell checker trainer. * * <p>The returned value is a reference to the language model * held by the trainer, so any changes to it will affect this * spell checker. * * @return The n-gram process LM for this trainer. */ public NGramProcessLM languageModel() { return mLM; } /** * Returns the weighted edit distance (channel model) underlying this spell checker * trainer. * * <p>The returned value is a reference to the langauge model * held by the trainer, so any changes to it will affect this * spell checker. * * @return The edit distance for this trainer. */ public WeightedEditDistance editDistance() { return mEditDistance; } /** * Returns the counter for the tokens in the training set. This * may be used to print out the tokens with their counts for later * perusal. The value returned is the actual counter, so any * changes made to it will be reflected in this spell checker. * Pruning the token counts may have eliminated tokens in the
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -