TokenizedLM.java
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lm;

import com.aliasi.corpus.IntArrayHandler;
import com.aliasi.corpus.StringArrayHandler;
import com.aliasi.corpus.TextHandler;

import com.aliasi.symbol.SymbolTable;
import com.aliasi.symbol.MapSymbolTable;

import com.aliasi.stats.BinomialDistribution;
import com.aliasi.stats.Statistics;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
// import com.aliasi.util.Arrays;
import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Exceptions;
// import com.aliasi.util.Math;
import com.aliasi.util.Scored;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Strings;

import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.Iterator;

/**
 * A <code>TokenizedLM</code> provides a dynamic sequence language
 * model which models token sequences with an n-gram model, and
 * whitespace and unknown tokens with their own sequence language
 * models.
 *
 * <P>A tokenized language model factors the probability assigned to a
 * character sequence as follows:
 *
 * <blockquote><code>
 * P(cs)
 *   = P<sub><sub>tok</sub></sub>(toks(cs))
 *     <big><big><big>Π</big></big></big><sub><sub>t in unknownToks(cs)</sub></sub>
 *       P<sub><sub>unk</sub></sub>(t)
 *     <big><big><big>Π</big></big></big><sub><sub>w in whitespaces(cs)</sub></sub>
 *       P<sub><sub>whsp</sub></sub>(w)
 * </code></blockquote>
 *
 * where
 *
 * <UL>
 *
 * <LI> <code>P<sub><sub>tok</sub></sub></code> is the token model
 * estimate, where <code>toks(cs)</code> replaces known tokens with
 * their integer identifiers, replaces unknown tokens with
 * <code>-1</code>, and adds boundary symbols <code>-2</code> front
 * and back; the same adjustment is used to remove the initial
 * boundary estimate as in {@link NGramBoundaryLM};
 *
 * <LI> <code>P<sub><sub>unk</sub></sub></code> is the unknown token
 * sequence language model and <code>unknownToks(cs)</code> is the
 * list of unknown tokens in the input (with duplicates); and
 *
 * <LI> <code>P<sub><sub>whsp</sub></sub></code> is the whitespace sequence
 * language model and <code>whitespaces(cs)</code> is the list of
 * whitespaces in the character sequence (with duplicates).
 *
 * </UL>
 *
 * <P>The token n-gram model itself uses the same method of counting
 * and smoothing as described in the class documentation for {@link
 * NGramProcessLM}.  Like {@link NGramBoundaryLM}, boundary tokens are
 * inserted before and after other tokens.  And like the n-gram
 * character boundary model, the initial boundary estimate is
 * subtracted from the overall estimate for normalization purposes.
 *
 * <P>Tokens are all converted to integer identifiers using an
 * internal dynamic symbol table.  All symbols in symbol tables get
 * non-negative identifiers; the negative value <code>-1</code> is
 * used for the unknown token in models, just as in symbol tables.
 * The value <code>-2</code> is used for the boundary marker in the
 * counters.
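 *
 * <P>For instance (an illustrative sketch with a hypothetical symbol
 * table), if the symbol table maps <code>the</code> to <code>0</code>
 * and <code>dog</code> to <code>1</code>, and <code>runs</code> is
 * unknown, then <code>toks("the dog runs")</code> is the identifier
 * sequence <code>-2, 0, 1, -1, -2</code>, with <code>-2</code>
 * marking the boundaries and <code>-1</code> marking the unknown
 * token, whose characters are then modeled by the unknown token
 * language model.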
 *
 * <P>In order for all estimates to be non-zero, the integer
 * sequence counter used to back the token model is initialized
 * with a count of 1 for the end-of-stream identifier (-2).  The
 * unknown token count for any context is taken to be the number
 * of outcomes in that context.  Because unknowns are estimated
 * directly in this manner, there is no need to interpolate the
 * unigram model with a uniform model for unknown outcomes.  Instead,
 * the occurrence of an unknown is modeled directly and its
 * identity is modeled by the unknown token language model.
 *
 * <P>In order to produce a properly normalized sequence model, the
 * tokens and whitespaces returned by the tokenizer should
 * concatenate together to produce the original input.  Note that
 * this condition is <i>not</i> checked at runtime.  But sequences
 * may be normalized before being trained and evaluated for a
 * language model.  For instance, all alphabetic characters might be
 * reduced to lower case, all punctuation characters removed, and
 * all non-empty sequences of whitespace reduced to a single space
 * character.  A language model may then be defined over this
 * normalized space of input, not the original space (and may thus
 * use a reduced number of characters for its uniform estimates).
 * Although this normalization may be carried out by a tokenizer in
 * practice, for instance for use in a tokenized classifier, such
 * normalization is consistent with the interface specification for
 * {@link LanguageModel.Sequence} or {@link LanguageModel.Dynamic}
 * only if done on the outside.
 *
 * @author  Bob Carpenter
 * @version 3.1.2
 * @since   LingPipe2.0
 */
public class TokenizedLM
    implements LanguageModel.Dynamic,
               LanguageModel.Sequence,
               LanguageModel.Tokenized,
               TextHandler {

    private final TokenizerFactory mTokenizerFactory;
    private final MapSymbolTable mSymbolTable;
    private final TrieIntSeqCounter mCounter;
    private final LanguageModel.Sequence mUnknownTokenModel;
    private final LanguageModel.Sequence mWhitespaceModel;
    private final double mLambdaFactor;
    private final LanguageModel.Dynamic mDynamicUnknownTokenModel;
    private final LanguageModel.Dynamic mDynamicWhitespaceModel;
    private final int mNGramOrder;

    /**
     * Constructs a tokenized language model with the specified
     * tokenization factory and n-gram order.  The unknown token and
     * whitespace models are both uniform sequence language models
     * with default parameters as described in the documentation for
     * the constructor {@link UniformBoundaryLM#UniformBoundaryLM()}.
     * The default interpolation hyperparameter is equal to the
     * n-gram order.
     *
     * @param factory Tokenizer factory for the model.
     * @param nGramOrder N-gram order.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 1.
     */
    public TokenizedLM(TokenizerFactory factory,
                       int nGramOrder) {
        this(factory, nGramOrder,
             new UniformBoundaryLM(),
             new UniformBoundaryLM(),
             nGramOrder);
    }

    /**
     * Constructs a tokenized language model with the specified
     * tokenization factory, n-gram order, sequence models for
     * unknown tokens and whitespace, and interpolation
     * hyperparameter.
     *
     * <P>In order for this model to be serializable, the unknown
     * token and whitespace models must be serializable.  If they are
     * not, a runtime exception will be thrown when attempting to
     * serialize this model.  If these models implement {@link
     * LanguageModel.Dynamic}, they will be trained by calls to the
     * training methods.
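     *
     * <P>For example, the following sketch (illustrative only; it
     * assumes the Indo-European tokenizer factory and 5-gram
     * character boundary models for the unknown token and whitespace
     * models) constructs and trains a trigram token model:
     *
     * <pre>
     * TokenizerFactory factory = new IndoEuropeanTokenizerFactory();
     * TokenizedLM lm
     *     = new TokenizedLM(factory, 3,
     *                       new NGramBoundaryLM(5),  // unknown token model
     *                       new NGramBoundaryLM(5),  // whitespace model
     *                       3.0);                    // interpolation hyperparameter
     * lm.train("John ran home.");
     * double log2Prob = lm.log2Estimate("John ran away.");
     * </pre>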
     *
     * @param tokenizerFactory Tokenizer factory for the model.
     * @param nGramOrder Length of maximum n-gram for model.
     * @param unknownTokenModel Sequence model for unknown tokens.
     * @param whitespaceModel Sequence model for all whitespace.
     * @param lambdaFactor Value of the interpolation hyperparameter.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 1 or the interpolation hyperparameter is not a
     * non-negative number.
     */
    public TokenizedLM(TokenizerFactory tokenizerFactory,
                       int nGramOrder,
                       LanguageModel.Sequence unknownTokenModel,
                       LanguageModel.Sequence whitespaceModel,
                       double lambdaFactor) {
        NGramProcessLM.checkMaxNGram(nGramOrder);
        NGramProcessLM.checkLambdaFactor(lambdaFactor);
        mSymbolTable = new MapSymbolTable();
        mNGramOrder = nGramOrder;
        mTokenizerFactory = tokenizerFactory;
        mUnknownTokenModel = unknownTokenModel;
        mWhitespaceModel = whitespaceModel;
        mDynamicUnknownTokenModel
            = (mUnknownTokenModel instanceof LanguageModel.Dynamic)
            ? (LanguageModel.Dynamic) mUnknownTokenModel
            : null;
        mDynamicWhitespaceModel
            = (mWhitespaceModel instanceof LanguageModel.Dynamic)
            ? (LanguageModel.Dynamic) mWhitespaceModel
            : null;
        mCounter = new TrieIntSeqCounter(nGramOrder);
        mLambdaFactor = lambdaFactor;
        // seed the counter with the boundary token so boundary
        // estimates are never zero
        mCounter.incrementSubsequences(new int[] { BOUNDARY_TOKEN }, 0, 1);
    }

    /**
     * Returns the integer sequence counter underlying this model.
     * Symbols are mapped to integers using the symbol table returned
     * by {@link #symbolTable()}.  Changes to this counter affect
     * this tokenized language model.
     *
     * @return The sequence counter underlying this model.
     */
    public TrieIntSeqCounter sequenceCounter() {
        return mCounter;
    }

    /**
     * Returns the symbol table underlying this tokenized language
     * model's token n-gram model.  Changes to the symbol table
     * affect this tokenized language model.
     *
     * @return The symbol table underlying this language model.
     */
    public SymbolTable symbolTable() {
        return mSymbolTable;
    }

    /**
     * Returns the order of the token n-gram model underlying this
     * tokenized language model.
     *
     * @return The order of the token n-gram model underlying this
     * tokenized language model.
     */
    public int nGramOrder() {
        return mNGramOrder;
    }

    /**
     * Returns the tokenizer factory for this tokenized language
     * model.
     *
     * @return The tokenizer factory for this tokenized language
     * model.
     */
    public TokenizerFactory tokenizerFactory() {
        return mTokenizerFactory;
    }

    /**
     * Returns the unknown token sequence language model for this
     * tokenized language model.  Changes to the returned language
     * model affect this tokenized language model.
     *
     * @return The unknown token language model.
     */
    public LanguageModel.Sequence unknownTokenLM() {
        return mUnknownTokenModel;
    }

    /**
     * Returns the whitespace language model for this tokenized
     * language model.  Changes to the returned language model affect
     * this tokenized language model.
     *
     * @return The whitespace language model.
     */
    public LanguageModel.Sequence whitespaceLM() {
        return mWhitespaceModel;
    }

    /**
     * Writes a compiled version of this tokenized language model to
     * the specified object output.  When the model is read back in,
     * it will be an instance of {@link CompiledTokenizedLM}.
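     *
     * <P>For instance, given a trained <code>TokenizedLM</code>
     * <code>lm</code>, a round trip through compilation might look
     * like the following sketch (illustrative only; it writes to an
     * in-memory byte array rather than a file, and elides exception
     * handling for <code>readObject()</code>):
     *
     * <pre>
     * ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
     * ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
     * lm.compileTo(objOut);
     * objOut.close();
     * ObjectInput objIn
     *     = new ObjectInputStream(new ByteArrayInputStream(bytesOut.toByteArray()));
     * LanguageModel.Sequence compiledLm
     *     = (LanguageModel.Sequence) objIn.readObject();
     * </pre>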
     *
     * @param objOut Object output to which a compiled version of
     * this model is written.
     * @throws IOException If there is an I/O error writing the
     * output.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Visits the n-grams of the specified length with at least the
     * specified minimum count stored in the underlying counter of
     * this tokenized language model and passes them to the specified
     * handler.
     *
     * @param nGramLength Length of n-grams visited.
     * @param minCount Minimum count of a visited n-gram.
     * @param handler Handler whose handle method is called for each
     * visited n-gram.
     */
    public void handleNGrams(int nGramLength, int minCount,
                             StringArrayHandler handler) {
        StringArrayAdapter adapter = new StringArrayAdapter(handler);
        mCounter.handleNGrams(nGramLength, minCount, adapter);
    }

    // interpolation weight for the context tokIds, computed as in
    // the smoothing documented in NGramProcessLM
    double lambda(int[] tokIds) {
        double numExtensionsD
            = mCounter.numExtensions(tokIds, 0, tokIds.length);
        double extCountD
            = mCounter.extensionCount(tokIds, 0, tokIds.length);
        return extCountD / (extCountD + mLambdaFactor * numExtensionsD);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cSeq Character sequence to train.
     */
    public void train(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        train(cs, 0, cs.length);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic) with the specified number
     * of instances.  Calling <code>train(cs,n)</code> is equivalent
     * to calling <code>train(cs)</code> a total of <code>n</code>
     * times.
     *
     * @param cSeq Character sequence to train.
     * @param count Number of instances to train.
     * @throws IllegalArgumentException If the count is negative.
     */
    public void train(CharSequence cSeq, int count) {
        if (count < 0) {
            String msg = "Counts must be non-negative."
                + " Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        if (count == 0) return;
        char[] cs = Strings.toCharArray(cSeq);
        train(cs, 0, cs.length, count);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cs Underlying character array.
     * @param start Index of first character in slice.
     * @param end Index of one plus last character in slice.