TokenizedLM.java
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lm;

import com.aliasi.corpus.IntArrayHandler;
import com.aliasi.corpus.StringArrayHandler;
import com.aliasi.corpus.TextHandler;

import com.aliasi.symbol.SymbolTable;
import com.aliasi.symbol.MapSymbolTable;

import com.aliasi.stats.BinomialDistribution;
import com.aliasi.stats.Statistics;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
// import com.aliasi.util.Arrays;
import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Exceptions;
// import com.aliasi.util.Math;
import com.aliasi.util.Scored;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Strings;

import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.Iterator;

/**
 * A <code>TokenizedLM</code> provides a dynamic sequence language
 * model which models token sequences with an n-gram model, and
 * whitespace and unknown tokens with their own sequence language
 * models.
 *
 * <P>A tokenized language model factors the probability assigned to a
 * character sequence as follows:
 *
 * <blockquote><code>
 * P(cs)
 *   = P<sub><sub>tok</sub></sub>(toks(cs))
 *     <big><big><big>Π</big></big></big><sub><sub>t in unknownToks(cs)</sub></sub>
 *       P<sub><sub>unk</sub></sub>(t)
 *     <big><big><big>Π</big></big></big><sub><sub>w in whitespaces(cs)</sub></sub>
 *       P<sub><sub>whsp</sub></sub>(w)
 * </code></blockquote>
 *
 * where
 *
 * <UL>
 *
 * <LI> <code>P<sub><sub>tok</sub></sub></code> is the token model
 * estimate, where <code>toks(cs)</code> replaces known tokens with
 * their integer identifiers, replaces unknown tokens with
 * <code>-1</code>, and adds boundary symbols <code>-2</code> front
 * and back; the same adjustment is used to remove the initial
 * boundary estimate as in {@link NGramBoundaryLM};
 *
 * <LI> <code>P<sub><sub>unk</sub></sub></code> is the unknown token
 * sequence language model and <code>unknownToks(cs)</code> is the
 * list of unknown tokens in the input (with duplicates); and
 *
 * <LI> <code>P<sub><sub>whsp</sub></sub></code> is the whitespace sequence
 * language model and <code>whitespaces(cs)</code> is the list of
 * whitespaces in the character sequence (with duplicates).
 *
 * </UL>
 *
 * <P>The token n-gram model itself uses the same method of counting
 * and smoothing as described in the class documentation for {@link
 * NGramProcessLM}.  Like {@link NGramBoundaryLM}, boundary tokens are
 * inserted before and after other tokens.  And like the n-gram
 * character boundary model, the initial boundary estimate is
 * subtracted from the overall estimate for normalization purposes.
 *
 * <P>Tokens are all converted to integer identifiers using an
 * internal dynamic symbol table.  All symbols in symbol tables get
 * non-negative identifiers; the negative value <code>-1</code> is
 * used for the unknown token in models, just as in symbol tables.
 * The value <code>-2</code> is used for the boundary marker in the
 * counters.
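 *
 * <P>For instance (an illustrative sketch with a hypothetical symbol
 * table), if the symbol table maps <code>the</code> to <code>0</code>
 * and <code>dog</code> to <code>1</code>, and <code>runs</code> is
 * unknown, then <code>toks("the dog runs")</code> is the identifier
 * sequence <code>-2, 0, 1, -1, -2</code>, with <code>-2</code>
 * marking the boundaries and <code>-1</code> marking the unknown
 * token, whose characters are then modeled by the unknown token
 * language model.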
 *
 * <P>In order for all estimates to be non-zero, the integer
 * sequence counter used to back the token model is initialized
 * with a count of 1 for the end-of-stream identifier (-2).  The
 * unknown token count for any context is taken to be the number
 * of outcomes in that context.  Because unknowns are estimated
 * directly in this manner, there is no need to interpolate the
 * unigram model with a uniform model for unknown outcomes.  Instead,
 * the occurrence of an unknown is modeled directly and its
 * identity is modeled by the unknown token language model.
 *
 * <P>In order to produce a properly normalized sequence model, the
 * tokens and whitespaces returned by the tokenizer should
 * concatenate together to produce the original input.  Note that
 * this condition is <i>not</i> checked at runtime.  But sequences
 * may be normalized before being trained and evaluated for a
 * language model.  For instance, all alphabetic characters might be
 * reduced to lower case, all punctuation characters removed, and
 * all non-empty sequences of whitespace reduced to a single space
 * character.  A language model may then be defined over this
 * normalized space of input, not the original space (and may thus
 * use a reduced number of characters for its uniform estimates).
 * Although this normalization may be carried out by a tokenizer in
 * practice, for instance for use in a tokenized classifier, such
 * normalization is consistent with the interface specification for
 * {@link LanguageModel.Sequence} or {@link LanguageModel.Dynamic}
 * only if done on the outside.
 *
 * @author  Bob Carpenter
 * @version 3.1.2
 * @since   LingPipe2.0
 */
public class TokenizedLM
    implements LanguageModel.Dynamic,
               LanguageModel.Sequence,
               LanguageModel.Tokenized,
               TextHandler {

    private final TokenizerFactory mTokenizerFactory;
    private final MapSymbolTable mSymbolTable;
    private final TrieIntSeqCounter mCounter;
    private final LanguageModel.Sequence mUnknownTokenModel;
    private final LanguageModel.Sequence mWhitespaceModel;
    private final double mLambdaFactor;
    private final LanguageModel.Dynamic mDynamicUnknownTokenModel;
    private final LanguageModel.Dynamic mDynamicWhitespaceModel;
    private final int mNGramOrder;

    /**
     * Constructs a tokenized language model with the specified
     * tokenization factory and n-gram order.  The unknown token and
     * whitespace models are both uniform sequence language models
     * with default parameters as described in the documentation for
     * the constructor {@link UniformBoundaryLM#UniformBoundaryLM()}.
     * The default interpolation hyperparameter is equal to the
     * n-gram order.
     *
     * @param factory Tokenizer factory for the model.
     * @param nGramOrder N-gram order.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 1.
     */
    public TokenizedLM(TokenizerFactory factory,
                       int nGramOrder) {
        this(factory, nGramOrder,
             new UniformBoundaryLM(),
             new UniformBoundaryLM(),
             nGramOrder);
    }

    /**
     * Constructs a tokenized language model with the specified
     * tokenization factory, n-gram order, sequence models for
     * unknown tokens and whitespace, and interpolation
     * hyperparameter.
     *
     * <P>In order for this model to be serializable, the unknown
     * token and whitespace models must be serializable.  If they are
     * not, a runtime exception will be thrown when attempting to
     * serialize this model.  If these models implement {@link
     * LanguageModel.Dynamic}, they will be trained by calls to the
     * training methods.
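     *
     * <P>For example, the following sketch (illustrative only; it
     * assumes the Indo-European tokenizer factory and 5-gram
     * character boundary models for the unknown token and whitespace
     * models) constructs and trains a trigram token model:
     *
     * <pre>
     * TokenizerFactory factory = new IndoEuropeanTokenizerFactory();
     * TokenizedLM lm
     *     = new TokenizedLM(factory, 3,
     *                       new NGramBoundaryLM(5),  // unknown token model
     *                       new NGramBoundaryLM(5),  // whitespace model
     *                       3.0);                    // interpolation hyperparameter
     * lm.train("John ran home.");
     * double log2Prob = lm.log2Estimate("John ran away.");
     * </pre>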
     *
     * @param tokenizerFactory Tokenizer factory for the model.
     * @param nGramOrder Length of maximum n-gram for model.
     * @param unknownTokenModel Sequence model for unknown tokens.
     * @param whitespaceModel Sequence model for all whitespace.
     * @param lambdaFactor Value of the interpolation hyperparameter.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 1 or the interpolation hyperparameter is not a
     * non-negative number.
     */
    public TokenizedLM(TokenizerFactory tokenizerFactory,
                       int nGramOrder,
                       LanguageModel.Sequence unknownTokenModel,
                       LanguageModel.Sequence whitespaceModel,
                       double lambdaFactor) {
        NGramProcessLM.checkMaxNGram(nGramOrder);
        NGramProcessLM.checkLambdaFactor(lambdaFactor);
        mSymbolTable = new MapSymbolTable();
        mNGramOrder = nGramOrder;
        mTokenizerFactory = tokenizerFactory;
        mUnknownTokenModel = unknownTokenModel;
        mWhitespaceModel = whitespaceModel;
        mDynamicUnknownTokenModel
            = (mUnknownTokenModel instanceof LanguageModel.Dynamic)
            ? (LanguageModel.Dynamic) mUnknownTokenModel
            : null;
        mDynamicWhitespaceModel
            = (mWhitespaceModel instanceof LanguageModel.Dynamic)
            ? (LanguageModel.Dynamic) mWhitespaceModel
            : null;
        mCounter = new TrieIntSeqCounter(nGramOrder);
        mLambdaFactor = lambdaFactor;
        // seed the counter with the boundary token so boundary
        // estimates are never zero
        mCounter.incrementSubsequences(new int[] { BOUNDARY_TOKEN }, 0, 1);
    }

    /**
     * Returns the integer sequence counter underlying this model.
     * Symbols are mapped to integers using the symbol table returned
     * by {@link #symbolTable()}.  Changes to this counter affect
     * this tokenized language model.
     *
     * @return The sequence counter underlying this model.
     */
    public TrieIntSeqCounter sequenceCounter() {
        return mCounter;
    }

    /**
     * Returns the symbol table underlying this tokenized language
     * model's token n-gram model.  Changes to the symbol table
     * affect this tokenized language model.
     *
     * @return The symbol table underlying this language model.
     */
    public SymbolTable symbolTable() {
        return mSymbolTable;
    }

    /**
     * Returns the order of the token n-gram model underlying this
     * tokenized language model.
     *
     * @return The order of the token n-gram model underlying this
     * tokenized language model.
     */
    public int nGramOrder() {
        return mNGramOrder;
    }

    /**
     * Returns the tokenizer factory for this tokenized language
     * model.
     *
     * @return The tokenizer factory for this tokenized language
     * model.
     */
    public TokenizerFactory tokenizerFactory() {
        return mTokenizerFactory;
    }

    /**
     * Returns the unknown token sequence language model for this
     * tokenized language model.  Changes to the returned language
     * model affect this tokenized language model.
     *
     * @return The unknown token language model.
     */
    public LanguageModel.Sequence unknownTokenLM() {
        return mUnknownTokenModel;
    }

    /**
     * Returns the whitespace language model for this tokenized
     * language model.  Changes to the returned language model affect
     * this tokenized language model.
     *
     * @return The whitespace language model.
     */
    public LanguageModel.Sequence whitespaceLM() {
        return mWhitespaceModel;
    }

    /**
     * Writes a compiled version of this tokenized language model to
     * the specified object output.  When the model is read back in,
     * it will be an instance of {@link CompiledTokenizedLM}.
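     *
     * <P>For instance, given a trained <code>TokenizedLM</code>
     * <code>lm</code>, a round trip through compilation might look
     * like the following sketch (illustrative only; it writes to an
     * in-memory byte array rather than a file, and elides exception
     * handling for <code>readObject()</code>):
     *
     * <pre>
     * ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
     * ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
     * lm.compileTo(objOut);
     * objOut.close();
     * ObjectInput objIn
     *     = new ObjectInputStream(new ByteArrayInputStream(bytesOut.toByteArray()));
     * LanguageModel.Sequence compiledLm
     *     = (LanguageModel.Sequence) objIn.readObject();
     * </pre>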
     *
     * @param objOut Object output to which a compiled version of
     * this model is written.
     * @throws IOException If there is an I/O error writing the
     * output.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Visits the n-grams of the specified length with at least the
     * specified minimum count stored in the underlying counter of
     * this tokenized language model and passes them to the specified
     * handler.
     *
     * @param nGramLength Length of n-grams visited.
     * @param minCount Minimum count of a visited n-gram.
     * @param handler Handler whose handle method is called for each
     * visited n-gram.
     */
    public void handleNGrams(int nGramLength, int minCount,
                             StringArrayHandler handler) {
        StringArrayAdapter adapter = new StringArrayAdapter(handler);
        mCounter.handleNGrams(nGramLength, minCount, adapter);
    }

    // interpolation weight for the context tokIds, computed as in
    // the smoothing documented in NGramProcessLM
    double lambda(int[] tokIds) {
        double numExtensionsD
            = mCounter.numExtensions(tokIds, 0, tokIds.length);
        double extCountD
            = mCounter.extensionCount(tokIds, 0, tokIds.length);
        return extCountD / (extCountD + mLambdaFactor * numExtensionsD);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cSeq Character sequence to train.
     */
    public void train(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        train(cs, 0, cs.length);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic) with the specified number
     * of instances.  Calling <code>train(cs,n)</code> is equivalent
     * to calling <code>train(cs)</code> a total of <code>n</code>
     * times.
     *
     * @param cSeq Character sequence to train.
     * @param count Number of instances to train.
     * @throws IllegalArgumentException If the count is negative.
     */
    public void train(CharSequence cSeq, int count) {
        if (count < 0) {
            String msg = "Counts must be non-negative."
                + " Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        if (count == 0) return;
        char[] cs = Strings.toCharArray(cSeq);
        train(cs, 0, cs.length, count);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cs Underlying character array.
     * @param start Index of first character in slice.
     * @param end Index of one plus last character in slice.