📄 ngramboundarylm.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lm;

import com.aliasi.io.BitInput;
import com.aliasi.io.BitOutput;

import com.aliasi.stats.Model;

import com.aliasi.util.AbstractExternalizable;

import java.io.InputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.OutputStream;

/**
 * An <code>NGramBoundaryLM</code> provides a dynamic sequence
 * language model for which training, estimation and pruning may be
 * interleaved.  A sequence language model normalizes probabilities
 * over all sequences.
 *
 * <P>The model may be compiled to an object output; the compiled
 * model read from the corresponding object input will be
 * an instance of {@link CompiledNGramBoundaryLM}.
 *
 * <P>This class wraps an n-gram process language model by supplying a
 * special boundary character <code>boundaryChar</code> at
 * construction time which will be added to the total number of
 * characters in defining the estimator.  For each training event, the
 * boundary character is inserted both before and after the character
 * sequence provided.  The actual unigram count of this boundary must
 * then be decremented so that the initial character isn't counted in
 * estimates.  During estimation, the initial boundary character is
 * used as context and the final one is used to estimate the
 * end-of-stream likelihood.  Thus if
 * <code>P<sub><sub>pr</sub></sub></code>
 * is the underlying process model then the boundary model defines
 * estimates by:
 *
 * <blockquote><code>
 *  P<sub><sub>b</sub></sub>(c<sub><sub>1</sub></sub>,...,c<sub><sub>N</sub></sub>)
 * <br>&nbsp;
 * = P<sub><sub>pr</sub></sub>(boundaryChar|boundaryChar,c<sub><sub>1</sub></sub>,...,c<sub><sub>N</sub></sub>)
 * <br>&nbsp;&nbsp;&nbsp;
 *   * <big><big><big>&Pi;</big></big></big><sub><sub>1&lt;=i&lt;=N</sub></sub>
 *           P<sub><sub>pr</sub></sub>(c<sub><sub>i</sub></sub>|boundaryChar,c<sub><sub>1</sub></sub>,...,c<sub><sub>i-1</sub></sub>)
 *
 * <br>&nbsp;
 *  = P<sub><sub>pr</sub></sub>(boundaryChar,c<sub><sub>1</sub></sub>,...,c<sub><sub>N</sub></sub>,boundaryChar)
 *    / P<sub><sub>pr</sub></sub>(boundaryChar)
 * </code></blockquote>
 *
 * <p>The equations above are stated over probabilities; in log
 * space the product becomes a sum and the quotient becomes a
 * difference.
 *
 * <p>The result of serializing and deserializing an n-gram boundary
 * language model is a compiled implementation of a conditional
 * sequence language model.  The serialization format is the boundary
 * character followed by the serialization of the contained writable
 * process language model.
 *
 * <p>Models may be pruned by pruning the substring counter returned
 * by {@link #substringCounter()}.  See the documentation for the
 * class of the return object, {@link TrieCharSeqCounter}, for more
 * information.
 *
 * @author Bob Carpenter
 * @version 3.5.1
 * @since   LingPipe2.0
 */
public class NGramBoundaryLM
    implements LanguageModel.Sequence,
               LanguageModel.Conditional,
               LanguageModel.Dynamic,
               Model<CharSequence> {

    // Underlying process language model wrapped by this boundary model.
    private final NGramProcessLM mProcessLM;

    // Character used to mark sequence boundaries.
    private final char mBoundaryChar;

    // Single-element array holding the boundary character.
    private final char[] mBoundaryArray;

    /**
     * Constructs a dynamic n-gram sequence language model with the
     * specified maximum n-gram and default values for other
     * parameters.
     *
     * <P>The default number of characters is {@link
     * Character#MAX_VALUE}<code>-1</code>, the default interpolation
     * parameter ratio is equal to the n-gram length, and the boundary
     * character is the Unicode noncharacter <code>U+FFFF</code>.
     *
     * @param maxNGram Maximum n-gram length in model.
     */
    public NGramBoundaryLM(int maxNGram) {
        this(maxNGram,Character.MAX_VALUE-1);
    }

    /**
     * Constructs a dynamic n-gram sequence language model with the
     * specified maximum n-gram, specified maximum number of observed
     * characters, and default values for other parameters.
     *
     * <P>The default interpolation parameter ratio is equal to the
     * n-gram length, and the boundary character is the Unicode
     * noncharacter <code>U+FFFF</code>.
     *
     * @param maxNGram Maximum n-gram length in model.
     * @param numChars Maximum number of characters seen in training
     * and test sets.
     */
    public NGramBoundaryLM(int maxNGram, int numChars) {
        // The n-gram length doubles as the interpolation ratio default.
        this(maxNGram,numChars,maxNGram,'\uFFFF');
    }

    /**
     * Construct a dynamic n-gram sequence language model with the
     * specified maximum n-gram length, number of characters,
     * interpolation ratio hyperparameter and boundary character.
     * Note that the boundary character must not occur as a regular
     * character in the input.  Unicode reserves noncharacter code
     * points for this kind of use; for instance <code>U+FFFF</code>
     * and <code>U+FFFE</code> may be used internally by applications
     * but are not intended to appear in interchanged Unicode
     * character streams, and thus make ideal choices for boundary
     * characters.  (Note that <code>U+FEFF</code>, the byte order
     * mark, is a regular character and is not a noncharacter.)  See:
     *
     * <a href="http://www.unicode.org/versions/Unicode4.0.0/ch15.pdf">Unicode Standard, Chapter 15.8:  NonCharacters</a>
     *
     * @param maxNGram Maximum n-gram length in model.
     * @param numChars Maximum number of characters seen in training
     * and test sets.
     * @param lambdaFactor Interpolation ratio hyperparameter.
     * @param boundaryChar Boundary character.
     */
    public NGramBoundaryLM(int maxNGram,
                           int numChars,
                           double lambdaFactor,
                           char boundaryChar) {
        // The process model is sized with one extra character so that
        // the boundary character is counted alongside the regular ones.
        this(new NGramProcessLM(maxNGram,numChars+1,lambdaFactor),
             boundaryChar);
    }

    /**
     * Construct an n-gram boundary language model with the specified
     * boundary character and underlying process language model.
     * The process language model reference is stored as given; it is
     * not copied.
     *
     * <p>This constructor may be used to reconstitute a serialized
     * model.  By writing the trie character sequence counter for the
     * underlying process language model, it may be read back in.
     * This may be used to construct a process language model, which
     * may be used to reconstruct a boundary language model using
     * this constructor.
     *
     * @param processLm Underlying process language model.
     * @param boundaryChar Character used to encode boundaries.
     */
    public NGramBoundaryLM(NGramProcessLM processLm,
                           char boundaryChar) {
        mBoundaryChar = boundaryChar;
        mBoundaryArray = new char[] { boundaryChar };
        mProcessLM = processLm;
    }

    /**
     * Writes this language model to the specified output stream.
     *
     * <p>A bit output is wrapped around the output stream for
     * writing.  The format begins with a delta-encoding of
     * the boundary character plus 1, and is followed by the
     * bit output of the underlying process language model.
     * The bit output is flushed after writing; this method does
     * not itself close the stream.
     *
     * @param out Output stream to which the language model is written.
     * @throws IOException If there is an underlying I/O error.
     */
    public void writeTo(OutputStream out) throws IOException {
        BitOutput bitOut = new BitOutput(out);
        // NOTE(review): the +1 presumably keeps the encoded value
        // strictly positive for delta coding -- confirm against
        // BitOutput.writeDelta.
        bitOut.writeDelta((long)(mBoundaryChar+1));
        mProcessLM.writeTo(bitOut);
        bitOut.flush();
    }

    /**
     * Reads an n-gram boundary language model from the specified
     * input stream.
     *
     * <p>See {@link #writeTo(OutputStream)} for a description
     * of the binary format.
     *
     * @param in Input stream from which to read the model.
     * @return Boundary language model read from stream.
     * @throws IOException If there is an underlying I/O error.
     */
    public static NGramBoundaryLM readFrom(InputStream in)
        throws IOException {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -