📄 ngramboundarylm.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.lm;import com.aliasi.io.BitInput;import com.aliasi.io.BitOutput;import com.aliasi.stats.Model;import com.aliasi.util.AbstractExternalizable;import java.io.InputStream;import java.io.IOException;import java.io.ObjectInput;import java.io.ObjectOutput;import java.io.OutputStream;/** * An <code>NGramBoundaryLM</code> provides a dynamic sequence * language model for which training, estimation and pruning may be * interleaved. A sequence language model normalizes probabilities * over all sequences. * * <P>The model may be compiled to an object output; the compiled * model read from the corresponding object input will be * an instance of {@link CompiledNGramBoundaryLM}. * * <P>This class wraps an n-gram process language model by supplying a * special boundary character <code>boundaryChar</code> at * construction time which will be added to the total number of * characters in defining the estimator. For each training event, the * boundary character is inserted both before and after the character * sequence provided. The actual unigram count of this boundary must * then be decremented so that the initial character isn't counted in * estimates. During estimation, the initial boundary character is * used as context and the final one is used to estimate the * end-of-stream likelihood. 
Thus if <code>P<sub><sub>pr</sub></sub></code> * is the underlying process model then the boundary model defines * estimates by: * * <blockquote><code> * P<sub><sub>b</sub></sub>(c<sub><sub>1</sub></sub>,...,c<sub><sub>N</sub></sub>) * <br> * = P<sub><sub>pr</sub></sub>(boundaryChar|boundaryChar,c<sub><sub>1</sub></sub>,...,c<sub><sub>N</sub></sub>) * <br> * * <big><big><big>Π</big></big></big><sub><sub>1&lt;=i&lt;=N</sub></sub> * P<sub><sub>pr</sub></sub>(c<sub><sub>i</sub></sub>|boundaryChar,c<sub><sub>1</sub></sub>,...,c<sub><sub>i-1</sub></sub>) * * <br> * = P<sub><sub>pr</sub></sub>(boundaryChar,c<sub><sub>1</sub></sub>,...,c<sub><sub>N</sub></sub>,boundaryChar) * / P<sub><sub>pr</sub></sub>(boundaryChar) * </code></blockquote> * * The result of serializing and deserializing an n-gram boundary * language model is a compiled implementation of a conditional * sequence language model. The serialization format is the boundary character * followed by the serialization of the contained writable process * language model. * *<p>Models may be pruned by pruning the substring counter returned * by {@link #substringCounter()}. See the documentation for the * class of the return object, {@link TrieCharSeqCounter}, for more * information. * * @author Bob Carpenter * @version 3.5.1 * @since LingPipe2.0 */public class NGramBoundaryLM implements LanguageModel.Sequence, LanguageModel.Conditional, LanguageModel.Dynamic, Model<CharSequence> { private final NGramProcessLM mProcessLM; private final char mBoundaryChar; private final char[] mBoundaryArray; /** * Constructs a dynamic n-gram sequence language model with the * specified maximum n-gram and default values for other * parameters. * * <P>The default number of characters is {@link * Character#MAX_VALUE}<code>-1</code>, the default interpolation * parameter ratio is equal to the n-gram length, and the boundary * character is the Unicode noncharacter <code>U+FFFF</code>. * * @param maxNGram Maximum n-gram length in model. 
*/ public NGramBoundaryLM(int maxNGram) { this(maxNGram,Character.MAX_VALUE-1); } /** * Constructs a dynamic n-gram sequence language model with the * specified maximum n-gram, specified maximum number of observed * characters, and default values for other parameters. * * <P>The default interpolation * parameter ratio is equal to the n-gram length, and the boundary * character is the byte-order marker <code>U+FFFF</code> * * @param maxNGram Maximum n-gram length in model. * @param numChars Maximum number of character seen in training * and test sets. */ public NGramBoundaryLM(int maxNGram, int numChars) { this(maxNGram,numChars,maxNGram,'\uFFFF'); } /** * Construct a dynamic n-gram sequence language model with the * specified maximum n-gram length, number of characters, * interpolation ratio hyperparameter and boundary character. * Note that the boundary character must not occur as a regular * character in the input. Unicode provides several options for * marker characters; for instance the byte order markers * <code>U+FFFF</code> or <code>U+FEFF</code> may be used * internally by applications but may not be part of valid unicode * character streams and thus make ideal choices for boundary * characters. See: * * <a href="http://www.unicode.org/versions/Unicode4.0.0/ch15.pdf">Unicode Standard, Chapter 15.8: NonCharacters</a> * * @param maxNGram Maximum n-gram length in model. * @param numChars Maximum number of character seen in training * and test sets. * @param lambdaFactor Interpolation ratio hyperparameter. * @param boundaryChar Boundary character. */ public NGramBoundaryLM(int maxNGram, int numChars, double lambdaFactor, char boundaryChar) { this(new NGramProcessLM(maxNGram,numChars+1,lambdaFactor), boundaryChar); } /** * Construct an n-gram boundary language model with the specified * boundary character and underlying process language model. * * <p>This constructor may be used to reconstitute a serialized * model. 
By writing the trie character sequence counter for the * underlying process language model, it may be read back in. * This may be used to construct a process language model, which * may be used to reconstruct a boundary language model using * this constructor. * * @param processLm Underlying process language model. * @param boundaryChar Character used to encode boundaries. */ public NGramBoundaryLM(NGramProcessLM processLm, char boundaryChar) { mBoundaryChar = boundaryChar; mBoundaryArray = new char[] { boundaryChar }; mProcessLM = processLm; } /** * Writes this language model to the specified output stream. * * <p>A bit output is wrapped around the output stream for * writing. The format begins with a delta-encoding of * the boundary character plus 1, and is followed by the * bit output of the underlying process language model. * * @param out Output stream from which to read the language model. * @throws IOException If there is an underlying I/O error. */ public void writeTo(OutputStream out) throws IOException { BitOutput bitOut = new BitOutput(out); bitOut.writeDelta((long)(mBoundaryChar+1)); mProcessLM.writeTo(bitOut); bitOut.flush(); } /** * Read a process language model from the specified input * stream. * * <p>See {@link #writeTo(OutputStream)} for a description * of the binary format. * * @param in Input stream from which to read the model. * @return Process language model read from stream. * @throws IOException If there is an underlying I/O error. */ public static NGramBoundaryLM readFrom(InputStream in) throws IOException {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -