📄 uniformboundarylm.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
字号:
package com.aliasi.lm;import com.aliasi.util.AbstractExternalizable;import com.aliasi.util.Strings;import java.io.IOException;import java.io.ObjectInput;import java.io.ObjectOutput;/** * A <code>UniformBoundaryLM</code> implements a uniform sequence * language model with a specified number of outcomes and the same * probability assigned to the end-of-stream marker.  The formula * for computing sequence likelihood estimates is: * * <blockquote><code> *   log2Estimate(cSeq) = *   = log<sub><sub>2</sub></sub> ( (cSeq.length()+1) / (numOutcomes+1) ) * </code></blockquote> * * Adding one to the number of outcomes makes the end-of-sequence * just as likely as any other character.  Adding one to the * sequence length adds the log likelihood of the end-of-sequence * marker itself. * * <p>This model is defined as dynamic for convenience.  Calls * to the training methods have no effect. * * @author  Bob Carpenter * @version 2.3.2 * @since   LingPipe2.0 */public class UniformBoundaryLM    implements LanguageModel.Dynamic,               LanguageModel.Sequence {    private final double mLog2EstimatePerChar;    private final int mNumOutcomes;    /**     * Construct uniform boundary language model with the full set     * of characters.     */    public UniformBoundaryLM() {        this(Character.MAX_VALUE-1);    }    /**     * Construct a uniform boundary language model with the specified     * number of outcomes.  The estimate will include the     * end-of-stream boundary output and thus the per-character     * estimate will be <code>1/(numOutcomes+1)</code>.     *     * @param numOutcomes Number of outcomes.     */    public UniformBoundaryLM(int numOutcomes) {        UniformProcessLM.validateNumOutcomes(numOutcomes+1);        mNumOutcomes = numOutcomes;        mLog2EstimatePerChar             = -com.aliasi.util.Math.log2(1.0 + (double)numOutcomes);    }    /**     * Create a constant uniform boundary LM with the specified     * character cross-entropy rate.  Recall that cross-entropy is the     * negative character average log probability.  Thus the log     * estimate returned for a boundary model will include the final     * terminator, and yield:     *     * <blockquote><code>     * log<sub><sub>2</sub></sub> P(cs)      * = - crossEntropyRate * (cs.length() + 1)     * </code></blockquote>     *     * The number of outcomes is set by rounding down the exponent of     * the cross-entropy and subtracting one for the boundary     * character:     *      * <blockquote><code>     * numOutcomes = (int) 2.0<sup><sup>crossEntropyRate</sup></sup> - 1     * </code></blockquote>     *      * Even if the above expression evaluates to less than zero, the     * number of outcomes will then be rounded up to zero.     *     * @param crossEntropyRate The cross-entropy rate of the model.     * @throws IllegalArgumentException If the cross-entropy rate is     * not finite and non-negative.     */    public UniformBoundaryLM(double crossEntropyRate) {        com.aliasi.util.Math.assertFiniteNonNegative("Cross-entropy rate",                                                      crossEntropyRate);        mLog2EstimatePerChar = -crossEntropyRate;        mNumOutcomes = Math.max(0,                                (int) (Math.pow(2.0,crossEntropyRate) - 1.0));    }        private UniformBoundaryLM(int numOutcomes,                              double log2EstimatePerChar) {        mNumOutcomes = numOutcomes;        mLog2EstimatePerChar = log2EstimatePerChar;    }    /**     * Returns the number of outcomes for this uniform model.     *     * @return The number of outcomes for this uniform model.     */    public int numOutcomes() {        return mNumOutcomes;    }    /**     * Writes a compiled version of this model to the specified object     * output.  The object read back in will also be an instance     * of {@link UniformBoundaryLM}.     *     * @param objOut Object output to which this model is written.     * @throws IOException If there is an I/O error during the write.     */    public void compileTo(ObjectOutput objOut) throws IOException {        objOut.writeObject(new Externalizer(this));    }    /**     * Ignores the training data.     *     * @param cs Ignored.     * @param start Ignored.     * @param end Ignored.     */    public void train(char[] cs, int start, int end) {        // ignore    }    /**     * Ignores the training data.     *     * @param cs Ignored.     * @param start Ignored.     * @param end Ignored.     * @param count Ignored.     */    public void train(char[] cs, int start, int end, int count) {        // ignore    }    /**     * Ignores the training data.     *     * @param cSeq Ignored.     */    public void train(CharSequence cSeq) {         // ignore    }    /**     * Ignores the training data.     *     * @param cSeq Ignored.     * @param count Ignored.     */    public void train(CharSequence cSeq, int count) {         // ignore    }    public double log2Estimate(char[] cs, int start, int end) {        Strings.checkArgsStartEnd(cs,start,end);        return log2Estimate(end-start);    }    public double log2Estimate(CharSequence cSeq) {        return log2Estimate(cSeq.length());    }    private double log2Estimate(int length) {        return mLog2EstimatePerChar * (1.0 + (double) length);    }    private static UniformBoundaryLM         createUniformBoundaryLM(int numOutcomes,                                double log2EstimatePerChar) {        return new UniformBoundaryLM(numOutcomes,log2EstimatePerChar);    }    /**     * A constant uniform boundary language model returning     * zero log estimates.  This is done by setting the number     * of characters to zero.       *     * <P>This constant is particularly useful for removing the     * contribution of whitespace characters to token n-gram language     * models.     */    public static final UniformBoundaryLM ZERO_LM        = new UniformBoundaryLM(0);    private static class Externalizer extends AbstractExternalizable {    	static final long serialVersionUID = -5389627995529538230L;    	private final UniformBoundaryLM mLM;        public Externalizer() {             mLM = null;         }        public Externalizer(UniformBoundaryLM lm) {            mLM = lm;        }        public Object read(ObjectInput objIn) throws IOException {            int numOutcomes = objIn.readInt();            double log2EstimatePerChar = objIn.readDouble();            return createUniformBoundaryLM(numOutcomes,log2EstimatePerChar);        }        public void writeExternal(ObjectOutput objOut) throws IOException {            objOut.writeInt(mLM.numOutcomes());            objOut.writeDouble(mLM.mLog2EstimatePerChar);        }    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -