📄 compiledngramprocesslm.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
        return result;
    }

    /**
     * Returns the maximum length n-gram used for this compiled
     * language model.  The maximum amount of context used for
     * estimates will be one less than this.
     *
     * @return The maximum length n-gram counted in this language
     * model.
     */
    public int maxNGram() {
        return mMaxNGram;
    }

    /**
     * Returns the total number of nodes in this language model's trie
     * structure.  This represents the total number of sequences for
     * which there are precompiled conditional probability estimates.
     *
     * @return The total number of nodes in the underlying trie
     * structure.
     */
    public int numNodes() {
        return mChars.length;
    }

    /**
     * Returns the index, in the underlying parallel array structure,
     * of the maximum length suffix of the specified string that is
     * defined as a context.  If no non-empty suffix is found, the
     * index of the root node is returned.
     *
     * @param context String of context.
     * @return Index of maximum length suffix of the specified
     * context.
     */
    public int longestContextIndex(String context) {
        char[] cs = context.toCharArray();
        int length = cs.length;
        // try suffixes from longest (i == 0) to shortest
        for (int i = 0; i < length; ++i) {
            int k = getIndex(cs,i,length);
            if (k >= 0) {
                // indices at or beyond mLogOneMinusLambdas.length have no
                // entry in that array, so follow suffix links until one does
                while (k >= mLogOneMinusLambdas.length) {
                    k = mSuffix[k];
                }
                return k;
            }
        }
        return ROOT_NODE_INDEX; // back off all the way
    }

    /**
     * Returns the length of the first-child array.
     *
     * @return The length of <code>mFirstChild</code>.
     */
    int numInternalNodes() {
        return mFirstChild.length;
    }

    /**
     * Recursively fills in the suffix index for the node at the
     * specified index and for all of the nodes below it.
     *
     * @param context Character sequence spelled out by the path to
     * the node at the specified index.
     * @param index Index of the node whose suffix link is set.
     */
    private void compileSuffixes(String context, int index) {
        mSuffix[index] = suffixIndex(context);
        if (index >= mFirstChild.length) {
            return; // no first-child entry, hence no children to visit
        }
        int firstChildIndex = mFirstChild[index];
        // children of a node end where the next node's children begin
        int lastChildIndex
            = (index + 1 < mFirstChild.length)
            ? mFirstChild[index+1]
            : mChars.length;
        for (int i = firstChildIndex; i < lastChildIndex; ++i) {
            compileSuffixes(context + mChars[i], i);
        }
    }

    /**
     * Returns the index of the node for the specified context with
     * its first character removed.
     *
     * @param context Context whose suffix is looked up.
     * @return Index of the node for the context minus its first
     * character, or <code>-1</code> if the context is empty or the
     * suffix has no node.
     */
    private int suffixIndex(String context) {
        if (context.length() < 1) {
            return -1;
        }
        char[] cs = context.toCharArray();
        // look up the slice that skips the first character
        return getIndex(cs,1,cs.length);
    }

    /**
     * This method is a convenience implementation of the {@link
     * Model} interface which delegates the call to {@link
     * #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The log (base 2) probability of the specified character sequence.
     */
    public double log2Prob(CharSequence cSeq) {
        return log2Estimate(cSeq);
    }

    /**
     * This method is a convenience implementation of the {@link Model}
     * interface which returns the result of raising 2.0 to the
     * power of the result of a call to {@link #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The probability of the specified character sequence.
     */
    public double prob(CharSequence cSeq) {
        return Math.pow(2.0,log2Estimate(cSeq));
    }

    /**
     * Returns the log (base 2) estimate of the specified character
     * sequence by delegating to {@link #log2Estimate(char[],int,int)}
     * over all of its characters.
     *
     * @param cSeq Character sequence to estimate.
     * @return Log (base 2) estimate of the character sequence.
     */
    public final double log2Estimate(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        return log2Estimate(cs,0,cs.length);
    }

    /**
     * Returns the log (base 2) estimate of the specified character in
     * the context with the specified index.  This corresponds to
     * values returned by the conditional estimates when the context
     * and outcome character are specified in a single character sequence or
     * slice.
     *
     * <P>The main use of this method is to incrementally compute
     * conditional estimates and contexts, in conjunction with the
     * method {@link #nextContext(int,char)}.
     *
     * @param contextIndex Index of context of estimate.
     * @param nextChar Character being estimated.
     * @return Log (base 2) estimate of character in context.
     */
    public final double log2Estimate(int contextIndex,
                                     char nextChar) {
        double sum = 0.0;
        int outcomeIndex;
        // back off through suffix links until the character is found,
        // accumulating the log (1-lambda) value of each context left behind
        for (int currentContextIndex = contextIndex;
             (outcomeIndex = getIndex(currentContextIndex,nextChar)) < 0;
             currentContextIndex = mSuffix[currentContextIndex]) {
            if (currentContextIndex < mLogOneMinusLambdas.length) {
                sum += mLogOneMinusLambdas[currentContextIndex];
            }
            if (currentContextIndex == ROOT_NODE_INDEX) {
                // not found even at the root: use the uniform estimate
                return sum + mLogUniformEstimate;
            }
        }
        return sum + mLogProbs[outcomeIndex];
    }

    /**
     * Returns the index of the context formed by appending the
     * specified character to the context of the specified index.  The
     * main use of this method is to incrementally compute conditional
     * estimates and contexts, in conjunction with the method {@link
     * #log2Estimate(int,char)}.
     *
     * <P>Note that the index of the root node is always <code>0</code>.
     *
     * @param contextIndex Index of present context.
     * @param nextChar Next character.
     * @return Index of context formed by appending next character to
     * the present context.
     * @throws IllegalArgumentException If the context index is less
     * than zero or greater than the last context index.
     */
    public int nextContext(int contextIndex, char nextChar) {
        if (contextIndex < 0
            || contextIndex > mLastContextIndex) {
            String msg = "Context must be greater than or equal to zero."
                + " Context must be less than or equal to last index=" + mLastContextIndex
                + " Context=" + contextIndex;
            throw new IllegalArgumentException(msg);
        }
        for (int currentContextIndex = contextIndex;
             true;
             currentContextIndex = mSuffix[currentContextIndex]) {

            int outcomeIndex = getIndex(currentContextIndex,nextChar);
            // accept the extended node only if it has a (1-lambda) entry
            if (outcomeIndex >= 0
                && outcomeIndex < mLogOneMinusLambdas.length) {
                return outcomeIndex;
            }
            if (currentContextIndex == ROOT_NODE_INDEX) {
                return ROOT_NODE_INDEX; // can't go back further
            }
        }
    }

    /**
     * Returns the log (base 2) estimate of the specified slice of
     * characters, computed as the sum of the estimates of each
     * character given the context built up from the characters
     * before it in the slice.
     *
     * @param cs Underlying array of characters.
     * @param start Index of first character in the slice.
     * @param end Index of one past the last character in the slice.
     * @return Log (base 2) estimate of the slice.
     */
    public final double log2Estimate(char[] cs, int start, int end) {
        int len = mLogOneMinusLambdas.length;
        Strings.checkArgsStartEnd(cs,start,end);
        double sum = 0.0;
        int contextIndex = ROOT_NODE_INDEX;
        NEXT_CHAR:
        for (int i = start; i < end; ++i) {
            char nextChar = cs[i];
            int outcomeIndex;
            while ((outcomeIndex = getIndex(contextIndex,nextChar)) < 0) {
                if (contextIndex < len) {
                    sum += mLogOneMinusLambdas[contextIndex];
                }
                if (contextIndex == ROOT_NODE_INDEX) {
                    // unseen even at the root; the context is already the
                    // root, so it carries over unchanged to the next char
                    sum += mLogUniformEstimate;
                    continue NEXT_CHAR;
                }
                contextIndex = mSuffix[contextIndex]; // backoff until end
            }
            sum += mLogProbs[outcomeIndex];
            // the outcome becomes the next context if it has a (1-lambda)
            // entry; otherwise fall back to its suffix
            contextIndex
                = (outcomeIndex < len)
                ? outcomeIndex
                : mSuffix[outcomeIndex];
        }
        return sum;
    }

    /**
     * Returns the log (base 2) estimate of the last character of the
     * specified sequence given the characters before it, by
     * delegating to {@link #log2ConditionalEstimate(char[],int,int)}.
     *
     * @param cSeq Character sequence whose last character is estimated.
     * @return Log (base 2) conditional estimate of the last character.
     * @throws IllegalArgumentException If the sequence is empty.
     */
    public double log2ConditionalEstimate(CharSequence cSeq) {
        char[] cs = cSeq.toString().toCharArray();
        return log2ConditionalEstimate(cs,0,cs.length);
    }

    /**
     * Returns the log (base 2) estimate of the last character in the
     * specified slice given the characters before it in the slice as
     * context.  At most <code>maxNGram()-1</code> characters of
     * context are used.
     *
     * @param cs Underlying array of characters.
     * @param start Index of first character in the slice.
     * @param end Index of one past the last character in the slice.
     * @return Log (base 2) conditional estimate of the last character.
     * @throws IllegalArgumentException If the slice is empty, because
     * there is then no character to estimate.
     */
    public double log2ConditionalEstimate(char[] cs, int start, int end) {
        Strings.checkArgsStartEnd(cs,start,end);
        if (start == end) {
            // without this check cs[end-1] is read from outside the slice
            String msg = "Conditional estimates require at least one character."
                + " Found start=" + start
                + " end=" + end;
            throw new IllegalArgumentException(msg);
        }
        double total = 0.0;
        int contextEnd = end - 1;
        char c = cs[contextEnd]; // last char
        int maxContextLength = Math.min(contextEnd-start,mMaxNGram-1);
        for (int contextLength = maxContextLength;
             contextLength >= 0;
             --contextLength) {
            int contextStart = contextEnd - contextLength;
            int contextIndex = getIndex(cs,contextStart,contextEnd);
            if (contextIndex == -1) {
                continue; // no ctx, try shorter context
            }
            // no outcomes, go to shortest w. outcomes
            while (contextIndex > mLastContextIndex) {
                contextIndex = mSuffix[contextIndex];
            }
            int outcomeIndex = getIndex(contextIndex,c);
            if (outcomeIndex != -1) {
                return total + mLogProbs[outcomeIndex];
            }
            total += mLogOneMinusLambdas[contextIndex];
        }
        return total + mLogUniformEstimate;
    }

    /**
     * Returns a string-based representation of this compiled n-gram.
     * It writes one row per line of the parallel indices.  It should
     * probably not be called with very large models, as the resulting
     * string will be much larger than the model itself.
     *
     * <P>Each row holds the node index, character, suffix index and
     * log probability; rows with an index below the length of the
     * (1-lambda) array also hold the first child index and the
     * log (1-lambda) value, in that order.
     *
     * @return String-based representation of this model.
     */
    public String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("Max NGram=" + mMaxNGram);
        sb.append('\n');
        sb.append("Log2 Uniform Estimate=" + mLogUniformEstimate);
        sb.append('\n');
        // column labels are listed in the order the rows are written
        sb.append("i c suff prob firstChild 1-lambda");
        sb.append('\n');
        for (int i = 0; i < mChars.length; ++i) {
            sb.append(i);
            sb.append(" ");
            sb.append(mChars[i]);
            sb.append(" ");
            sb.append(mSuffix[i]);
            sb.append(" ");
            sb.append(mLogProbs[i]);
            if (i < mLogOneMinusLambdas.length) {
                sb.append(" ");
                sb.append(mFirstChild[i]);
                sb.append(" ");
                sb.append(mLogOneMinusLambdas[i]);
            }
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Returns the index of the child of the specified node that is
     * labeled with the specified character, found by binary search
     * over the node's range of children in <code>mChars</code>.
     * Assumes the children of a node are stored in ascending
     * character order.
     *
     * @param fromIndex Index of the parent node.
     * @param c Character labeling the child.
     * @return Index of the matching child, or <code>-1</code> if
     * there is none.
     */
    private int getIndex(int fromIndex, char c) {
        // the child range is delimited by the next node's first child,
        // so that entry must exist
        if (fromIndex+1 >= mFirstChild.length) {
            return -1;
        }
        int low = mFirstChild[fromIndex];
        int high = mFirstChild[fromIndex+1]-1;
        while (low <= high) {
            int mid = (low + high) >>> 1; // overflow-safe midpoint
            char midChar = mChars[mid];
            if (midChar == c) {
                return mid;
            } else if (midChar < c) {
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return -1;
    }

    /**
     * Returns the index of the node reached from the root by
     * following the characters in the specified slice.
     *
     * @param cs Underlying array of characters.
     * @param start Index of first character in the slice.
     * @param end Index of one past the last character in the slice.
     * @return Index of the node for the slice, the root index for an
     * empty slice, or <code>-1</code> if there is no such node.
     */
    private int getIndex(char[] cs, int start, int end) {
        int index = 0;
        for (int currentStart = start;
             currentStart < end;
             ++currentStart) {
            index = getIndex(index,cs[currentStart]);
            if (index == -1) {
                return -1;
            }
        }
        return index;
    }

    /**
     * The index of the root node, namely <code>0</code>.
     */
    public static final int ROOT_NODE_INDEX = 0;

    private static final int CACHE_NOT_COMPUTED_VALUE = -1;

}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -