📄 compiledngramprocesslm.java
字号:
return result;
    }

    /**
     * Returns the maximum length n-gram used for this compiled
     * language model.  The maximum amount of context used for
     * estimates will be one less than this.
     *
     * @return The maximum length n-gram counted in this language
     * model.
     */
    public int maxNGram() {
        return mMaxNGram;
    }

    /**
     * Returns the total number of nodes in this language model's trie
     * structure.  This represents the total number of sequences for
     * which there are precompiled conditional probability estimates.
     *
     * @return The total number of nodes in the underlying trie
     * structure.
     */
    public int numNodes() {
        return mChars.length;
    }

    /**
     * Returns the index, in the underlying parallel array structure,
     * of the maximum length suffix of the specified string that is
     * defined as a context.
     *
     * @param context String of context.
     * @return Index of the maximum length suffix of the specified
     * context, or <code>0</code> if no non-empty suffix is found.
     */
    public int longestContextIndex(String context) {
        char[] cs = context.toCharArray();
        int length = cs.length;
        // Try suffixes from longest (i == 0) to shortest.
        for (int i = 0; i < length; ++i) {
            int k = getIndex(cs,i,length);
            if (k >= 0) {
                // Indices at or past the end of the lambda array carry no
                // interpolation weight; follow suffix links until one does.
                while (k >= mLogOneMinusLambdas.length) {
                    k = mSuffix[k];
                }
                return k;
            }
        }
        return 0; // back off all the way
    }

    /**
     * Returns the length of the first-child index array.
     *
     * @return Length of the first-child array.
     */
    int numInternalNodes() {
        return mFirstChild.length;
    }

    /**
     * Records the suffix index for the node at the specified index and
     * then recursively does the same for each of its children.
     *
     * @param context Character sequence spelled out by the path to the node.
     * @param index Index of the node.
     */
    private void compileSuffixes(String context, int index) {
        mSuffix[index] = suffixIndex(context);
        if (index >= mFirstChild.length) {
            return; // no first-child entry, so no children to visit
        }
        int firstChildIndex = mFirstChild[index];
        // Children run up to the next node's first child, or to the end
        // of the node array when this is the last first-child entry.
        int lastChildIndex
            = index+1 < mFirstChild.length
            ? mFirstChild[index+1]
            : mChars.length;
        for (int i = firstChildIndex; i < lastChildIndex; ++i) {
            compileSuffixes(context + mChars[i], i);
        }
    }

    /**
     * Returns the index of the node for the specified context with its
     * first character removed.
     *
     * @param context Context whose suffix is looked up.
     * @return Index of the suffix node, or <code>-1</code> if the context
     * is empty or the suffix has no node.
     */
    private int suffixIndex(String context) {
        if (context.length() < 1) {
            return -1;
        }
        char[] cs = context.substring(1).toCharArray();
        return getIndex(cs,0,cs.length);
    }

    /**
     * This method is a convenience implementation of the {@link
     * Model} interface which delegates the call to {@link
     * #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The log (base 2) probability of the specified character sequence.
     */
    public double log2Prob(CharSequence cSeq) {
        return log2Estimate(cSeq);
    }

    /**
     * This method is a convenience implementation of the {@link Model}
     * interface which returns the result of raising 2.0 to the
     * power of the result of a call to {@link #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The probability of the specified character sequence.
     */
    public double prob(CharSequence cSeq) {
        return Math.pow(2.0,log2Estimate(cSeq));
    }

    /**
     * Returns the log (base 2) estimate of the specified character
     * sequence by delegating to {@link #log2Estimate(char[],int,int)}
     * over the whole sequence.
     *
     * @param cSeq Character sequence to estimate.
     * @return Log (base 2) estimate of the character sequence.
     */
    public final double log2Estimate(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        return log2Estimate(cs,0,cs.length);
    }

    /**
     * Returns the log (base 2) estimate of the specified character in
     * the context with the specified index.  This corresponds to
     * values returned by the conditional estimates when the context
     * and outcome character are specified in a single character
     * sequence or slice.
     *
     * <P>The main use of this method is to incrementally compute
     * conditional estimates and contexts, in conjunction with the
     * method {@link #nextContext(int,char)}.
     *
     * @param contextIndex Index of context of estimate.
     * @param nextChar Character being estimated.
     * @return Log (base 2) estimate of character in context.
     */
    public final double log2Estimate(int contextIndex, char nextChar) {
        double sum = 0.0;
        int outcomeIndex;
        // Back off through suffix links until the character is found as an
        // outcome, accumulating the backoff weight of each context left.
        for (int currentContextIndex = contextIndex;
             (outcomeIndex = getIndex(currentContextIndex,nextChar)) < 0;
             currentContextIndex = mSuffix[currentContextIndex]) {

            if (currentContextIndex < mLogOneMinusLambdas.length) {
                sum += mLogOneMinusLambdas[currentContextIndex];
            }
            if (currentContextIndex == ROOT_NODE_INDEX) {
                // Not found even at the root: use the uniform estimate.
                return sum + mLogUniformEstimate;
            }
        }
        return sum + mLogProbs[outcomeIndex];
    }

    /**
     * Returns the index of the context formed by appending the
     * specified character to the context of the specified index.  The
     * main use of this method is to incrementally compute conditional
     * estimates and contexts, in conjunction with the method {@link
     * #log2Estimate(int,char)}.
     *
     * <P>Note that the index of the root node is always <code>0</code>.
     *
     * @param contextIndex Index of present context.
     * @param nextChar Next character.
     * @return Index of context formed by appending next character to
     * the present context.
     * @throws IllegalArgumentException If the context index is less
     * than zero or greater than the last context index.
     */
    public int nextContext(int contextIndex, char nextChar) {
        if (contextIndex < 0 || contextIndex > mLastContextIndex) {
            String msg = "Context must be greater than or equal to zero."
                + " Context must be less than or equal to last index="
                + mLastContextIndex
                + " Context=" + contextIndex;
            throw new IllegalArgumentException(msg);
        }
        for (int currentContextIndex = contextIndex;
             true;
             currentContextIndex = mSuffix[currentContextIndex]) {

            int outcomeIndex = getIndex(currentContextIndex,nextChar);
            // Accept the extended node only if it has a lambda entry.
            if (outcomeIndex >= 0
                && outcomeIndex < mLogOneMinusLambdas.length) {
                return outcomeIndex;
            }
            if (currentContextIndex == ROOT_NODE_INDEX) {
                return ROOT_NODE_INDEX; // can't go back further
            }
        }
    }

    /**
     * Returns the log (base 2) estimate of the specified slice, computed
     * as the sum of the estimates of each character given the characters
     * before it in the slice.
     *
     * @param cs Underlying array of characters.
     * @param start Index of first character in the slice.
     * @param end Index of one past the last character in the slice.
     * @return Log (base 2) estimate of the slice.
     */
    public final double log2Estimate(char[] cs, int start, int end) {
        int len = mLogOneMinusLambdas.length;
        // NOTE(review): presumably rejects out-of-range start/end; confirm
        // against the Strings helper.
        Strings.checkArgsStartEnd(cs,start,end);
        double sum = 0.0;
        int contextIndex = ROOT_NODE_INDEX;
        NEXT_CHAR:
        for (int i = start; i < end; ++i) {
            char nextChar = cs[i];
            int outcomeIndex;
            while ((outcomeIndex = getIndex(contextIndex,nextChar)) < 0) {
                if (contextIndex < len) {
                    sum += mLogOneMinusLambdas[contextIndex];
                }
                if (contextIndex == ROOT_NODE_INDEX) {
                    // Character not found even at the root: charge the
                    // uniform estimate and continue from the root context.
                    sum += mLogUniformEstimate;
                    continue NEXT_CHAR;
                }
                contextIndex = mSuffix[contextIndex]; // backoff until end
            }
            sum += mLogProbs[outcomeIndex];
            // Extend the context with the outcome if it has a lambda entry;
            // otherwise continue from its suffix.
            contextIndex = outcomeIndex < len
                ? outcomeIndex
                : mSuffix[outcomeIndex];
        }
        return sum;
    }

    /**
     * Returns the log (base 2) conditional estimate of the last character
     * of the specified sequence given the characters that precede it.
     *
     * @param cSeq Character sequence; must contain at least one character.
     * @return Log (base 2) conditional estimate of the last character.
     * @throws IllegalArgumentException If the sequence is empty.
     */
    public double log2ConditionalEstimate(CharSequence cSeq) {
        char[] cs = cSeq.toString().toCharArray();
        return log2ConditionalEstimate(cs,0,cs.length);
    }

    /**
     * Returns the log (base 2) conditional estimate of the last character
     * of the specified slice given the characters that precede it in the
     * slice.
     *
     * @param cs Underlying array of characters.
     * @param start Index of first character in the slice.
     * @param end Index of one past the last character in the slice.
     * @return Log (base 2) conditional estimate of the last character.
     * @throws IllegalArgumentException If the slice is empty.
     */
    public double log2ConditionalEstimate(char[] cs, int start, int end) {
        Strings.checkArgsStartEnd(cs,start,end);
        if (end <= start) {
            // Without this guard cs[end-1] would read outside the slice.
            String msg = "Conditional estimates require at least one character."
                + " Found start=" + start
                + " end=" + end;
            throw new IllegalArgumentException(msg);
        }
        double total = 0.0;
        int contextEnd = end - 1;
        char c = cs[contextEnd]; // last char
        int maxContextLength = Math.min(contextEnd-start,mMaxNGram-1);
        // Try contexts from longest to shortest (the empty context last).
        for (int contextLength = maxContextLength;
             contextLength >= 0;
             --contextLength) {

            int contextStart = contextEnd - contextLength;
            int contextIndex = getIndex(cs,contextStart,contextEnd);
            if (contextIndex == -1) {
                continue; // no ctx, try shorter context
            }
            // No outcomes past the last context index; go to the longest
            // suffix that is at or below it.
            while (contextIndex > mLastContextIndex) {
                contextIndex = mSuffix[contextIndex];
            }
            int outcomeIndex = getIndex(contextIndex,c);
            if (outcomeIndex != -1) {
                return total + mLogProbs[outcomeIndex];
            }
            total += mLogOneMinusLambdas[contextIndex];
        }
        return total + mLogUniformEstimate;
    }

    /**
     * Returns a string-based representation of this compiled n-gram.
     * It writes one row per line of the parallel indices.  It should
     * probably not be called with very large models, as the resulting
     * string will be much larger than the model itself.
     *
     * @return String-based representation of this model.
     */
    public String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("Max NGram=" + mMaxNGram);
        sb.append('\n');
        sb.append("Log2 Uniform Estimate=" + mLogUniformEstimate);
        sb.append('\n');
        // Column order matches the order in which each row is written.
        sb.append("i c suff prob firstChild 1-lambda");
        sb.append('\n');
        for (int i = 0; i < mChars.length; ++i) {
            sb.append(i);
            sb.append(" ");
            sb.append(mChars[i]);
            sb.append(" ");
            sb.append(mSuffix[i]);
            sb.append(" ");
            sb.append(mLogProbs[i]);
            // Only rows with a lambda entry get the last two columns.
            if (i < mLogOneMinusLambdas.length) {
                sb.append(" ");
                sb.append(mFirstChild[i]);
                sb.append(" ");
                sb.append(mLogOneMinusLambdas[i]);
            }
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Returns the index of the child of the specified node that is
     * labeled with the specified character.  The binary search relies
     * on the characters of a node's children being stored in ascending
     * order.
     *
     * @param fromIndex Index of the parent node.
     * @param c Character labeling the child.
     * @return Index of the child, or <code>-1</code> if there is none.
     */
    private int getIndex(int fromIndex, char c) {
        if (fromIndex+1 >= mFirstChild.length) {
            return -1;
        }
        int low = mFirstChild[fromIndex];
        int high = mFirstChild[fromIndex+1]-1;
        while (low <= high) {
            int mid = low + (high - low)/2; // overflow-safe midpoint
            char midChar = mChars[mid];
            if (midChar == c) {
                return mid;
            }
            if (midChar < c) {
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return -1;
    }

    /**
     * Returns the index of the node reached from the root by following
     * the characters of the specified slice in order.
     *
     * @param cs Underlying array of characters.
     * @param start Index of first character in the slice.
     * @param end Index of one past the last character in the slice.
     * @return Index of the node for the slice, <code>0</code> for an
     * empty slice, or <code>-1</code> if some character has no
     * matching child.
     */
    private int getIndex(char[] cs, int start, int end) {
        int index = 0;
        for (int currentStart = start;
             currentStart < end;
             ++currentStart) {

            index = getIndex(index,cs[currentStart]);
            if (index == -1) {
                return -1;
            }
        }
        return index;
    }

    /**
     * The index of the root node, namely <code>0</code>.
     */
    public static final int ROOT_NODE_INDEX = 0;

    private static final int CACHE_NOT_COMPUTED_VALUE = -1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -