📄 ngramboundarylm.java
字号:
BitInput bitIn = new BitInput(in);
        // The value read is one greater than the boundary character
        // (note the -1L), so it is shifted back before the cast.
        char boundaryChar = (char) (bitIn.readDelta()-1L);
        NGramProcessLM processLM = NGramProcessLM.readFrom(bitIn);
        return new NGramBoundaryLM(processLM,boundaryChar);
    }

    /**
     * Returns the underlying n-gram process language model
     * for this boundary language model.  Changes to the returned
     * model affect this language model.
     *
     * @return The underlying process language model.
     */
    public NGramProcessLM getProcessLM() {
        return mProcessLM;
    }

    /**
     * Returns the characters that have been observed for this
     * language model, including the special boundary character.
     *
     * @return The observed characters for this language model.
     */
    public char[] observedCharacters() {
        return mProcessLM.observedCharacters();
    }

    /**
     * Returns the underlying substring counter for this language
     * model.  This model may be pruned by pruning the counter
     * returned by this method.
     *
     * @return The underlying substring counter for this language model.
     */
    public TrieCharSeqCounter substringCounter() {
        return mProcessLM.substringCounter();
    }

    /**
     * Writes a compiled version of this boundary language model to
     * the specified object output.  The result may be read back in
     * by casting the result of {@link ObjectInput#readObject()} to
     * {@link CompiledNGramBoundaryLM}.
     *
     * @param objOut Object output to which this model is compiled.
     * @throws IOException If there is an I/O exception during the
     * write.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Trains this model on the specified character sequence with the
     * specified count.  The sequence is wrapped in boundary characters
     * and handed to the underlying process model; the unigram count of
     * the boundary character is then decremented by the same count so
     * that the initial boundary is not counted.
     *
     * @param cs Character sequence to train on.
     * @param count Count with which the sequence is trained.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    public void train(CharSequence cs, int count) {
        char[] csBounded = addBoundaries(cs,mBoundaryChar);
        mProcessLM.train(csBounded,0,csBounded.length,count);
        // don't count initial boundary
        mProcessLM.decrementUnigram(mBoundaryChar,count);
    }

    /**
     * Trains this model on the specified character sequence with a
     * count of one.
     *
     * @param cs Character sequence to train on.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    public void train(CharSequence cs) {
        train(cs,1);
    }

    /**
     * Trains this model on the specified slice of a character array
     * with a count of one.
     *
     * @param cs Underlying characters.
     * @param start Index of the first character of the slice.
     * @param end Index one past the last character of the slice.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character or if the end index is less than the start
     * index.
     */
    public void train(char[] cs, int start, int end) {
        train(cs,start,end,1);
    }

    /**
     * Trains this model on the specified slice of a character array
     * with the specified count.  The slice is wrapped in boundary
     * characters and handed to the underlying process model; the
     * unigram count of the boundary character is then decremented by
     * the same count, as in {@link #train(CharSequence,int)}.
     *
     * @param cs Underlying characters.
     * @param start Index of the first character of the slice.
     * @param end Index one past the last character of the slice.
     * @param count Count with which the slice is trained.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character or if the end index is less than the start
     * index.
     */
    public void train(char[] cs, int start, int end, int count) {
        char[] csBounded = addBoundaries(cs,start,end,mBoundaryChar);
        mProcessLM.train(csBounded,0,csBounded.length,count);
        // don't count initial boundary
        mProcessLM.decrementUnigram(mBoundaryChar,count);
    }

    /**
     * Returns the conditional estimate of the underlying process model
     * for the specified sequence preceded by the boundary character.
     * The trailing boundary character is excluded from the range
     * passed to the process model.
     *
     * @param cs Character sequence to estimate.
     * @return The process model's log (base 2) conditional estimate.
     * @throws IllegalArgumentException If the sequence is empty or
     * contains the boundary character.
     */
    public double log2ConditionalEstimate(CharSequence cs) {
        if (cs.length() < 1) {
            String msg = "Conditional estimate must be at least one character.";
            throw new IllegalArgumentException(msg);
        }
        char[] csBounded = addBoundaries(cs,mBoundaryChar);
        // length-1 drops the final boundary from the estimated range
        return mProcessLM.log2ConditionalEstimate(csBounded,0,csBounded.length-1);
    }

    /**
     * Returns the conditional estimate of the underlying process model
     * for the specified array slice preceded by the boundary character.
     * The trailing boundary character is excluded from the range
     * passed to the process model.
     *
     * @param cs Underlying characters.
     * @param start Index of the first character of the slice.
     * @param end Index one past the last character of the slice.
     * @return The process model's log (base 2) conditional estimate.
     * @throws IllegalArgumentException If the slice is empty or
     * contains the boundary character.
     */
    public double log2ConditionalEstimate(char[] cs, int start, int end) {
        if (end <= start) {
            String msg = "Conditional estimate must be at least one character.";
            throw new IllegalArgumentException(msg);
        }
        char[] csBounded = addBoundaries(cs,start,end,mBoundaryChar);
        // length-1 drops the final boundary from the estimated range
        return mProcessLM.log2ConditionalEstimate(csBounded,0,csBounded.length-1);
    }

    /**
     * Returns the log (base 2) estimate for the specified character
     * sequence.  The value is the process model's estimate of the
     * sequence wrapped in boundary characters, minus the process
     * model's estimate of the first character of the boundary array.
     *
     * @param cs Character sequence to estimate.
     * @return The log (base 2) estimate of the sequence.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    public double log2Estimate(CharSequence cs) {
        char[] csBounded = addBoundaries(cs,mBoundaryChar);
        // NOTE(review): mBoundaryArray presumably holds just the boundary
        // character, so the subtraction discounts the initial boundary.
        return mProcessLM.log2Estimate(csBounded,0,csBounded.length)
            - mProcessLM.log2Estimate(mBoundaryArray,0,1);
    }

    /**
     * Returns the log (base 2) estimate for the specified slice of a
     * character array.  The value is the process model's estimate of
     * the slice wrapped in boundary characters, minus the process
     * model's estimate of the first character of the boundary array.
     *
     * @param cs Underlying characters.
     * @param start Index of the first character of the slice.
     * @param end Index one past the last character of the slice.
     * @return The log (base 2) estimate of the slice.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character or if the end index is less than the start
     * index.
     */
    public double log2Estimate(char[] cs, int start, int end) {
        char[] csBounded = addBoundaries(cs,start,end,mBoundaryChar);
        // NOTE(review): mBoundaryArray presumably holds just the boundary
        // character, so the subtraction discounts the initial boundary.
        return mProcessLM.log2Estimate(csBounded,0,csBounded.length)
            - mProcessLM.log2Estimate(mBoundaryArray,0,1);
    }

    /**
     * This method is a convenience implementation of the {@link
     * Model} interface which delegates the call to {@link
     * #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The log (base 2) probability of the specified character sequence.
     */
    public double log2Prob(CharSequence cSeq) {
        return log2Estimate(cSeq);
    }

    /**
     * This method is a convenience implementation of the {@link Model}
     * interface which returns the result of raising 2.0 to the
     * power of the result of a call to {@link #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The probability of the specified character sequence.
     */
    public double prob(CharSequence cSeq) {
        return Math.pow(2.0,log2Estimate(cSeq));
    }

    /**
     * Returns a string-based representation of this language model.
     * It displays the boundary character and the contained
     * process language model.
     *
     * @return A string-based representation of this language model.
     */
    public String toString() {
        StringBuffer sb = new StringBuffer();
        // the boundary character is shown as its integer code
        sb.append("Boundary char=").append((int) mBoundaryChar);
        sb.append('\n');
        mProcessLM.toStringBuffer(sb);
        return sb.toString();
    }

    /**
     * Returns a new array holding the specified character sequence
     * with the boundary character added before and after it.
     *
     * @param cs Character sequence to wrap.
     * @param boundaryChar Boundary character.
     * @return Array of length <code>cs.length()+2</code> containing the
     * bounded sequence.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    static char[] addBoundaries(CharSequence cs, char boundaryChar) {
        int len = cs.length();
        char[] cs2 = new char[len + 2];
        for (int i = 0; i < len; ++i) {
            char c = cs.charAt(i);
            checkNotBoundary(c,boundaryChar,i);
            cs2[i+1] = c;
        }
        addBoundaryChars(cs2,boundaryChar);
        return cs2;
    }

    /**
     * Returns a new array holding the specified slice of a character
     * array with the boundary character added before and after it.
     *
     * <p>The result is sized from the slice length rather than from
     * the length of the underlying array, so the closing boundary
     * neither overwrites the last character of the slice nor leaves
     * unset padding characters inside the result.
     *
     * @param cs Underlying characters.
     * @param start Index of the first character of the slice.
     * @param end Index one past the last character of the slice.
     * @param boundaryChar Boundary character.
     * @return Array of length <code>(end-start)+2</code> containing the
     * bounded slice.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character or if the end index is less than the start
     * index.
     */
    static char[] addBoundaries(char[] cs, int start, int end,
                                char boundaryChar) {
        if (end < start) {
            String msg = "End index must not be less than start index."
                + " Found start=" + start
                + " end=" + end;
            throw new IllegalArgumentException(msg);
        }
        int len = end - start;
        char[] cs2 = new char[len + 2];
        for (int i = 0; i < len; ++i) {
            char c = cs[i+start];
            // the reported index is relative to the underlying array
            checkNotBoundary(c,boundaryChar,i+start);
            cs2[i+1] = c;
        }
        addBoundaryChars(cs2,boundaryChar);
        return cs2;
    }

    /**
     * Throws an exception if the specified character is the boundary
     * character.
     *
     * @param c Character to check.
     * @param boundaryChar Boundary character.
     * @param index Index reported in the exception message.
     * @throws IllegalArgumentException If the character equals the
     * boundary character.
     */
    private static void checkNotBoundary(char c, char boundaryChar,
                                         int index) {
        if (c == boundaryChar) {
            String msg = "Estimated string cannot contain boundary char."
                + " Found boundary char=" + c
                + " at index=" + index;
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Sets the first and last elements of the specified array to the
     * boundary character.
     *
     * @param cs Array to modify in place.
     * @param boundaryChar Boundary character.
     */
    static void addBoundaryChars(char[] cs, char boundaryChar) {
        cs[0] = boundaryChar;
        cs[cs.length-1] = boundaryChar;
    }

    /**
     * Externalizable helper written by {@link #compileTo(ObjectOutput)}.
     * Writing emits the boundary character followed by the compiled
     * process model; reading constructs a {@link CompiledNGramBoundaryLM}
     * from the object input.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = -7945082563035787530L;
        final NGramBoundaryLM mLM;

        /**
         * Constructs an externalizer with no model.
         * NOTE(review): presumably only used on the read side, where
         * the model is not consulted -- confirm.
         */
        public Externalizer() {
            this(null);
        }

        /**
         * Constructs an externalizer for the specified model.
         *
         * @param lm Model to be written.
         */
        public Externalizer(NGramBoundaryLM lm) {
            mLM = lm;
        }

        /**
         * Reads a compiled boundary language model from the specified
         * object input.
         *
         * @param objIn Object input from which the model is read.
         * @return The compiled model.
         * @throws IOException If there is an I/O exception during the
         * read.
         */
        public Object read(ObjectInput objIn) throws IOException {
            return new CompiledNGramBoundaryLM(objIn);
        }

        /**
         * Writes the boundary character and then the compiled form of
         * the process model to the specified object output.
         *
         * @param objOut Object output to which the model is written.
         * @throws IOException If there is an I/O exception during the
         * write.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeChar(mLM.mBoundaryChar);
            mLM.mProcessLM.compileTo(objOut);
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -