📄 compiledspellchecker.java
字号:
/** * Sets the language model for this spell checker to the * specified value. * * @param lm New language model for this spell checker. */ public void setLanguageModel(CompiledNGramProcessLM lm) { mLM = lm; } /** * Sets the tokenizer factory for input processing to the * specified value. If the value is <code>null</code>, no * tokenization is performed on the input. * * @param factory Tokenizer factory for this spell checker. */ public void setTokenizerFactory(TokenizerFactory factory) { mTokenizerFactory = factory; } /** * Sets the set of tokens that can be produced by editing. * If the specified set is <code>null</code>, editing will * not be token sensitive. * * <P><i>Warning:</i> Spelling correction without tokenization may * be slow, especially with a large n-best size. * * @param tokenSet The new set of tokens or <code>null</code> if * not tokenizing. */ public final void setTokenSet(Set<String> tokenSet) { int maxLen = 0; for (String token : tokenSet) maxLen = java.lang.Math.max(maxLen,token.length()); // System.out.println("longest token=" + maxLen); mTokenSet = tokenSet; mTokenPrefixTrie = tokenSet == null ? null : prefixTrie(tokenSet); } /** * Sets The n-best size to the specified value. The n-best * size controls the number of hypotheses maintained going forward * for each character in the input. A higher value indicates * a broader and slower search for corrections. * * @param size Size of the n-best lists at each character. * @throws IllegalArgumentException If the size is less than one. */ public void setNBest(int size) { if (size < 1) { String msg = "N-best size must be greather than 0." + " Found size=" + size; throw new IllegalArgumentException(msg); } mNBestSize = size; } /** * Sets this spell checker to allow insertions if the specified * value is <code>true</code> and to disallow them if it is * <code>false</code>. If the value is <code>false</code>, then * the number of consecutive insertions allowed is also set * to zero. 
* * @param allowInsert New insertion mode. */ public void setAllowInsert(boolean allowInsert) { mAllowInsert = allowInsert; if (!allowInsert) setNumConsecutiveInsertionsAllowed(0); } /** * Sets this spell checker to allow deletions if the specified * value is <code>true</code> and to disallow them if it is * <code>false</code>. * * @param allowDelete New deletion mode. */ public void setAllowDelete(boolean allowDelete) { mAllowDelete = allowDelete; } /** * Sets this spell checker to allow matches if the specified * value is <code>true</code> and to disallow them if it is * <code>false</code>. * * @param allowMatch New match mode. */ public void setAllowMatch(boolean allowMatch) { mAllowMatch = allowMatch; } /** * Sets this spell checker to allow substitutions if the specified * value is <code>true</code> and to disallow them if it is * <code>false</code>. * * @param allowSubstitute New substitution mode. */ public void setAllowSubstitute(boolean allowSubstitute) { mAllowSubstitute = allowSubstitute; } /** * Sets this spell checker to allow transpositions if the specified * value is <code>true</code> and to disallow them if it is * <code>false</code>. * * @param allowTranspose New transposition mode. */ public void setAllowTranspose(boolean allowTranspose) { mAllowTranspose = allowTranspose; } /** * Set the number of consecutive insertions allowed to the * specified value. The value must not be negative. If the * number of insertions allowed is specified to be greater than * zero, then the allow insertion model will be set to * <code>true</code>. * * @param numAllowed Number of insertions allowed in a row. * @throws IllegalArgumentException If the number specified is * less than zero. */ public void setNumConsecutiveInsertionsAllowed(int numAllowed) { if (numAllowed < 0) { String msg = "Num insertions allowed must be >= 0." 
+ " Found numAllowed=" + numAllowed; throw new IllegalArgumentException(msg); } if (numAllowed > 0) setAllowInsert(true); mNumConsecutiveInsertionsAllowed = numAllowed; } /** * Returns a first-best hypothesis of the intended message given a * received message. This method returns <code>null</code> if the * received message is itself the best hypothesis. The exact * definition of hypothesis ranking is provided in the class * documentation above. * * @param receivedMsg The message received over the noisy channel. * @return The first-best hypothesis of the intended source * message. */ public String didYouMean(String receivedMsg) { String msg = normalizeQuery(receivedMsg); if (msg.length() == 0) return msg; DpSpellQueue queue = new DpSpellQueue(); DpSpellQueue finalQueue = new DpSpellQueue(); computeBestPaths(msg,queue,finalQueue); if (finalQueue.isEmpty()) return msg; State bestState = (State) finalQueue.pop(); //System.out.println("Winner is: "+bestState); return bestState.output().trim(); } void computeBestPaths(String msg, StateQueue queue, StateQueue finalQueue) { double[] editPenalties = editPenalties(msg); State initialState = new State(0.0,false,mTokenPrefixTrie, null, mLM.nextContext(0,' ')); addToQueue(queue,initialState,editPenalties[0]); DpSpellQueue nextQ = new DpSpellQueue(); DpSpellQueue nextQ2 = new DpSpellQueue(); for (int i = 0; i < msg.length(); ++i) { char c = msg.charAt(i); char nextC = ((i+1) < msg.length()) ? msg.charAt(i+1) : 0; Iterator it = queue.iterator(); while (it.hasNext()) { State state = (State) it.next(); if ((i+1) < msg.length()) extend2(c,nextC,state,nextQ,nextQ2,editPenalties[i]); else extend1(c,state,nextQ,editPenalties[i]); } queue = nextQ; nextQ = nextQ2; nextQ2 = new DpSpellQueue(); } extendToFinalSpace(queue,finalQueue); } /** * Returns an iterator over the n-best spelling corrections for * the specified input string. 
 * The iterator produces instances
 * of {@link ScoredObject}, the object of which is the corrected
 * string and the score of which is the joint score of edit (channel)
 * costs and language model (source) cost of the output.
 *
 * <p>Unlike for HMMs and chunking, this n-best list is not exact
 * due to pruning during spelling correction.  The maximum number
 * of returned results is determined by the n-best parameter, as
 * set through {@link #setNBest(int)}.  The larger the n-best list,
 * the higher-quality the results, even earlier on the list.
 *
 * <p>N-best spelling correction is not an exact computation
 * due to heuristic pruning during decoding.  Thus setting the
 * n-best list to a larger size may result in better n-best
 * results, even for earlier results on the list.  For instance,
 * the result of the first five corrections is not necessarily the
 * same with a 5-element, 10-element or 1000-element n-best size
 * (as specified by {@link #setNBest(int)}).
 *
 * <p>A rough confidence measure may be determined by comparing
 * the scores, which are log (base 2) edit (channel) plus log
 * (base 2) language model (source) scores.  A very crude measure
 * is to compare the score of the first result to the score of
 * the second result; if there is a large gap, confidence is high.
 * A tighter measure is to convert the log probabilities back to
 * linear, add them all up, and then divide.
 * For instance, if
 * there were results:
 *
 * <blockquote><table border="1" cellpadding="5">
 * <tr><th align="left">Rank</th>
 *     <th align="left">String</th>
 *     <th align="left">Log (2) Prob</th>
 *     <th align="left">Prob</th>
 *     <th align="left">Conf</th></tr>
 * <tr><td>0</td><td>foo</td><td>-2</td><td>0.250</td><td>0.571</td></tr>
 * <tr><td>1</td><td>for</td><td>-3</td><td>0.125</td><td>0.285</td></tr>
 * <tr><td>2</td><td>food</td><td>-4</td><td>0.062</td><td>0.143</td></tr>
 * <tr><td>3</td><td>of</td><td>-10</td><td>0.001</td><td>0.002</td></tr>
 * </table></blockquote>
 *
 * Here there are four results, with log probabilities -2, -3,
 * -4 and -10, which have the corresponding linear probabilities.
 * The sum of these probabilities is 0.438.  Hence the confidence
 * in the top-ranked answer is 0.250/0.438=0.571.
 *
 * <p><b>Warning:</b> Spell checking with n-best output is
 * currently implemented with a very naive algorithm and is
 * thus very slow compared to first-best spelling correction.
 * The reason for this is that the dynamic programming
 * is turned off for n-best spelling correction, hence a lot of
 * redundant computation is done.
 *
 * @param receivedMsg Input message.
 * @return Iterator over n-best spelling suggestions.
*/
public Iterator<ScoredObject<String>> didYouMeanNBest(String receivedMsg) {
    String msg = normalizeQuery(receivedMsg);
    // An empty normalized message yields a single result: the empty
    // string with score 0.
    if (msg.length() == 0)
        return new Iterators.Singleton(new ScoredObject("",0));
    StateQueue queue = new NBestSpellQueue();
    StateQueue finalQueue = new NBestSpellQueue();
    computeBestPaths(msg,queue,finalQueue);
    // Re-rank the final states by score, keeping at most mNBestSize.
    BoundedPriorityQueue<ScoredObject<String>> resultQueue
        = new BoundedPriorityQueue<ScoredObject<String>>(ScoredObject
                                                         .SCORE_COMPARATOR,
                                                         mNBestSize);
    Iterator it = finalQueue.iterator();
    while (it.hasNext()) {
        State state = (State) it.next();
        resultQueue.add(new ScoredObject<String>(state.output().trim(),
                                                 state.score()));
    }
    return resultQueue.iterator();
}

/**
 * Returns <code>true</code> if the length of the specified token is
 * less than or equal to <code>mMinimumTokenLengthToCorrect</code>.
 *
 * @param token Token to test.
 * @return <code>true</code> if the token counts as short.
 */
private boolean isShortToken(String token) {
    return token.length() <= mMinimumTokenLengthToCorrect;
}

/**
 * Builds an array with one penalty per character of the specified
 * message, initialized to zero.  If no token set has been
 * configured, the all-zero array is returned immediately.
 * Otherwise the message is scanned for token starts (index zero or
 * any index preceded by a space), each token extending to the next
 * space or the end of the message.  Tokens found in
 * <code>mDoNotEditTokens</code>, or that are short according to
 * {@link #isShortToken(String)}, have their characters and the
 * adjacent spaces set to negative infinity.  Other tokens found in
 * <code>mTokenSet</code> have <code>mKnownTokenEditCost</code>
 * added at those same positions.
 *
 * @param msg Message for which to compute penalties.
 * @return Array of penalties with one entry per character of the
 * message.
 */
private double[] editPenalties(String msg) {
    double[] penalties = new double[msg.length()];
    Arrays.fill(penalties,0.0);
    if (mTokenSet == null) return penalties;
    // NOTE(review): charPosition and c below are not read in the part
    // of this method shown here -- confirm whether they are needed.
    int charPosition = 0;
    for (int i = 0; i < penalties.length; ++i) {
        char c = msg.charAt(i);
        // A token starts at index 0 or right after a space.  The null
        // test is redundant given the early return above.
        if ((mTokenSet != null)
            && ((i == 0) || (msg.charAt(i-1) == ' '))) {
            int endIndex = msg.indexOf(' ', i);
            if (endIndex == -1)
                endIndex = msg.length();
            String token = msg.substring(i,endIndex);
            if (mDoNotEditTokens.contains(token)
                || isShortToken(token)) {
                // penalize space before
                if (i > 0) {
                    penalties[i-1] = Double.NEGATIVE_INFINITY;
                }
                // penalize chars within
                for (int j = i; j < endIndex; ++j) {
                    penalties[j] = Double.NEGATIVE_INFINITY;
                }
                // penalize space after (may get double penalized,
                // because it is also the space before the next token)
                if (endIndex < penalties.length) {
                    penalties[endIndex] = Double.NEGATIVE_INFINITY;
                }
            } else if (mTokenSet.contains(token)) {
                // penalize space before
                if (i > 0) {
                    penalties[i-1] += mKnownTokenEditCost;
                }
                // penalize chars within
                for (int j = i; j < endIndex; ++j) {
                    penalties[j] += mKnownTokenEditCost;
                }
                // penalize space after (may get double penalized,
                // because it is also the space before the next token)
                if (endIndex < penalties.length) {
                    penalties[endIndex] += mKnownTokenEditCost;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -