📄 compiledspellchecker.java
字号:
} } } // closes three scopes opened before this excerpt begins
            if (c == ' ') {
                charPosition = 0;
                // this'll also affect first by making it non-first
                penalties[i] += mFirstCharEditCost;
            } else if (charPosition == 0) {
                penalties[i] += mFirstCharEditCost;
                ++charPosition;
            } else if (charPosition == 1) {
                penalties[i] += mSecondCharEditCost;
                ++charPosition;
            }
        }
        // for (int i = 0; i < penalties.length; ++i)
        //    System.out.println(" " + msg.charAt(i) + "=" + penalties[i]);
        //
        return penalties;
    }

    /**
     * Returns a string-based representation of the parameters of
     * this compiled spell checker.
     *
     * <P>The report lists the n-best size, token sensitivity, the
     * set of allowed edits, the edit costs, and finally the string
     * forms of the edit distance and the tokenizer factory.
     *
     * @return A string representing the parameters of this spell
     * checker.
     */
    public String parametersToString() {
        StringBuffer sb = new StringBuffer();

        sb.append("SEARCH");
        sb.append("\n N-best size=" + mNBestSize);

        sb.append("\n\nTOKEN SENSITIVITY");
        sb.append("\n Token sensitive=" + (mTokenSet != null));
        if (mTokenSet != null) {
            sb.append("\n # Known Tokens=" + mTokenSet.size());
        }

        sb.append("\n\nEDITS ALLOWED");
        sb.append("\n Allow insert=" + mAllowInsert);
        sb.append("\n Allow delete=" + mAllowDelete);
        sb.append("\n Allow match=" + mAllowMatch);
        sb.append("\n Allow substitute=" + mAllowSubstitute);
        sb.append("\n Allow transpose=" + mAllowTranspose);
        sb.append("\n Num consecutive insertions allowed="
                  + mNumConsecutiveInsertionsAllowed);
        sb.append("\n Minimum Length Token Edit="
                  + mMinimumTokenLengthToCorrect);
        sb.append("\n # of do-not-Edit Tokens="
                  + mDoNotEditTokens.size());

        sb.append("\n\nEDIT COSTS");
        sb.append("\n Edit Distance=" + mEditDistance);
        sb.append("\n Known Token Edit Cost=" + mKnownTokenEditCost);
        sb.append("\n First Char Edit Cost=" + mFirstCharEditCost);
        sb.append("\n Second Char Edit Cost=" + mSecondCharEditCost);

        sb.append("\n\nEDIT DISTANCE\n");
        sb.append(mEditDistance);

        sb.append("\n\nTOKENIZER FACTORY\n");
        sb.append(mTokenizerFactory);

        return sb.toString();
    }

    /**
     * Returns a normalized form of the specified query.
     *
     * <P>If no tokenizer factory is set, normalization is delegated
     * to <code>Strings.normalizeWhitespace</code>.  Otherwise the
     * query is tokenized and the tokens are joined with single
     * spaces.
     *
     * @param query Query to normalize.
     * @return The normalized query.
     */
    String normalizeQuery(String query) {
        StringBuffer sb = new StringBuffer();
        if (mTokenizerFactory == null) {
            Strings.normalizeWhitespace(query,sb);
        } else {
            char[] cs = query.toCharArray();
            Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
            String nextToken;
            for (int i = 0; (nextToken = tokenizer.nextToken()) != null; ++i) {
                if (i > 0) sb.append(' '); // single space between tokens
                sb.append(nextToken);
            }
        }
        return sb.toString();
    }

    // set this locally if tokenset == null
    /**
     * Returns the characters observed by the underlying language
     * model, as reported by <code>mLM.observedCharacters()</code>.
     *
     * @return The language model's observed characters.
     */
    char[] observedCharacters() {
        return mLM.observedCharacters();
    }

    /**
     * Extends every state in the specified queue with a final space
     * and adds the result to the final queue.
     *
     * <P>States whose token was edited but is not complete are
     * dropped, as are states whose score after adding the language
     * model estimate for the space is negative infinity.
     *
     * @param queue Queue of states to extend.
     * @param finalQueue Queue receiving the extended states.
     */
    void extendToFinalSpace(StateQueue queue, StateQueue finalQueue) {
        Iterator stateIt = queue.iterator();
        while (stateIt.hasNext()) {
            State state = (State) stateIt.next();
            if (state.mTokenEdited && !state.tokenComplete()) {
                continue;  // delete if final edited to non-token
            }
            double nextScore = state.mScore
                + mLM.log2Estimate(state.mContextIndex,' ');
            if (nextScore == Double.NEGATIVE_INFINITY) {
                continue;
            }
            State nextState
                = new State(nextScore,false,
                            null,
                            state,
                            -1);
            finalQueue.addState(nextState);
        }
    }

    /**
     * Extends the specified state over the character pair
     * <code>c1,c2</code>: first all single character extensions for
     * <code>c1</code> into <code>nextQ</code>, then, if edits are
     * permitted at this position and transposition is allowed, a
     * transposition of the pair into <code>nextQ2</code>.
     *
     * @param c1 First input character.
     * @param c2 Second input character.
     * @param state State to extend.
     * @param nextQ Queue for states that consume one character.
     * @param nextQ2 Queue for states that consume two characters.
     * @param positionalEditPenalty Penalty added to edits at this
     * position; negative infinity disables all edits.
     */
    void extend2(char c1, char c2, State state,
                 DpSpellQueue nextQ, DpSpellQueue nextQ2,
                 double positionalEditPenalty) {
        extend1(c1,state,nextQ,positionalEditPenalty);
        if (positionalEditPenalty == Double.NEGATIVE_INFINITY) return;
        if (allowTranspose())
            transpose(c1,c2,state,nextQ2,positionalEditPenalty);
    }

    /**
     * Extends the specified state over the single input character
     * <code>c</code>.  A match is tried first if matching is
     * allowed; substitution and deletion are only tried if they are
     * allowed and the positional penalty is not negative infinity.
     *
     * @param c Input character.
     * @param state State to extend.
     * @param nextQ Queue receiving the extended states.
     * @param positionalEditPenalty Penalty added to edits at this
     * position; negative infinity disables all edits.
     */
    void extend1(char c, State state, DpSpellQueue nextQ,
                 double positionalEditPenalty) {
        if (allowMatch())
            match(c,state,nextQ,positionalEditPenalty);
        if (positionalEditPenalty == Double.NEGATIVE_INFINITY) return;
        if (allowSubstitute())
            substitute(c,state,nextQ,positionalEditPenalty);
        if (allowDelete())
            delete(c,state,nextQ,positionalEditPenalty);
    }

    /**
     * Adds the specified state to the queue with an insertion count
     * of zero.  See the four argument form for details.
     *
     * @param queue Queue to which the state is added.
     * @param state State to add.
     * @param positionalEditPenalty Penalty added to edits at this
     * position.
     */
    void addToQueue(StateQueue queue, State state,
                    double positionalEditPenalty) {
        addToQueue(queue,state,0,positionalEditPenalty);
    }

    /**
     * Adds the specified state to the queue and, if the queue
     * accepted it, attempts to follow it with insertions.
     *
     * <P>No insertions are attempted if the queue rejects the state,
     * if the number of consecutive insertions has reached the
     * configured limit, or if the positional penalty is negative
     * infinity.
     *
     * @param queue Queue to which the state is added.
     * @param state State to add.
     * @param numInserts Number of consecutive insertions leading to
     * the state.
     * @param positionalEditPenalty Penalty added to edits at this
     * position.
     */
    void addToQueue(StateQueue queue, State state, int numInserts,
                    double positionalEditPenalty) {
        if (!queue.addState(state)) return;
        if (numInserts >= mNumConsecutiveInsertionsAllowed) return;
        if (positionalEditPenalty == Double.NEGATIVE_INFINITY) return;
        insert(state,queue,numInserts,positionalEditPenalty);
    }

    /**
     * Returns the daughter of the specified trie node for the
     * specified character, or <code>null</code> if the node is
     * <code>null</code>.
     *
     * @param node Trie node, possibly <code>null</code>.
     * @param c Character selecting the daughter.
     * @return The result of <code>node.daughter(c)</code>, or
     * <code>null</code> for a <code>null</code> node.
     */
    TokenTrieNode daughter(TokenTrieNode node, char c) {
        return node == null ? null : node.daughter(c);
    }

    /**
     * Extends the specified state by keeping the input character
     * <code>c</code> unchanged.
     *
     * <P>If the state's current token has been edited, a space is
     * only accepted when the token is complete, and any other
     * character only when the state may be continued by it.
     *
     * @param c Input character to match.
     * @param state State to extend.
     * @param nextQ Queue receiving the extended state.
     * @param positionalEditPenalty Penalty passed on for follow-up
     * insertions; it is not added to the match score.
     */
    void match(char c, State state, DpSpellQueue nextQ,
               double positionalEditPenalty) {
        if (state.mTokenEdited) {
            if (c == ' ') {
                if (!state.tokenComplete()) {
                    return;
                }
            } else if (!state.continuedBy(c)) {
                return;
            }
        }
        double score = state.mScore
            + mLM.log2Estimate(state.mContextIndex,c)
            + mEditDistance.matchWeight(c);
        if (score == Double.NEGATIVE_INFINITY) return;
        // a space restarts at the root of the token prefix trie
        TokenTrieNode tokenTrieNode
            = (c == ' ')
            ? mTokenPrefixTrie
            : daughter(state.mTokenTrieNode,c);
        addToQueue(nextQ,
                   new State1(score,
                              (c != ' ') && state.mTokenEdited,
                              tokenTrieNode,
                              state,c,
                              mLM.nextContext(state.mContextIndex,c)),
                   positionalEditPenalty);
    }

    /**
     * Extends the specified state by deleting the input character
     * <code>c</code>.  The new state keeps the trie node and context
     * index of the specified state.
     *
     * @param c Input character to delete.
     * @param state State to extend.
     * @param nextQ Queue receiving the extended state.
     * @param positionalEditPenalty Penalty added to the score.
     */
    void delete(char c, State state, DpSpellQueue nextQ,
                double positionalEditPenalty) {
        double deleteWeight = mEditDistance.deleteWeight(c);
        if (deleteWeight == Double.NEGATIVE_INFINITY) return;
        double score = state.mScore + deleteWeight + positionalEditPenalty;
        addToQueue(nextQ,
                   new State(score, true,
                             state.mTokenTrieNode,
                             state,
                             state.mContextIndex),
                   positionalEditPenalty);
    }

    /**
     * Extends the specified state by inserting characters that are
     * not in the input.
     *
     * <P>If the state's token is complete, a space is inserted.
     * Then each continuation character reported by the state is
     * inserted in turn.  Every insertion is queued with an insertion
     * count one higher than the specified count.
     *
     * @param state State to extend.
     * @param nextQ Queue receiving the extended states.
     * @param numInserts Number of consecutive insertions so far.
     * @param positionalEditPenalty Penalty added to each score.
     */
    void insert(State state, StateQueue nextQ, int numInserts,
                double positionalEditPenalty) {
        if (state.tokenComplete()) {
            double score = state.mScore
                + mLM.log2Estimate(state.mContextIndex,' ')
                + mEditDistance.insertWeight(' ')
                + positionalEditPenalty;
            if (score != Double.NEGATIVE_INFINITY)
                addToQueue(nextQ,
                           new State1(score,true,
                                      mTokenPrefixTrie,
                                      state,' ',
                                      mLM.nextContext(state.mContextIndex,
                                                      ' ')),
                           numInserts+1,positionalEditPenalty);
        }
        char[] followers = state.getContinuations();
        if (followers == null) return;
        for (int i = 0; i < followers.length; ++i) {
            char c = followers[i];
            double insertWeight = mEditDistance.insertWeight(c);
            if (insertWeight == Double.NEGATIVE_INFINITY) continue;
            double score = state.mScore
                + mLM.log2Estimate(state.mContextIndex,c)
                + insertWeight
                + positionalEditPenalty;
            if (score == Double.NEGATIVE_INFINITY) continue;
            addToQueue(nextQ,
                       new State1(score,true,
                                  state.followingNode(i),
                                  state,c,
                                  mLM.nextContext(state.mContextIndex,c)),
                       numInserts+1,
                       positionalEditPenalty);
        }
    }

    /**
     * Extends the specified state by substituting another character
     * for the input character <code>c</code>.
     *
     * <P>If the state's token is complete and <code>c</code> is not
     * a space, a space is substituted.  Then each continuation
     * character reported by the state, other than <code>c</code>
     * itself, is substituted in turn.
     *
     * @param c Input character to replace.
     * @param state State to extend.
     * @param nextQ Queue receiving the extended states.
     * @param positionalEditPenalty Penalty added to each score.
     */
    void substitute(char c, State state, StateQueue nextQ,
                    double positionalEditPenalty) {
        if (state.tokenComplete() && c != ' ') {
            double score = state.mScore
                + mLM.log2Estimate(state.mContextIndex,' ')
                + mEditDistance.substituteWeight(c,' ')
                + positionalEditPenalty;
            if (score != Double.NEGATIVE_INFINITY)
                addToQueue(nextQ,
                           new State1(score,true,
                                      mTokenPrefixTrie,
                                      state,' ',
                                      mLM.nextContext(state.mContextIndex,
                                                      ' ')),
                           positionalEditPenalty);
        }
        char[] followers = state.getContinuations();
        if (followers == null) return;
        for (int i = 0; i < followers.length; ++i) {
            char c2 = followers[i];
            if (c == c2) continue; // don't match
            double substWeight = mEditDistance.substituteWeight(c,c2);
            if (substWeight == Double.NEGATIVE_INFINITY) continue;
            double score = state.mScore
                + mLM.log2Estimate(state.mContextIndex,c2)
                + substWeight
                + positionalEditPenalty;
            if (score == Double.NEGATIVE_INFINITY) continue;
            addToQueue(nextQ,
                       new State1(score,true,
                                  state.followingNode(i),
                                  state,c2,
                                  mLM.nextContext(state.mContextIndex,
                                                  c2)),
                       positionalEditPenalty);
        }
    }

    /**
     * Extends the specified state by transposing the input pair
     * <code>c1,c2</code>, so that <code>c2</code> is output before
     * <code>c1</code>.
     *
     * <P>The transposition is rejected if its weight is negative
     * infinity, if <code>c2</code> is a space and the state's token
     * is not complete, or if <code>c1</code> is a space and the node
     * reached after <code>c2</code> is non-null and not a token.
     *
     * @param c1 First input character.
     * @param c2 Second input character.
     * @param state State to extend.
     * @param nextQ Queue receiving the extended state.
     * @param positionalEditPenalty Penalty added to the score.
     */
    void transpose(char c1, char c2, State state, StateQueue nextQ,
                   double positionalEditPenalty) {
        double transposeWeight = mEditDistance.transposeWeight(c1,c2);
        if (transposeWeight == Double.NEGATIVE_INFINITY) return;
        if (c2 == ' ' && !state.tokenComplete()) return;
        // trie node after the first output character, c2
        TokenTrieNode midNode
            = (c2 == ' ')
            ? mTokenPrefixTrie
            : daughter(state.mTokenTrieNode,c2);
        if (c1 == ' ' && midNode != null && !midNode.mIsToken) return;
        int nextContextIndex = mLM.nextContext(state.mContextIndex,c2);
        int nextContextIndex2 = mLM.nextContext(nextContextIndex,c1);
        // reuse the weight computed above rather than asking the
        // edit distance a second time
        double score = state.mScore
            + mLM.log2Estimate(state.mContextIndex,c2)
            + mLM.log2Estimate(nextContextIndex,c1)
            + transposeWeight
            + positionalEditPenalty;
        if (score == Double.NEGATIVE_INFINITY) return;
        // trie node after the second output character, c1
        TokenTrieNode nextNode
            = (c1 == ' ')
            ? mTokenPrefixTrie
            : daughter(midNode,c1);
        addToQueue(nextQ,
                   new State2(score,true,nextNode,
                              state,c2,c1,
                              nextContextIndex2),
                   positionalEditPenalty);
    }

    /**
     * A weighted edit distance ordered by similarity that treats case
     * variants as zero cost and all other edits as infinite cost.
     * The infinite cost is {@link Double#NEGATIVE_INFINITY}.  See
     * {@link WeightedEditDistance} for more information on
     * similarity-based distances.
     *
     * <P>If this model is used for spelling correction, the result is
     * a system that simply chooses the most likely case for output
     * characters given an input character and does not change anything
     * else.
     *
     * <P>Case here is based on the methods
     * {@link Character#isUpperCase(char)}, {@link Character#isLowerCase(char)}
     * and equality is tested by converting the upper case character to
     * lower case using {@link Character#toLowerCase(char)}.
     *
     * <P>This edit distance is compilable and the result of writing it
     * and reading it is referentially equal to this instance.
     */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -