📄 compiledspellchecker.java
字号:
public static final WeightedEditDistance CASE_RESTORING
    = new CaseRestoring();

/**
 * A weighted edit distance ordered by similarity that allows free
 * space insertion.  The cost of inserting a space is zero, the
 * cost of matching is zero, and all other costs are infinite.
 * See {@link WeightedEditDistance} for more information on
 * similarity-based distances.
 *
 * <P>If this model is used for spelling correction, the result is
 * a system that will retokenize input with no spaces.  For
 * instance, if the source model is trained with Chinese tokens
 * separated by spaces and the input is a sequence of Chinese
 * characters not separated by spaces, the output is a space-separated
 * tokenization.  If the source model is valid pronunciations
 * separated by spaces and the input is pronunciations not separated
 * by spaces, the result is a tokenization.
 *
 * <P>This edit distance is compilable and the result of writing it
 * and reading it is referentially equal to this instance.
 */
public static final WeightedEditDistance TOKENIZING
    = new Tokenizing();

// Default settings; presumably consumed by constructors outside this section.
static final int DEFAULT_N_BEST_SIZE = 64;
static final double DEFAULT_KNOWN_TOKEN_EDIT_COST = -2.0;
static final double DEFAULT_FIRST_CHAR_EDIT_COST = -1.5;
static final double DEFAULT_SECOND_CHAR_EDIT_COST = -1.0;

/**
 * Copies the characters of the specified set into a new array, in
 * the set's iteration order.
 *
 * @param cSet Set of characters to copy.
 * @return Array holding one entry per element of the set.
 */
private static char[] characterSetToCharArray(Set<Character> cSet) {
    char[] cs = new char[cSet.size()];
    int k = 0;
    for (Character c : cSet)
        cs[k++] = c.charValue();
    return cs;
}

/**
 * Builds a mapping from every proper prefix of every token
 * (including the empty prefix) to the characters that follow that
 * prefix in at least one token.
 *
 * @param tokens Set of tokens from which to collect prefixes.
 * @return Mapping from prefixes to their continuation characters.
 */
private static Map<String,char[]> prefixToContinuations(Set<String> tokens) {
    Map<String,char[]> prefixToContinuations
        = new HashMap<String,char[]>();
    for (String token : tokens) {
        for (int i = 0; i < token.length(); ++i) {
            String prefix = token.substring(0,i);
            char nextChar = token.charAt(i);
            char[] currentCs = prefixToContinuations.get(prefix);
            if (currentCs == null) {
                prefixToContinuations.put(prefix,new char[] { nextChar });
            } else {
                char[] nextCs
                    = com.aliasi.util.Arrays.add(nextChar,currentCs);
                // only store the result when the array actually grew
                if (nextCs.length > currentCs.length)
                    prefixToContinuations.put(prefix,nextCs);
            }
        }
    }
    return prefixToContinuations;
}

/**
 * Returns the root node of a trie over the specified tokens.
 *
 * @param tokens Set of tokens to index.
 * @return Root trie node, corresponding to the empty prefix.
 */
private TokenTrieNode prefixTrie(Set<String> tokens) {
    Map<String,char[]> prefixMap = prefixToContinuations(tokens);
    return completeTrieNode("",tokens,prefixMap);
}

/**
 * Recursively constructs the trie node for the specified prefix.
 * The continuation characters found in the prefix map are sorted in
 * place, and one daughter node is built per continuation character.
 *
 * @param prefix Prefix represented by the node being built.
 * @param tokens Complete set of tokens, used to mark token nodes.
 * @param prefixMap Mapping from prefixes to continuation characters.
 * @return Trie node for the prefix.
 */
private static TokenTrieNode completeTrieNode(String prefix,
                                              Set<String> tokens,
                                              Map<String,char[]> prefixMap) {
    char[] contChars = prefixMap.get(prefix);
    if (contChars == null)
        contChars = com.aliasi.util.Arrays.EMPTY_CHAR_ARRAY;
    else
        Arrays.sort(contChars); // daughter() relies on binary search
    TokenTrieNode[] contNodes = new TokenTrieNode[contChars.length];
    for (int i = 0; i < contNodes.length; ++i)
        contNodes[i]
            = completeTrieNode(prefix+contChars[i],tokens,prefixMap);
    return new TokenTrieNode(tokens.contains(prefix),
                             contChars, contNodes);
}

/**
 * A node in the token trie.  Records whether the prefix leading to
 * the node is itself a token, along with parallel arrays of the
 * characters that may follow and the nodes they lead to.
 */
private static final class TokenTrieNode {
    final boolean mIsToken;
    final char[] mFollowingChars;
    final TokenTrieNode[] mFollowingNodes;
    TokenTrieNode(boolean isToken,
                  char[] followingChars,
                  TokenTrieNode[] followingNodes) {
        mIsToken = isToken;
        mFollowingChars = followingChars;
        mFollowingNodes = followingNodes;
    }
    @Override
    public String toString() {
        StringBuffer sb = new StringBuffer();
        toString("",sb,0);
        return sb.toString();
    }
    /**
     * Appends a rendering of the subtrie rooted at this node, one
     * line per node, indenting daughters by depth.
     */
    void toString(String prefix, StringBuffer sb, int indent) {
        if (mIsToken)
            sb.append(" [token=").append(prefix).append("]");
        sb.append('\n');
        for (int i = 0; i < mFollowingChars.length; ++i) {
            for (int k = 0; k < indent; ++k)
                sb.append(" ");
            sb.append(mFollowingChars[i]);
            mFollowingNodes[i].toString(prefix+mFollowingChars[i],
                                        sb,indent+1);
        }
    }
    /**
     * Returns the node reached by following the specified character,
     * or <code>null</code> if the character is not a continuation.
     */
    TokenTrieNode daughter(char c) {
        int index = Arrays.binarySearch(mFollowingChars,c);
        return index < 0 ? null : mFollowingNodes[index];
    }
}

/** A search state that contributes a single character of output. */
private final class State1 extends State {
    final char mChar1;
    State1(double score, boolean tokenEdited,
           TokenTrieNode tokenTrieNode,
           State previousState,
           char char1,
           int contextIndex) {
        super(score,tokenEdited,tokenTrieNode,previousState,
              contextIndex);
        mChar1 = char1;
    }
    @Override
    void outputLocal(StringBuffer sb) {
        sb.append(mChar1);
    }
}

/** A search state that contributes two characters of output. */
private final class State2 extends State {
    final char mChar1;
    final char mChar2;
    State2(double score, boolean tokenEdited,
           TokenTrieNode tokenTrieNode,
           State previousState,
           char char1, char char2,
           int contextIndex) {
        super(score,tokenEdited,tokenTrieNode,previousState,
              contextIndex);
        mChar1 = char1;
        mChar2 = char2;
    }
    @Override
    void outputLocal(StringBuffer sb) {
        // appended back to front because output() reverses the buffer
        sb.append(mChar2);
        sb.append(mChar1);
    }
}

/**
 * Base search state: a score, a link to the previous state, the
 * position in the token trie, and a context index.  This base class
 * contributes no output characters of its own.
 */
private class State implements Scored {
    final TokenTrieNode mTokenTrieNode; // null if not tokenizing
    final double mScore;
    final boolean mTokenEdited;
    final State mPreviousState;
    final int mContextIndex;
    State(double score, boolean tokenEdited,
          TokenTrieNode tokenTrieNode,
          State previousState,
          int contextIndex) {
        mScore = score;
        mTokenEdited = tokenEdited;
        mTokenTrieNode = tokenTrieNode;
        mPreviousState = previousState;
        mContextIndex = contextIndex;
    }
    public double score() {
        return mScore;
    }
    /**
     * Returns the trie node at the specified index among this
     * state's following nodes, or <code>null</code> if this state
     * has no trie node.
     */
    TokenTrieNode followingNode(int i) {
        return mTokenTrieNode == null
            ? null
            : mTokenTrieNode.mFollowingNodes[i];
    }
    @Override
    public String toString() {
        return output() + "/" + mTokenEdited + "/" + mScore;
    }
    /**
     * Returns <code>true</code> if there is no token set, or if this
     * state's trie node marks a complete token.
     */
    boolean tokenComplete() {
        return (mTokenSet == null)
            || ((mTokenTrieNode != null) && mTokenTrieNode.mIsToken);
    }
    /**
     * Returns <code>true</code> if the specified character may follow
     * this state; always true when this state has no trie node.
     */
    boolean continuedBy(char c) {
        if (mTokenTrieNode == null) return true;
        char[] continuations = getContinuations();
        return (continuations != null)
            && Arrays.binarySearch(continuations,c) >= 0;
    }
    /**
     * Returns the trie node's following characters, or the enclosing
     * instance's observed characters when there is no trie node.
     */
    char[] getContinuations() {
        return mTokenTrieNode == null
            ? observedCharacters()
            : mTokenTrieNode.mFollowingChars;
    }
    void outputLocal(StringBuffer sb) {
        /* do nothing */
    }
    /**
     * Returns the output accumulated along the chain of previous
     * states ending at this state.
     */
    String output() {
        StringBuffer sb = new StringBuffer();
        // states are visited newest first, so the buffer is backwards
        for (State s = this; s != null; s = s.mPreviousState)
            s.outputLocal(sb);
        // Reverse char by char; StringBuffer.reverse() would treat
        // surrogate pairs as units and so is not a drop-in replacement.
        int len = sb.length();
        char[] cs = new char[len];
        for (int i = 0; i < len; ++i)
            cs[i] = sb.charAt(len-i-1);
        return new String(cs);
    }
}

// easy to add a beam here and return false right away
/**
 * A state queue that retains at most one state per context index,
 * namely the highest scoring one offered so far.
 */
private final class DpSpellQueue extends StateQueue {
    private final Map<Integer,State> mStateToBest
        = new HashMap<Integer,State>();
    @Override
    public boolean addState(State state) {
        Integer dp = Integer.valueOf(state.mContextIndex);
        State bestState = mStateToBest.get(dp);
        if (bestState != null) {
            // an equal or better state is already queued for this index
            if (bestState.mScore >= state.mScore)
                return false;
            remove(bestState);
        }
        mStateToBest.put(dp,state);
        return add(state);
    }
}

/** A state queue that offers every state directly to the queue. */
private final class NBestSpellQueue extends StateQueue {
    @Override
    public boolean addState(State state) {
        return add(state);
    }
}

/**
 * Base class for state queues: a bounded priority queue ordered by
 * score whose bound is the enclosing instance's n-best size.
 */
private abstract class StateQueue extends BoundedPriorityQueue {
    StateQueue() {
        super(Scored.SCORE_COMPARATOR,mNBestSize);
    }
    abstract boolean addState(State state);
}

/**
 * Edit distance in which substituting a character for one that is
 * equal to it after lowercasing costs nothing.  The first fixed
 * weight is zero and the remaining four are negative infinity
 * (presumably match, delete, insert, substitute and transpose, in
 * that order; confirm against FixedWeightEditDistance).
 * Compiles to a proxy that reads back as {@link #CASE_RESTORING}.
 */
private static final class CaseRestoring
    extends FixedWeightEditDistance
    implements Compilable {

    public CaseRestoring() {
        super(0.0,
              Double.NEGATIVE_INFINITY,
              Double.NEGATIVE_INFINITY,
              Double.NEGATIVE_INFINITY,
              Double.NEGATIVE_INFINITY);
    }
    public double substituteWeight(char cDeleted, char cInserted) {
        return (Character.toLowerCase(cDeleted)
                == Character.toLowerCase(cInserted))
            ? 0.0
            : Double.NEGATIVE_INFINITY;
    }
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizable());
    }
    /** Stateless serialization proxy; reading yields the singleton. */
    private static class Externalizable extends AbstractExternalizable {
        private static final long serialVersionUID = 2825384056772387737L;
        public Externalizable() {
            /* do nothing */
        }
        public void writeExternal(ObjectOutput objOut) {
            /* do nothing */
        }
        public Object read(ObjectInput objIn) {
            return CASE_RESTORING;
        }
    }
}

/**
 * Edit distance in which inserting a space costs nothing and
 * inserting any other character costs negative infinity.  The first
 * fixed weight is zero and the remaining four are negative infinity
 * (presumably match, delete, insert, substitute and transpose, in
 * that order; confirm against FixedWeightEditDistance).
 * Compiles to a proxy that reads back as {@link #TOKENIZING}.
 */
private static final class Tokenizing
    extends FixedWeightEditDistance
    implements Compilable {

    public Tokenizing() {
        super(0.0,
              Double.NEGATIVE_INFINITY,
              Double.NEGATIVE_INFINITY,
              Double.NEGATIVE_INFINITY,
              Double.NEGATIVE_INFINITY);
    }
    public double insertWeight(char cInserted) {
        return cInserted == ' ' ? 0.0 : Double.NEGATIVE_INFINITY;
    }
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizable());
    }
    /** Stateless serialization proxy; reading yields the singleton. */
    private static class Externalizable extends AbstractExternalizable {
        private static final long serialVersionUID = -3015819851142009998L;
        public Externalizable() {
            /* do nothing */
        }
        public void writeExternal(ObjectOutput objOut) {
            /* do nothing */
        }
        public Object read(ObjectInput objIn) {
            return TOKENIZING;
        }
    }
}

// closes the enclosing class, whose header lies above this section
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -