⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 compiledspellchecker.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
    public static WeightedEditDistance CASE_RESTORING = new CaseRestoring();    /**     * A weighted edit distance ordered by similarity that allows free     * space insertion.  The cost of inserting a space is zero, the     * cost of matching is zero, and all other costs are infinite.     * See {@link WeightedEditDistance} for more information on     * similarity-based distances.     *     * <P>If this model is used for spelling correction, the result is     * as system that will retokenize input with no spaces.  For     * instance, if the source model is trained with chinese tokens     * separated by spaces and the input is a sequence of chinese     * characters not separated by spaces, the output is a space-separated     * tokenization.  If the source model is valid pronunciations     * separated by spaces and the input is pronunciations not separated     * by spaces, the result is a tokenization.     *     * <P>This edit distance is compilable and the result of writing it     * and reading it is referentially equal to this instance.     */    public static WeightedEditDistance TOKENIZING = new Tokenizing();    static final int DEFAULT_N_BEST_SIZE = 64;    static final double DEFAULT_KNOWN_TOKEN_EDIT_COST = -2.0;    static final double DEFAULT_FIRST_CHAR_EDIT_COST = -1.5;    static final double DEFAULT_SECOND_CHAR_EDIT_COST = -1.0;    private static char[] characterSetToCharArray(Set<Character> cSet) {        char[] cs = new char[cSet.size()];        int k = 0;        for (Character c : cSet)             cs[k++] = c.charValue();        return cs;    }    private static Map<String,char[]> prefixToContinuations(Set<String> tokens) {        Map<String,char[]> prefixToContinuations            = new HashMap<String,char[]>();        int count = 0;        for (String token : tokens) {            // if (++count % 100000 == 0)            // System.out.println("token count=" + count);            for (int i = 0; i < token.length(); ++i) {                String prefix = token.substring(0,i);                char nextChar = token.charAt(i);                char[] currentCs = prefixToContinuations.get(prefix);                if (currentCs == null) {                    prefixToContinuations.put(prefix,new char[] { nextChar });                } else {                    char[] nextCs = com.aliasi.util.Arrays.add(nextChar,currentCs);                    if (nextCs.length > currentCs.length)                        prefixToContinuations.put(prefix,nextCs);                }            }        }        return prefixToContinuations;    }    private TokenTrieNode prefixTrie(Set<String> tokens) {        Map<String,char[]> prefixMap = prefixToContinuations(tokens);        return completeTrieNode("",tokens,prefixMap);    }    private static TokenTrieNode completeTrieNode(String prefix,                                                  Set<String> tokens,                                                  Map<String,char[]> prefixMap) {        // System.out.println("printing prefix map");        // for (Map.Entry<String,char[]> entry : prefixMap.entrySet())        // System.out.println("|" + entry.getKey() + "|==>|" + new String(entry.getValue()) + "|");        char[] contChars = prefixMap.get(prefix);        if (contChars == null)            contChars = com.aliasi.util.Arrays.EMPTY_CHAR_ARRAY;        else            Arrays.sort(contChars);        // System.out.println("prefix=|" + prefix + "| cont chars=|" + new String(contChars) + '|');        TokenTrieNode[] contNodes = new TokenTrieNode[contChars.length];        for (int i = 0; i < contNodes.length; ++i)            contNodes[i]                = completeTrieNode(prefix+contChars[i],tokens,prefixMap);        return new TokenTrieNode(tokens.contains(prefix),                                 contChars, contNodes);    }    private static final class TokenTrieNode {        final boolean mIsToken;        final char[] mFollowingChars;        final TokenTrieNode[] mFollowingNodes;        TokenTrieNode(boolean isToken, char[] followingChars,                      TokenTrieNode[] followingNodes) {            mIsToken = isToken;            mFollowingChars = followingChars;            mFollowingNodes = followingNodes;        }        public String toString() {            StringBuffer sb = new StringBuffer();            toString("",sb,0);            return sb.toString();        }        void toString(String prefix, StringBuffer sb, int indent) {            if (mIsToken) sb.append(" [token=" + prefix + "]");            sb.append('\n');            for (int i = 0; i < mFollowingChars.length; ++i) {                for (int k = 0; k < indent; ++k) sb.append("  ");                sb.append(mFollowingChars[i]);                mFollowingNodes[i].toString(prefix+mFollowingChars[i],                                            sb,indent+1);            }        }        TokenTrieNode daughter(char c) {            int index = Arrays.binarySearch(mFollowingChars,c);            return index < 0                ? null                : mFollowingNodes[index];        }    }    private final class State1 extends State {        final char mChar1;        State1(double score, boolean tokenEdited,               TokenTrieNode tokenTrieNode, State previousState,               char char1,               int contextIndex) {            super(score,tokenEdited,tokenTrieNode,previousState,                  contextIndex);            mChar1 = char1;        }        void outputLocal(StringBuffer sb) {            sb.append(mChar1);        }    }    private final class State2 extends State {        final char mChar1;        final char mChar2;        State2(double score, boolean tokenEdited,               TokenTrieNode tokenTrieNode, State previousState,               char char1, char char2,               int contextIndex) {            super(score,tokenEdited,tokenTrieNode,previousState,                  contextIndex);            mChar1 = char1;            mChar2 = char2;        }        void outputLocal(StringBuffer sb) {            sb.append(mChar2);            sb.append(mChar1);        }    }    private class State implements Scored {        final TokenTrieNode mTokenTrieNode; // null if not tokenizing        final double mScore;        final boolean mTokenEdited;        final State mPreviousState;        final int mContextIndex;        State(double score, boolean tokenEdited,              TokenTrieNode tokenTrieNode, State previousState,              int contextIndex) {            mScore = score;            mTokenEdited = tokenEdited;            mTokenTrieNode = tokenTrieNode;            mPreviousState = previousState;            mContextIndex = contextIndex;        }        public double score() {            return mScore;        }        TokenTrieNode followingNode(int i) {            return mTokenTrieNode == null                ? null                : mTokenTrieNode.mFollowingNodes[i];        }        public String toString() {            return output() + "/" + mTokenEdited + "/" + mScore;        }        boolean tokenComplete() {            boolean result = (mTokenSet == null)                || ((mTokenTrieNode != null) && mTokenTrieNode.mIsToken);            return result;        }        boolean continuedBy(char c) {            if (mTokenTrieNode == null) return true;            char[] continuations = getContinuations();            return (continuations != null)                && Arrays.binarySearch(continuations,c) >= 0;        }        char[] getContinuations() {            return mTokenTrieNode == null                ? observedCharacters()                : mTokenTrieNode.mFollowingChars;        }        void outputLocal(StringBuffer sb) {            /* do nothing */        }        String output() {            StringBuffer sb = new StringBuffer();            for (State s = this; s != null; s = s.mPreviousState)                s.outputLocal(sb);            // reverse            int len = sb.length();            char[] cs = new char[len];            for (int i = 0; i < len; ++i)                cs[i] = sb.charAt(len-i-1);            return new String(cs);        }    }    // easy to add a beam here and return false right away    private final class DpSpellQueue extends StateQueue {        private final HashMap mStateToBest = new HashMap();        public boolean addState(State state) {            Integer dp = new Integer(state.mContextIndex);            State bestState = (State) mStateToBest.get(dp);            if (bestState == null) {                mStateToBest.put(dp,state);                return add(state);            }            if (bestState.mScore >= state.mScore)                return false;            remove(bestState);            mStateToBest.put(dp,state);            return add(state);        }    }    private final class NBestSpellQueue extends StateQueue {        public boolean addState(State state) {            return add(state);        }    }    private abstract class StateQueue extends BoundedPriorityQueue {        StateQueue() {            super(Scored.SCORE_COMPARATOR,mNBestSize);        }        abstract boolean addState(State state);    }    private static final class CaseRestoring        extends FixedWeightEditDistance        implements Compilable {        public CaseRestoring() {            super(0.0,                  Double.NEGATIVE_INFINITY,                  Double.NEGATIVE_INFINITY,                  Double.NEGATIVE_INFINITY,                  Double.NEGATIVE_INFINITY);        }        public double substituteWeight(char cDeleted, char cInserted) {            return (Character.toLowerCase(cDeleted)                    == Character.toLowerCase(cInserted))                ? 0.0                : Double.NEGATIVE_INFINITY;        }        public void compileTo(ObjectOutput objOut) throws IOException {            objOut.writeObject(new Externalizable());        }        private static class Externalizable extends AbstractExternalizable {            private static final long serialVersionUID = 2825384056772387737L;            public Externalizable() {                /* do nothing */            }            public void writeExternal(ObjectOutput objOut) {                /* do nothing */            }            public Object read(ObjectInput objIn) {                return CASE_RESTORING;            }        }    }    private static final class Tokenizing        extends FixedWeightEditDistance        implements Compilable {        public Tokenizing() {            super(0.0,                  Double.NEGATIVE_INFINITY,                  Double.NEGATIVE_INFINITY,                  Double.NEGATIVE_INFINITY,                  Double.NEGATIVE_INFINITY);        }        public double insertWeight(char cInserted) {            return cInserted == ' ' ? 0.0 : Double.NEGATIVE_INFINITY;        }        public void compileTo(ObjectOutput objOut) throws IOException {            objOut.writeObject(new Externalizable());        }        private static class Externalizable extends AbstractExternalizable {            private static final long serialVersionUID = -3015819851142009998L;            public Externalizable() {                /* do nothing */            }            public void writeExternal(ObjectOutput objOut) {                /* do nothing */            }            public Object read(ObjectInput objIn) {                return TOKENIZING;            }        }    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -