📄 compiledspellchecker.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
    /**     * Sets the language model for this spell checker to the     * specified value.     *     * @param lm New language model for this spell checker.     */    public void setLanguageModel(CompiledNGramProcessLM lm) {        mLM = lm;    }    /**     * Sets the tokenizer factory for input processing to the     * specified value.  If the value is <code>null</code>, no     * tokenization is performed on the input.     *     * @param factory Tokenizer factory for this spell checker.     */    public void setTokenizerFactory(TokenizerFactory factory) {        mTokenizerFactory = factory;    }    /**     * Sets the set of tokens that can be produced by editing.     * If the specified set is <code>null</code>, editing will     * not be token sensitive.     *     * <P><i>Warning:</i> Spelling correction without tokenization may     * be slow, especially with a large n-best size.     *     * @param tokenSet The new set of tokens or <code>null</code> if     * not tokenizing.     */    public final void setTokenSet(Set<String> tokenSet) {        int maxLen = 0;        for (String token : tokenSet)            maxLen = java.lang.Math.max(maxLen,token.length());        // System.out.println("longest token=" + maxLen);        mTokenSet = tokenSet;        mTokenPrefixTrie = tokenSet == null ? null : prefixTrie(tokenSet);    }    /**     * Sets The n-best size to the specified value.  The n-best     * size controls the number of hypotheses maintained going forward     * for each character in the input.  A higher value indicates     * a broader and slower search for corrections.     *     * @param size Size of the n-best lists at each character.     * @throws IllegalArgumentException If the size is less than one.     */    public void setNBest(int size) {        if (size < 1) {            String msg = "N-best size must be greather than 0."                
+ " Found size=" + size;            throw new IllegalArgumentException(msg);        }        mNBestSize = size;    }    /**     * Sets this spell checker to allow insertions if the specified     * value is <code>true</code> and to disallow them if it is     * <code>false</code>.  If the value is <code>false</code>, then     * the number of consecutive insertions allowed is also set     * to zero.     *     * @param allowInsert New insertion mode.     */    public void setAllowInsert(boolean allowInsert) {        mAllowInsert = allowInsert;        if (!allowInsert) setNumConsecutiveInsertionsAllowed(0);    }    /**     * Sets this spell checker to allow deletions if the specified     * value is <code>true</code> and to disallow them if it is     * <code>false</code>.     *     * @param allowDelete New deletion mode.     */    public void setAllowDelete(boolean allowDelete) {        mAllowDelete = allowDelete;    }    /**     * Sets this spell checker to allow matches if the specified     * value is <code>true</code> and to disallow them if it is     * <code>false</code>.     *     * @param allowMatch New match mode.     */    public void setAllowMatch(boolean allowMatch) {        mAllowMatch = allowMatch;    }    /**     * Sets this spell checker to allow substitutions if the specified     * value is <code>true</code> and to disallow them if it is     * <code>false</code>.     *     * @param allowSubstitute New substitution mode.     */    public void setAllowSubstitute(boolean allowSubstitute) {        mAllowSubstitute = allowSubstitute;    }    /**     * Sets this spell checker to allow transpositions if the specified     * value is <code>true</code> and to disallow them if it is     * <code>false</code>.     *     * @param allowTranspose New transposition mode.     */    public void setAllowTranspose(boolean allowTranspose) {        mAllowTranspose = allowTranspose;    }    /**     * Set the number of consecutive insertions allowed to the     * specified value.  
The value must not be negative.  If the     * number of insertions allowed is specified to be greater than     * zero, then the allow insertion model will be set to     * <code>true</code>.     *     * @param numAllowed Number of insertions allowed in a row.     * @throws IllegalArgumentException If the number specified is     * less than zero.     */    public void setNumConsecutiveInsertionsAllowed(int numAllowed) {        if (numAllowed < 0) {            String msg = "Num insertions allowed must be >= 0."                + " Found numAllowed=" + numAllowed;            throw new IllegalArgumentException(msg);        }        if (numAllowed > 0) setAllowInsert(true);        mNumConsecutiveInsertionsAllowed = numAllowed;    }    /**     * Returns a first-best hypothesis of the intended message given a     * received message.  This method returns <code>null</code> if the     * received message is itself the best hypothesis.  The exact     * definition of hypothesis ranking is provided in the class     * documentation above.     *     * @param receivedMsg The message received over the noisy channel.     * @return The first-best hypothesis of the intended source     * message.     
*/    public String didYouMean(String receivedMsg) {        String msg = normalizeQuery(receivedMsg);        if (msg.length() == 0) return msg;        DpSpellQueue queue = new DpSpellQueue();        DpSpellQueue finalQueue = new DpSpellQueue();        computeBestPaths(msg,queue,finalQueue);        if (finalQueue.isEmpty())            return msg;        State bestState = (State) finalQueue.pop();        //System.out.println("Winner is: "+bestState);        return bestState.output().trim();    }    void computeBestPaths(String msg,                          StateQueue queue, StateQueue finalQueue) {        double[] editPenalties = editPenalties(msg);        State initialState = new State(0.0,false,mTokenPrefixTrie,                                       null,                                       mLM.nextContext(0,' '));        addToQueue(queue,initialState,editPenalties[0]);        DpSpellQueue nextQ = new DpSpellQueue();        DpSpellQueue nextQ2 = new DpSpellQueue();        for (int i = 0; i < msg.length(); ++i) {            char c = msg.charAt(i);            char nextC = ((i+1) < msg.length()) ? msg.charAt(i+1) : 0;            Iterator it = queue.iterator();            while (it.hasNext()) {                State state = (State) it.next();                if ((i+1) < msg.length())                    extend2(c,nextC,state,nextQ,nextQ2,editPenalties[i]);                else                    extend1(c,state,nextQ,editPenalties[i]);            }            queue = nextQ;            nextQ = nextQ2;            nextQ2 = new DpSpellQueue();        }        extendToFinalSpace(queue,finalQueue);    }    /**     * Returns an iterator over the n-best spelling corrections for     * the specified input string.  The iterator produces instances     * of {@link ScoredObject}, the object of which is the corrected     * string and the score of which is the joint score of edit (channel)     * costs and language model (source) cost of the output.     
     *
     * <p>Unlike for HMMs and chunking, this n-best list is not exact
     * due to pruning during spelling correction.  The maximum number
     * of returned results is determined by the n-best parameter, as
     * set through {@link #setNBest(int)}.  The larger the n-best list,
     * the higher-quality the results, even earlier on the list.
     *
     * <p>N-best spelling correction is not an exact computation
     * due to heuristic pruning during decoding.  Thus setting the
     * n-best list to a larger result may result in better n-best
     * results, even for earlier results on the list.  For instance,
     * the result of the first five corrections is not necessarily the
     * same with a 5-element, 10-element or 1000-element n-best size
     * (as specified by {@link #setNBest(int)}).
     *
     * <p>A rough confidence measure may be determined by comparing
     * the scores, which are log (base 2) edit (channel) plus log
     * (base 2) language model (source) scores.  A very crude measure
     * is to compare the score of the first result to the score of
     * the second result; if there is a large gap, confidence is high.
     * A tighter measure is to convert the log probabilities back to
     * linear, add them all up, and then divide.  For instance, if
     * there were results:
     *
     * <blockquote><table border="1" cellpadding="5">
     * <tr><th align="left">Rank</th>
     *     <th align="left">String</th>
     *     <th align="left">Log (2) Prob</th>
     *     <th align="left">Prob</th>
     *     <th align="left">Conf</th></tr>
     * <tr><td>0</td><td>foo</td><td>-2</td> <td>0.250</td><td>0.571</td></tr>
     * <tr><td>1</td><td>for</td><td>-3</td> <td>0.125</td><td>0.285</td></tr>
     * <tr><td>2</td><td>food</td><td>-4</td><td>0.062</td><td>0.143</td></tr>
     * <tr><td>3</td><td>of</td><td>-10</td> <td>0.001</td><td>0.002</td></tr>
     * </table></blockquote>
     *
     * Here there are four results, with log probabilities -2, -3,
     * -4 and -10, which have the corresponding linear probabilities.
     * The sum of these probabilities is 0.438.  Hence the confidence
     * in the top-ranked answer is 0.250/0.438=0.571.
     *
     * <p><b>Warning:</b> Spell checking with n-best output is
     * currently implemented with a very naive algorithm and is
     * thus very slow compared to first-best spelling correction.
     * The reason for this is that the dynamic programming
     * is turned off for n-best spelling correction, hence a lot of
     * redundant computation is done.
     *
     * @param receivedMsg Input message.
     * @return Iterator over n-best spelling suggestions.
     
*/
    public Iterator<ScoredObject<String>> didYouMeanNBest(String receivedMsg) {
        final String query = normalizeQuery(receivedMsg);
        // An empty (normalized) query yields a single empty correction with score 0.
        if (query.length() == 0) {
            return new Iterators.Singleton(new ScoredObject("",0));
        }

        final StateQueue pending = new NBestSpellQueue();
        final StateQueue completed = new NBestSpellQueue();
        computeBestPaths(query, pending, completed);

        // Collect the finished hypotheses into a queue bounded by the n-best size.
        final BoundedPriorityQueue<ScoredObject<String>> nBest
            = new BoundedPriorityQueue<ScoredObject<String>>(ScoredObject.SCORE_COMPARATOR,
                                                             mNBestSize);
        for (Iterator it = completed.iterator(); it.hasNext(); ) {
            final State hypothesis = (State) it.next();
            final String correction = hypothesis.output().trim();
            nBest.add(new ScoredObject<String>(correction, hypothesis.score()));
        }
        return nBest.iterator();
    }

    /**
     * Returns <code>true</code> if the specified token is no longer
     * than the minimum token length to correct.
     *
     * @param token Token to test.
     * @return <code>true</code> if the token's length is at most
     * the minimum token length to correct.
     */
    private boolean isShortToken(String token) {
        return mMinimumTokenLengthToCorrect >= token.length();
    }

    private double[] editPenalties(String msg) {
        double[] penalties = new double[msg.length()];
        Arrays.fill(penalties,0.0);
        if (mTokenSet == null) return penalties;
        int charPosition = 0;
        for (int i = 0; i < penalties.length; ++i) {
            char c = msg.charAt(i);
            if ((mTokenSet != null)
                && ((i == 0) || (msg.charAt(i-1) == ' '))) {
                int endIndex = msg.indexOf(' ', i);
                if (endIndex == -1)
                    endIndex = msg.length();
                String token = msg.substring(i,endIndex);
                if (mDoNotEditTokens.contains(token)
                    || isShortToken(token)) {
                    // penalize space before
                    if (i > 0) {
                        penalties[i-1] = Double.NEGATIVE_INFINITY;
                    }
                    // penalize chars within
                    for (int j = i; j < endIndex; ++j) {
                        penalties[j] = Double.NEGATIVE_INFINITY;
                    }
                    // penalize space after (may get double penalized)
                    if (endIndex < penalties.length) {
                        penalties[endIndex] = Double.NEGATIVE_INFINITY;
                    }
                } else if (mTokenSet.contains(token)) {
                    if (i > 0) {
                        penalties[i-1] += mKnownTokenEditCost;
                    }
                    // penalize chars within
                    for (int j = i; j < endIndex; ++j) {
                        penalties[j] += mKnownTokenEditCost;
                    }
                    // penalize space after (may get double penalized)
                    if (endIndex < penalties.length) {
                        penalties[endIndex] += mKnownTokenEditCost;
💿 文件大小 4561 K
👤 上传用户 edan1181
📂 所属分类 Java编程
🏷️ 相关标签

#LingPipe #Java #自然语言处理 #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -