📄 compiledspellchecker.java
字号:
* language model and edit distance, tokenizer factory, the * set of valid output tokens, and maximum n-best size, with * default known token and first and second character edit costs. * The set of do-not-edit tokens * is initiall empty; set it using {@link #setDoNotEditTokens(Set)}. * * @param lm Source language model. * @param editDistance Channel edit distance model. * @param factory Tokenizer factory for tokenizing inputs. * @param tokenSet Set of valid tokens for outputs or * <code>null</code> if output is not token sensitive. * @param nBestSize Size of n-best list for spell checking. * hypothesis is pruned. * @throws IllegalArgumentException If the edit distance is not a * similarity measure. */ public CompiledSpellChecker(CompiledNGramProcessLM lm, WeightedEditDistance editDistance, TokenizerFactory factory, Set<String> tokenSet, int nBestSize) { this(lm,editDistance,factory,tokenSet,nBestSize, DEFAULT_KNOWN_TOKEN_EDIT_COST, DEFAULT_FIRST_CHAR_EDIT_COST, DEFAULT_SECOND_CHAR_EDIT_COST); } /** * Construct a compiled spell checker based on the specified * language model and edit distance, a null tokenizer factory, the * set of valid output tokens, and maximum n-best size, with * default known token and first and second character edit costs. * The set of do-not-edit tokens * is initiall empty; set it using {@link #setDoNotEditTokens(Set)}. * * @param lm Source language model. * @param editDistance Channel edit distance model. * @param tokenSet Set of valid tokens for outputs or * <code>null</code> if output is not token sensitive. * @param nBestSize Size of n-best list for spell checking. * hypothesis is pruned. */ public CompiledSpellChecker(CompiledNGramProcessLM lm, WeightedEditDistance editDistance, Set<String> tokenSet, int nBestSize) { this(lm,editDistance,null,tokenSet,nBestSize); } /** * Construct a compiled spell checker based on the specified * language model and edit distance, with a null tokenizer * factory, the specified set of valid output tokens, with default * value for n-best size, known token edit cost and first and * second character edit costs. * The set of do-not-edit tokens * is initiall empty; set it using {@link #setDoNotEditTokens(Set)}. * * @param lm Source language model. * @param editDistance Channel edit distance model. * @param tokenSet Set of valid tokens for outputs or * <code>null</code> if output is not token sensitive. */ public CompiledSpellChecker(CompiledNGramProcessLM lm, WeightedEditDistance editDistance, Set<String> tokenSet) { this(lm,editDistance,tokenSet,DEFAULT_N_BEST_SIZE); } /** * Returns the compiled language model for this spell checker. * Compiled language models are themselves immutable, and the * language model for a spell checker may not be changed, but * the result returned by this method may be used to construct * a new compiled spell checker. * * @return The language model for this spell checker. */ public CompiledNGramProcessLM languageModel() { return mLM; } /** * Returns the weighted edit distance for this compiled spell * checker. * * @return The edit distance for this spell checker. */ public WeightedEditDistance editDistance() { return mEditDistance; } /** * Returns the tokenizer factory for this spell checker. * * @return The tokenizer factory for this spell checker. */ public TokenizerFactory tokenizerFactory() { return mTokenizerFactory; } /** * Returns an unmodifiable view the set of tokens for this spell * checker. In order to change the token set, construct a new * set and use {@link #setTokenSet(Set)}. * * @return The set of tokens for this spell checker. */ public Set<String> tokenSet() { return Collections.<String>unmodifiableSet(mTokenSet); } /** * Returns an unmodifiable view of the set of tokens that will * never be edited in this compiled spell checker. To change the * value of this set, use {@link #setDoNotEditTokens(Set)}. * * @return The set of tokens that will not be edited. */ public Set<String> doNotEditTokens() { return Collections.<String>unmodifiableSet(mDoNotEditTokens); } /** * Updates the set of do-not-edit tokens to be the specified * value. If one of these tokens shows up in the input, it will * also show up in any correction supplied. * * @param tokens Set of tokens not to edit. */ public void setDoNotEditTokens(Set<String> tokens) { mDoNotEditTokens = tokens; } /** * Returns the n-best size for this spell checker. See the class * documentation above and the documentation for the method {@link * #setNBest(int)} for more information. * * @return The n-best size for this spell checker. */ public int nBestSize() { return mNBestSize; } /** * Returns the cost penalty for editing a character in a known * token. This penalty is added to each edit within a known * token. * * @return Known token edit penalty. */ public double knownTokenEditCost() { return mKnownTokenEditCost; } /** * Returns the cost penalty for editing the first character in a * token. This penalty is added to each edit while scanning the * first character of a token in the input. * * <P>As a special case, transposition only pays a single * penalty based on the penalty of the first character in * the transposition. * * @return First character edit penalty. */ public double firstCharEditCost() { return mFirstCharEditCost; } /** * Returns the cost penalty for editing the second character * in a token. This penalty is added for each edit while * scanning the second character in an input. * * @return Second character edit penalty. */ public double secondCharEditCost() { return mSecondCharEditCost; } /** * Set the known token edit cost to the specified value. * * @param cost New value for known token edit cost. */ public void setKnownTokenEditCost(double cost) { mKnownTokenEditCost = cost; } /** * Set the first character edit cost to the specified value. * * @param cost New value for the first character edit cost. */ public void setFirstCharEditCost(double cost) { mFirstCharEditCost = cost; } /** * Set the second character edit cost to the specified value. * * @param cost New value for the second character edit cost. */ public void setSecondCharEditCost(double cost) { mSecondCharEditCost = cost; } /** * Returns the number of consecutive insertions allowed. * This will be zero if insertions are not allowed. */ public int numConsecutiveInsertionsAllowed() { return mNumConsecutiveInsertionsAllowed; } /** * Returns <code>true</code> if this spell checker allows * insertions. * * @return <code>true</code> if this spell checker allows * insertions. */ public boolean allowInsert() { return mAllowInsert; } /** * Returns <code>true</code> if this spell checker allows * deletions. * * @return <code>true</code> if this spell checker allows * deletions. */ public boolean allowDelete() { return mAllowDelete; } /** * Returns <code>true</code> if this spell checker allows * matches. * * @return <code>true</code> if this spell checker allows * matches. */ public boolean allowMatch() { return mAllowMatch; } /** * Returns <code>true</code> if this spell checker allows * substitutions. * * @return <code>true</code> if this spell checker allows * substitutions. */ public boolean allowSubstitute() { return mAllowSubstitute; } /** * Returns <code>true</code> if this spell checker allows * transpositions. * * @return <code>true</code> if this spell checker allows * transpositions. */ public boolean allowTranspose() { return mAllowTranspose; } /** * Sets the edit distance for this spell checker to the * specified value. * * @param editDistance Edit distance to use for spell checking. */ public void setEditDistance(WeightedEditDistance editDistance) { mEditDistance = editDistance; } /** * Sets a minimum character length for tokens to be eligible for * editing. * * @param tokenCharLength Edit distance to use for spell checking. * @throws IllegalArgumentException If the character length * specified is less than <code>0</code>. */ public void setMinimumTokenLengthToCorrect(int tokenCharLength) { if (tokenCharLength < 0) { String msg = "Minimum token length to correct must be >= 0." + " Found tokenCharLength=" + tokenCharLength; throw new IllegalArgumentException(msg); } mMinimumTokenLengthToCorrect = tokenCharLength; } /** * Returns the minimum length of token that will be corrected. * This value is initially <code>0</code>, but may be set * using {@link #setMinimumTokenLengthToCorrect(int)}. * * @return The minimum token length to correct. */ public int minimumTokenLengthToCorrect() { return mMinimumTokenLengthToCorrect; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -