📄 trainspellchecker.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
     * training data from the counter.
     *
     * @return The counter for the tokens in the training set.
     */
    public ObjectToCounterMap<String> tokenCounter() {
        return mTokenCounter;
    }

    /**
     * Train the spelling checker on the specified character sequence.
     * The sequence is normalized by normalizing all whitespace
     * sequences to a single space character and inserting an initial
     * and final whitespace.  If a tokenization factory is specified,
     * a single space character is inserted between any tokens
     * not already separated by a white space.
     *
     * @param cSeq Character sequence for training.
     */
    public void train(CharSequence cSeq) {
        mLM.train(normalizeQuery(cSeq));
        mNumTrainingChars += cSeq.length();
    }

    /**
     * Train the spelling checker on the specified character sequence
     * as if it had appeared with a frequency given by the specified
     * count.
     *
     * <p>See the method {@link #train(CharSequence)} for information
     * on the normalization carried out on the input character
     * sequence.
     *
     * <p>Although calling this method is equivalent to calling {@link
     * #train(CharSequence)} the specified count number of times, this
     * method is much more efficient because it does not require
     * iteration.
     *
     * <p>This method may be used to boost the training for a specified
     * input, or just to combine inputs into single method calls.
     *
     * @param cSeq Character sequence for training.
     * @param count Frequency of sequence to train.
     * @throws IllegalArgumentException If the specified count is negative.
     */
    public void train(CharSequence cSeq, int count) {
        if (count < 0) {
            String msg = "Training counts must be non-negative."
                + " Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        if (count == 0) return;
        // NOTE(review): normalizeQuery() increments each token's counter
        // once per call, not `count` times, so token counts (and hence
        // pruneTokens) do not see the boost given to the language model.
        // TODO confirm whether the counter should be incremented by count.
        mLM.train(normalizeQuery(cSeq),count);
        // Widen to long before multiplying; int * int could overflow
        // before being added to the long accumulator.
        mNumTrainingChars += (long) count * cSeq.length();
    }

    /**
     * Returns the total length in characters of all text used to
     * train the spell checker.
     *
     * @return The number of training characters seen.
     */
    public long numTrainingChars() {
        return mNumTrainingChars;
    }

    /**
     * Train the spelling checker on the specified character slice.
     * This method implements the necessary method for the {@link
     * TextHandler} interface.  Otherwise, it behaves exactly the same
     * way as {@link #train(CharSequence)}.
     *
     * @param cs Underlying character array.
     * @param start Index of first character in slice.
     * @param length Number of characters in the slice.
     */
    public void handle(char[] cs, int start, int length) {
        train(new String(cs,start,length));
    }

    /**
     * Prunes the set of collected tokens of all tokens with count
     * less than the specified minimum.  If there was no tokenization
     * factory specified for this spell checker, this method will
     * have no effect.
     *
     * @param minCount Minimum count of preserved token.
     */
    public void pruneTokens(int minCount) {
        mTokenCounter.prune(minCount);
    }

    /**
     * Prunes the underlying character language model to remove
     * substring counts of less than the specified minimum.
     *
     * @param minCount Minimum count of preserved substrings.
     */
    public void pruneLM(int minCount) {
        mLM.substringCounter().prune(minCount);
    }

    /**
     * Writes a compiled spell checker to the specified object output.
     * The class of the spell checker read back in is {@link
     * CompiledSpellChecker}.
     *
     * @param objOut Object output to which this spell checker is
     * written.
     * @throws IOException If there is an I/O error while writing.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Serialization hook: substitutes a {@link Serializer} wrapping
     * this trainer as the object that is written to the stream.
     *
     * @return Serialization proxy for this trainer.
     */
    private Object writeReplace() {
        return new Serializer(this);
    }

    /**
     * Builds the normalized form of the specified character sequence
     * used for training.  The result always begins with a single
     * space.  With no tokenizer factory, the remainder is the output
     * of {@code Strings.normalizeWhitespace} followed by a space.
     * With a tokenizer factory, the remainder is each token followed
     * by a single space.
     *
     * <p>Side effect: when a tokenizer factory is present, the token
     * counter is incremented once for every token produced.
     *
     * @param cSeq Character sequence to normalize.
     * @return Buffer holding the normalized sequence.
     */
    StringBuffer normalizeQuery(CharSequence cSeq) {
        StringBuffer sb = new StringBuffer();
        sb.append(' ');
        if (mTokenizerFactory == null) {
            Strings.normalizeWhitespace(cSeq,sb);
            sb.append(' ');
        } else {
            char[] cs = Strings.toCharArray(cSeq);
            Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
            String nextToken;
            while ((nextToken = tokenizer.nextToken()) != null) {
                mTokenCounter.increment(nextToken);
                sb.append(nextToken);
                sb.append(' ');
            }
        }
        return sb;
    }

    /**
     * Verifies that the specified object implements {@code Compilable}.
     *
     * @param description Description of the object used as the prefix
     * of the error message.
     * @param x Object to test; {@code null} is rejected.
     * @throws IllegalArgumentException If the object is {@code null}
     * or does not implement {@code Compilable}.
     */
    static void assertCompilable(String description, Object x) {
        if (!(x instanceof Compilable)) {
            // Guard the class lookup so a null argument is reported as an
            // illegal argument rather than failing with a NullPointerException.
            Object foundClass = (x == null) ? null : x.getClass();
            String msg = description
                + " must implement com.aliasi.util.Compilable."
                + " Found class=" + foundClass;
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Externalizable proxy used by {@link #compileTo(ObjectOutput)}.
     * It writes the compiled form of a trainer and reads it back as a
     * {@link CompiledSpellChecker}.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 4907338741905144267L;
        private final TrainSpellChecker mTrainer;

        /** No-argument constructor, as used for deserialization. */
        public Externalizer() {
            this(null);
        }

        /**
         * Constructs an externalizer for the specified trainer.
         *
         * @param trainer Trainer to compile.
         */
        public Externalizer(TrainSpellChecker trainer) {
            mTrainer = trainer;
        }

        /**
         * Writes, in order: the compiled language model, a flag
         * indicating whether a tokenizer factory is present, a copy of
         * the token set if so, and the compiled edit distance.
         *
         * @param objOut Object output to which the data is written.
         * @throws IOException If there is an I/O error while writing.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            mTrainer.mLM.compileTo(objOut);
            boolean tokenizing = mTrainer.mTokenizerFactory != null;
            objOut.writeBoolean(tokenizing);
            if (tokenizing) {
                // Copy the keys into a plain HashSet so only the tokens,
                // not the counter map behind the key view, are written.
                Set<String> keySet = mTrainer.mTokenCounter.keySet();
                objOut.writeObject(new HashSet<String>(keySet));
            }
            // NOTE(review): presumably the edit distance was validated with
            // assertCompilable when the trainer was built -- confirm.
            ((Compilable) mTrainer.mEditDistance).compileTo(objOut);
        }

        /**
         * Reads the data written by {@link #writeExternal(ObjectOutput)}
         * and assembles a compiled spell checker from it.
         *
         * @param objIn Object input from which the data is read.
         * @return The compiled spell checker.
         * @throws ClassNotFoundException If a class read from the input
         * cannot be found.
         * @throws IOException If there is an I/O error while reading.
         */
        public Object read(ObjectInput objIn)
            throws ClassNotFoundException, IOException {

            CompiledNGramProcessLM lm
                = (CompiledNGramProcessLM) objIn.readObject();
            boolean tokenizing = objIn.readBoolean();
            Set tokenSet = tokenizing
                ? (Set) objIn.readObject()
                : null;
            WeightedEditDistance editDistance
                = (WeightedEditDistance) objIn.readObject();
            return new CompiledSpellChecker(lm,editDistance,tokenSet);
        }
    }

    /**
     * Serialization proxy returned by {@link #writeReplace()}.  It
     * writes the state of a trainer and reads it back as a
     * {@link TrainSpellChecker}.
     */
    static class Serializer extends AbstractExternalizable {
        // Left non-final with no explicit serialVersionUID: changing either
        // would alter the default stream UID of already-serialized trainers.
        private TrainSpellChecker mTrainer;

        /** No-argument constructor, as used for deserialization. */
        public Serializer() {
            this(null);
        }

        /**
         * Constructs a serializer for the specified trainer.
         *
         * @param trainer Trainer to serialize.
         */
        public Serializer(TrainSpellChecker trainer) {
            mTrainer = trainer;
        }

        /**
         * Writes, in order: the number of training characters, the
         * language model, a flag indicating whether a tokenizer factory
         * is present, the tokenizer factory and token counter if so,
         * and the edit distance.
         *
         * @param objOut Object output to which the data is written.
         * @throws IOException If there is an I/O error while writing.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeLong(mTrainer.mNumTrainingChars);
            objOut.writeObject(mTrainer.mLM);
            boolean tokenizing = mTrainer.mTokenizerFactory != null;
            objOut.writeBoolean(tokenizing);
            if (tokenizing) {
                AbstractExternalizable.serializeOrCompile(mTrainer.mTokenizerFactory,objOut);
                objOut.writeObject(mTrainer.mTokenCounter);
            }
            AbstractExternalizable.serializeOrCompile(mTrainer.mEditDistance,objOut);
        }

        /**
         * Reads the data written by {@link #writeExternal(ObjectOutput)}
         * and reconstructs the trainer.  The tokenizer factory and
         * token counter are passed as {@code null} when the tokenizing
         * flag is false.
         *
         * @param objIn Object input from which the data is read.
         * @return The reconstructed trainer.
         * @throws ClassNotFoundException If a class read from the input
         * cannot be found.
         * @throws IOException If there is an I/O error while reading.
         */
        public Object read(ObjectInput objIn)
            throws ClassNotFoundException, IOException {

            long numTrainingChars = objIn.readLong();
            NGramProcessLM lm = (NGramProcessLM) objIn.readObject();
            boolean tokenizing = objIn.readBoolean();
            TokenizerFactory tokenizerFactory = null;
            ObjectToCounterMap<String> tokenCounter = null;
            if (tokenizing) {
                tokenizerFactory = (TokenizerFactory) objIn.readObject();
                tokenCounter = (ObjectToCounterMap<String>) objIn.readObject();
            }
            WeightedEditDistance editDistance
                = (WeightedEditDistance) objIn.readObject();
            return new TrainSpellChecker(numTrainingChars,
                                         editDistance,
                                         lm,
                                         tokenizerFactory,
                                         tokenCounter);
        }
    }
}
上一页 12
💿 文件大小 4561 K
👤 上传用户 edan1181
📂 所属分类 Java编程
🏷️ 相关标签

#LingPipe #Java #自然语言处理 #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -