📄 trainspellchecker.java
字号:
* training data from the counter. * * @return The counter for the tokens in the training set. */ public ObjectToCounterMap<String> tokenCounter() { return mTokenCounter; } /** * Train the spelling checker on the specified character sequence. * The sequence is normalized by normalizing all whitespace * sequences to a single space character and inserting an initial * and final whitespace. If a tokenization factory is specified, * a single space character is insterted between any tokens * not already separated by a white space. * * @param cSeq Character sequence for training. */ public void train(CharSequence cSeq) { mLM.train(normalizeQuery(cSeq)); mNumTrainingChars += cSeq.length(); } /** * Train the spelling checker on the specified character sequence * as if it had appeared with a frequency given by the specified * count. * * <p>See the method {@link #train(CharSequence)} for information * on the normalization carried out on the input character * sequence. * * <p>Although calling this method is equivalent to calling {@link * #train(CharSequence)} the specified count number of times, this * mehod is much more efficient because it does not require * iteration. * * <p>This method may be used to boost the training for a specified * input, or just to combine inputs into single method calls. * * @param cSeq Character sequence for training. * @param count Frequency of sequence to train. * @throws IllegalArgumentException If the specified count is negative. */ public void train(CharSequence cSeq, int count) { if (count < 0) { String msg = "Training counts must be non-negative." + " Found count=" + count; throw new IllegalArgumentException(msg); } if (count == 0) return; mLM.train(normalizeQuery(cSeq),count); mNumTrainingChars += count * cSeq.length(); } /** * Returns the total length in characters of all text used to * train the spell checker. * * @return The number of training characters seen. 
*/ public long numTrainingChars() { return mNumTrainingChars; } /** * Train the spelling checker on the specified character slice. * This method implements the necessary method for the {@link * TextHandler} interface. Otherwise, it behaves exactly the same * way as {@link #train(CharSequence)}. * * @param cs Underlying character array. * @param start Index of first character in slice. * @param length Number of characters in the slice. */ public void handle(char[] cs, int start, int length) { train(new String(cs,start,length)); } /** * Prunes the set of collected tokens of all tokens with count * less than the specified minimum. If there was no tokenization * factory specified for this spell checker, this method will * have no effect. * * @param minCount Minimum count of preserved token. */ public void pruneTokens(int minCount) { mTokenCounter.prune(minCount); } /** * Prunes the underlying character language model to remove * substring counts of less than the specified minimum. * * @param minCount Minimum count of preserved substrings. */ public void pruneLM(int minCount) { mLM.substringCounter().prune(minCount); } /** * Writes a compiled spell checker to the specified object output. * The class of the spell checker read back in is {@link * CompiledSpellChecker}. * * @param objOut Object output to which this spell checker is * written. * @throws IOException If there is an I/O error while writing. 
*/
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Returns a {@link Serializer} wrapping this trainer.  The
     * signature matches the Java serialization {@code writeReplace}
     * hook, so presumably this is the object written in place of the
     * trainer when it is serialized.
     *
     * @return A serializer for this trainer.
     */
    private Object writeReplace() {
        return new Serializer(this);
    }

    /**
     * Builds the normalized form of the specified character sequence
     * that is fed to the language model.
     *
     * <p>The result always starts with a single space.  If there is
     * no tokenizer factory, the whitespace-normalized input is
     * appended followed by a single space.  Otherwise the input is
     * tokenized and each token is appended followed by a single
     * space; as a side effect, each token is also passed to the
     * token counter's {@code increment} method.
     *
     * @param cSeq Character sequence to normalize.
     * @return Buffer holding the normalized sequence.
     */
    StringBuffer normalizeQuery(CharSequence cSeq) {
        StringBuffer sb = new StringBuffer();
        sb.append(' ');
        if (mTokenizerFactory == null) {
            Strings.normalizeWhitespace(cSeq, sb);
            sb.append(' ');
        } else {
            char[] cs = Strings.toCharArray(cSeq);
            Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, 0, cs.length);
            String nextToken;
            while ((nextToken = tokenizer.nextToken()) != null) {
                mTokenCounter.increment(nextToken);
                sb.append(nextToken);
                sb.append(' ');
            }
        }
        return sb;
    }

    /**
     * Verifies that the specified object implements {@code Compilable}.
     *
     * @param description Description of the object, used as the
     * prefix of the error message.
     * @param x Object to check; a {@code null} value is rejected.
     * @throws IllegalArgumentException If the object is {@code null}
     * or does not implement {@code Compilable}.
     */
    static void assertCompilable(String description, Object x) {
        if (!(x instanceof Compilable)) {
            // instanceof is false for null, so guard the getClass() call;
            // otherwise a null argument would surface as a
            // NullPointerException instead of the intended exception.
            String found = (x == null) ? "null" : String.valueOf(x.getClass());
            String msg = description
                + " must implement com.aliasi.util.Compilable."
                + " Found class=" + found;
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Externalizer that writes the compiled form of a trainer and
     * reads it back as a {@link CompiledSpellChecker}.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 4907338741905144267L;
        private final TrainSpellChecker mTrainer;

        /** Constructs an externalizer with no trainer. */
        public Externalizer() {
            this(null);
        }

        /**
         * Constructs an externalizer for the specified trainer.
         *
         * @param trainer Trainer to be compiled.
         */
        public Externalizer(TrainSpellChecker trainer) {
            mTrainer = trainer;
        }

        /**
         * Writes, in order: the compiled language model, a flag
         * indicating whether a tokenizer factory is present, the set
         * of token keys (only if the flag is true), and the compiled
         * edit distance.
         *
         * @param objOut Object output to which the data is written.
         * @throws IOException If there is an I/O error while writing.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            mTrainer.mLM.compileTo(objOut);
            boolean tokenizing = mTrainer.mTokenizerFactory != null;
            objOut.writeBoolean(tokenizing);
            if (tokenizing) {
                // Copy the key view into a standalone set before writing it.
                Set<String> keySet = mTrainer.mTokenCounter.keySet();
                objOut.writeObject(new HashSet<String>(keySet));
            }
            ((Compilable) mTrainer.mEditDistance).compileTo(objOut);
        }

        /**
         * Reads back the data in the order written by {@link
         * #writeExternal(ObjectOutput)} and assembles a compiled
         * spell checker.  The token set is {@code null} if the
         * tokenizing flag is false.
         *
         * @param objIn Object input from which the data is read.
         * @return The compiled spell checker.
         * @throws ClassNotFoundException If a class of a read object
         * cannot be found.
         * @throws IOException If there is an I/O error while reading.
         */
        public Object read(ObjectInput objIn)
            throws ClassNotFoundException, IOException {

            CompiledNGramProcessLM lm
                = (CompiledNGramProcessLM) objIn.readObject();
            boolean tokenizing = objIn.readBoolean();
            @SuppressWarnings("unchecked") // written as a set of token keys by writeExternal
            Set<String> tokenSet
                = tokenizing ? (Set<String>) objIn.readObject() : null;
            WeightedEditDistance editDistance
                = (WeightedEditDistance) objIn.readObject();
            return new CompiledSpellChecker(lm, editDistance, tokenSet);
        }
    }

    /**
     * Externalizer that writes the full trainable state of a trainer
     * and reads it back as a {@link TrainSpellChecker}.
     */
    static class Serializer extends AbstractExternalizable {
        private TrainSpellChecker mTrainer;

        /** Constructs a serializer with no trainer. */
        public Serializer() {
            this(null);
        }

        /**
         * Constructs a serializer for the specified trainer.
         *
         * @param trainer Trainer to be serialized.
         */
        public Serializer(TrainSpellChecker trainer) {
            mTrainer = trainer;
        }

        /**
         * Writes, in order: the number of training characters, the
         * language model, a flag indicating whether a tokenizer
         * factory is present, the tokenizer factory and token counter
         * (only if the flag is true), and the edit distance.
         *
         * @param objOut Object output to which the data is written.
         * @throws IOException If there is an I/O error while writing.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeLong(mTrainer.mNumTrainingChars);
            objOut.writeObject(mTrainer.mLM);
            boolean tokenizing = mTrainer.mTokenizerFactory != null;
            objOut.writeBoolean(tokenizing);
            if (tokenizing) {
                AbstractExternalizable
                    .serializeOrCompile(mTrainer.mTokenizerFactory, objOut);
                objOut.writeObject(mTrainer.mTokenCounter);
            }
            AbstractExternalizable
                .serializeOrCompile(mTrainer.mEditDistance, objOut);
        }

        /**
         * Reads back the data in the order written by {@link
         * #writeExternal(ObjectOutput)} and reconstructs a trainer.
         * The tokenizer factory and token counter passed to the
         * constructor are {@code null} if the tokenizing flag is false.
         *
         * @param objIn Object input from which the data is read.
         * @return The reconstructed trainer.
         * @throws ClassNotFoundException If a class of a read object
         * cannot be found.
         * @throws IOException If there is an I/O error while reading.
         */
        public Object read(ObjectInput objIn)
            throws ClassNotFoundException, IOException {

            long numTrainingChars = objIn.readLong();
            NGramProcessLM lm = (NGramProcessLM) objIn.readObject();
            boolean tokenizing = objIn.readBoolean();
            TokenizerFactory tokenizerFactory = null;
            ObjectToCounterMap<String> tokenCounter = null;
            if (tokenizing) {
                tokenizerFactory = (TokenizerFactory) objIn.readObject();
                @SuppressWarnings("unchecked") // written from the trainer's token counter
                ObjectToCounterMap<String> counter
                    = (ObjectToCounterMap<String>) objIn.readObject();
                tokenCounter = counter;
            }
            WeightedEditDistance editDistance
                = (WeightedEditDistance) objIn.readObject();
            return new TrainSpellChecker(numTrainingChars,
                                         editDistance,
                                         lm,
                                         tokenizerFactory,
                                         tokenCounter);
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -