TokenizedLM.java
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 */
public void train(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs,start,end);
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    ArrayList<String> tokenList = new ArrayList<String>();
    while (true) {
        if (mDynamicWhitespaceModel != null) {
            String whitespace = tokenizer.nextWhitespace();
            mDynamicWhitespaceModel.train(whitespace);
        }
        // this'll pick up the last whitespace after last token
        String token = tokenizer.nextToken();
        if (token == null) break;
        tokenList.add(token);
    }
    int[] tokIds = new int[tokenList.size()+2];
    tokIds[0] = BOUNDARY_TOKEN;
    tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
        String token = it.next();
        // train underlying token model just once per token
        if (mDynamicUnknownTokenModel != null
            && mSymbolTable.symbolToID(token) < 0) {
            mDynamicUnknownTokenModel.train(token);
        }
        tokIds[i] = mSymbolTable.getOrAddSymbol(token);
    }
    mCounter.incrementSubsequences(tokIds,0,tokIds.length);
    mCounter.decrementUnigram(BOUNDARY_TOKEN);
}

/**
 * This method is a convenience implementation of the
 * <code>TextHandler</code> interface which delegates calls to
 * {@link #train(char[], int, int)}.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param length Length of slice.
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 */
public void handle(char[] cs, int start, int length) {
    train(cs,start,start+length);
}

/**
 * Trains the token sequence model, whitespace model (if dynamic)
 * and unknown token model (if dynamic).
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one plus last character in slice.
 * @param count Number of instances of sequence to train.
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 * @throws IllegalArgumentException If the count is negative.
 */
public void train(char[] cs, int start, int end, int count) {
    Strings.checkArgsStartEnd(cs,start,end);
    if (count < 0) {
        String msg = "Counts must be non-negative."
            + " Found count=" + count;
        throw new IllegalArgumentException(msg);
    }
    if (count == 0) return;
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    ArrayList<String> tokenList = new ArrayList<String>();
    while (true) {
        if (mDynamicWhitespaceModel != null) {
            String whitespace = tokenizer.nextWhitespace();
            mDynamicWhitespaceModel.train(whitespace,count);
        }
        // this'll pick up the last whitespace after last token
        String token = tokenizer.nextToken();
        if (token == null) break;
        tokenList.add(token);
    }
    int[] tokIds = new int[tokenList.size()+2];
    tokIds[0] = BOUNDARY_TOKEN;
    tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
        String token = it.next();
        // train underlying token model just once per token
        if (mDynamicUnknownTokenModel != null
            && mSymbolTable.symbolToID(token) < 0) {
            mDynamicUnknownTokenModel.train(token,count);
        }
        tokIds[i] = mSymbolTable.getOrAddSymbol(token);
    }
    mCounter.incrementSubsequences(tokIds,0,tokIds.length,count);
    mCounter.decrementUnigram(BOUNDARY_TOKEN,count);
}
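// Illustrative usage sketch, not part of the original source: drives the
// two train(...) methods above from client code.  Assumes this class is
// LingPipe's com.aliasi.lm.TokenizedLM with its
// TokenizedLM(TokenizerFactory,int) constructor, using the
// IndoEuropeanTokenizerFactory mentioned in the javadoc below (both would
// need to be imported).
static void exampleTrain() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer".toCharArray();
    lm.train(text, 0, text.length);     // train a single instance
    lm.train(text, 0, text.length, 5);  // train five instances at once
}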
/**
 * This method trains the last token in the sequence given the
 * previous tokens.  See {@link #trainSequence(CharSequence, int)}
 * for more information.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one plus last character in slice.
 * @param count Number of instances of sequence to train.
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 * @throws IllegalArgumentException If the count is negative.
 */
public void trainSequence(char[] cs, int start, int end, int count) {
    Strings.checkArgsStartEnd(cs,start,end);
    if (count < 0) {
        String msg = "Count must be non-negative. Found count=" + count;
        throw new IllegalArgumentException(msg);
    }
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    String[] tokens = tokenizer.tokenize();
    // if there are more tokens than the n-gram order, keep only the final ones
    int len = Math.min(tokens.length,nGramOrder());
    int offset = tokens.length - len;
    int[] tokIds = new int[len];
    for (int i = 0; i < len; ++i)
        tokIds[i] = mSymbolTable.getOrAddSymbol(tokens[i+offset]);
    mCounter.incrementSequence(tokIds,0,len,count);
}

/**
 * This method increments the count of the entire sequence
 * specified.  Note that it does not increment any of the token
 * subsequences and does not train the whitespace or token
 * smoothing models.
 *
 * <p>This method may be used to train a tokenized language model
 * from individual character sequence counts.  Because it does not
 * train the token smoothing models, a pure token model may be
 * constructed by instead calling <code>train(CharSequence,int)</code>
 * on character sequences corresponding to unigrams, which trains
 * token smoothing with character subsequences.
 *
 * <p>For instance, with
 * <code>com.aliasi.tokenizer.IndoEuropeanTokenizerFactory</code>,
 * calling <code>trainSequence("the fast computer",5)</code> would
 * extract three tokens, <code>the</code>, <code>fast</code> and
 * <code>computer</code>, and would increment the count of the
 * three-token sequence, but not any of its subsequences.
 *
 * <p>If the number of tokens is greater than the maximum n-gram
 * length, only the final tokens are trained.  For instance, with
 * an n-gram length of 2 and the Indo-European tokenizer factory,
 * calling <code>trainSequence("a slightly faster computer",93)</code>
 * is equivalent to calling
 * <code>trainSequence("faster computer",93)</code>.
 *
 * <p>All tokens trained are added to the symbol table.  This does
 * not include any initial tokens that are dropped because the
 * maximum n-gram length is too short.
 *
 * @param cSeq Character sequence to train.
 * @param count Number of instances to train.
 * @throws IllegalArgumentException If the count is negative.
 */
public void trainSequence(CharSequence cSeq, int count) {
    char[] cs = Strings.toCharArray(cSeq);
    trainSequence(cs,0,cs.length,count);
}
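// Illustrative sketch, not part of the original source: contrasts
// trainSequence with train, per the javadoc above.  With n-gram order 2,
// only the final bigram of a longer sequence is counted, and no token
// subsequences or smoothing models are touched.  Constructor as assumed
// in the earlier example.
static void exampleTrainSequence() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 2);
    lm.trainSequence("a slightly faster computer", 93);
    lm.trainSequence("faster computer", 93);  // equivalent call per the javadoc
}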
public double log2Estimate(CharSequence cSeq) {
    char[] cs = Strings.toCharArray(cSeq);
    return log2Estimate(cs,0,cs.length);
}

public double log2Estimate(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs,start,end);
    double logEstimate = 0.0;

    // collect tokens, estimate whitespaces
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    ArrayList<String> tokenList = new ArrayList<String>();
    while (true) {
        String whitespace = tokenizer.nextWhitespace();
        logEstimate += mWhitespaceModel.log2Estimate(whitespace);
        String token = tokenizer.nextToken();
        if (token == null) break;
        tokenList.add(token);
    }

    // collect token ids, estimate unknown tokens
    int[] tokIds = new int[tokenList.size()+2];
    tokIds[0] = BOUNDARY_TOKEN;
    tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
        String token = it.next();
        tokIds[i] = mSymbolTable.symbolToID(token);
        if (tokIds[i] < 0) {
            logEstimate += mUnknownTokenModel.log2Estimate(token);
        }
    }

    // estimate token ids, excluding start but including end
    for (int i = 2; i <= tokIds.length; ++i) {
        logEstimate += conditionalLog2TokenEstimate(tokIds,0,i);
    }
    return logEstimate;
}
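// Illustrative sketch, not part of the original source: log2Estimate gives
// the joint log (base 2) probability of the text, including whitespace and
// unknown-token estimates; dividing its negation by the text length yields
// a per-character cross-entropy rate.  Constructor as assumed above.
static void exampleLog2Estimate() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer".toCharArray();
    lm.train(text, 0, text.length);
    String test = "the slow computer";
    double log2Prob = lm.log2Estimate(test);
    double xEntropyRate = -log2Prob / test.length();  // bits per character
}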
class StringArrayAdapter implements IntArrayHandler {
    StringArrayHandler mHandler;
    public StringArrayAdapter(StringArrayHandler handler) {
        mHandler = handler;
    }
    public void handle(int[] nGram) {
        mHandler.handle(simpleNGramToTokens(nGram));
    }
    String[] simpleNGramToTokens(int[] nGram) {
        String[] tokens = new String[nGram.length];
        for (int i = 0; i < tokens.length; ++i)
            tokens[i] = nGram[i] >= 0
                ? mSymbolTable.idToSymbol(nGram[i])
                : null;
        return tokens;
    }
}

abstract class Collector implements IntArrayHandler {
    final BoundedPriorityQueue<ScoredObject<String[]>> mBPQ;
    Collector(int maxReturned, boolean reverse) {
        Comparator<Scored> comparator = reverse
            ? Scored.REVERSE_SCORE_COMPARATOR
            : Scored.SCORE_COMPARATOR;
        mBPQ = new BoundedPriorityQueue<ScoredObject<String[]>>(comparator,
                                                                maxReturned);
    }
    ScoredObject<String[]>[] nGrams() {
        ScoredObject<String[]>[] result
            = (ScoredObject<String[]>[]) new ScoredObject[mBPQ.size()];
        mBPQ.toArray(result);
        return result;
    }
    public void handle(int[] nGram) {
        // don't include n-grams containing boundary tokens
        for (int i = 0; i < nGram.length; ++i)
            if (nGram[i] < 0) return;
        mBPQ.add(new ScoredObject<String[]>(nGramToTokens(nGram),
                                            scoreNGram(nGram)));
    }
    abstract double scoreNGram(int[] nGram);
}

class FreqTermCollector extends Collector {
    FreqTermCollector(int maxReturned, boolean reverse) {
        super(maxReturned,reverse);
    }
    double scoreNGram(int[] nGram) {
        return mCounter.count(nGram,0,nGram.length);
    }
}

class CollocationCollector extends Collector {
    CollocationCollector(int maxReturned) {
        super(maxReturned,false);
    }
    double scoreNGram(int[] nGram) {
        return chiSquaredIndependence(nGram);
    }
}

class SigTermCollector extends Collector {
    final LanguageModel.Tokenized mBGModel;
    SigTermCollector(int maxReturned, LanguageModel.Tokenized bgModel,
                     boolean reverse) {
        super(maxReturned,reverse);
        mBGModel = bgModel;
    }
    double scoreNGram(int[] nGram) {
        String[] tokens = nGramToTokens(nGram);
        // count of the empty sequence is the total training token count
        int totalSampleCount = mCounter.count(nGram,0,0);
        int sampleCount = mCounter.count(nGram,0,nGram.length);
        double bgProb = mBGModel.tokenProbability(tokens,0,tokens.length);
        return BinomialDistribution.z(bgProb, sampleCount, totalSampleCount);
    }
}

String[] nGramToTokens(int[] nGram) {
    String[] toks = new String[nGram.length];
    for (int i = 0; i < nGram.length; ++i) {
        toks[i] = nGram[i] >= 0
            ? mSymbolTable.idToSymbol(nGram[i])
            : (i == 0) ? "*BEGIN*" : "*END*";
    }
    return toks;
}

public double tokenProbability(String[] tokens, int start, int end) {
    return Math.pow(2.0,tokenLog2Probability(tokens,start,end));
}

public double tokenLog2Probability(String[] tokens, int start, int end) {
    if (start < 0 || end < start || end > tokens.length) {
        String msg = "Indices out of bounds. Found start=" + start
            + " end=" + end + " tokens.length=" + tokens.length;
        throw new IndexOutOfBoundsException(msg);
    }
    double log2Estimate = 0.0;
    int[] tokIds = new int[tokens.length];
    for (int i = start; i < end; ++i) {
        tokIds[i] = mSymbolTable.symbolToID(tokens[i]);
        double conditionalLog2TokenEstimate
            = conditionalLog2TokenEstimate(tokIds,0,i+1);
        if (Double.isInfinite(conditionalLog2TokenEstimate)) {
            // back off to the unknown-token model, weighted by the
            // unigram-model probability of an unseen token
            double extCountD = mCounter.extensionCount(new int[0], 0, 0);
            double numTokensD = mSymbolTable.numSymbols();
            log2Estimate
                += com.aliasi.util.Math.log2(extCountD
                                             / (extCountD + numTokensD));
            log2Estimate += mUnknownTokenModel.log2Estimate(tokens[i]);
        } else {
            log2Estimate += conditionalLog2TokenEstimate;
        }
        if (Double.isInfinite(log2Estimate)) {
            // leftover debugging trace for zero-probability tokens
            System.out.println("tokens[" + i + "]=" + tokens[i]
                               + "\n id=" + tokIds[i]);
        }
    }
    return log2Estimate;
}

/**
 * Returns the log (base 2) probability of the specified tokens in
 * the underlying token n-gram distribution.  This includes the
 * estimation of the actual token for unknown tokens.
 *
 * @param tokens Tokens whose log probability is returned.
 * @return The log (base 2) probability of the tokens.
 */
public double processLog2Probability(String[] tokens) {
    return tokenLog2Probability(tokens,0,tokens.length);
}
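// Illustrative sketch, not part of the original source: scores an explicit
// token sequence.  tokenProbability is simply 2 raised to the value of
// tokenLog2Probability, as defined above.  Constructor as assumed earlier.
static void exampleTokenProbability() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer".toCharArray();
    lm.train(text, 0, text.length);
    String[] toks = { "the", "fast", "computer" };
    double log2p = lm.tokenLog2Probability(toks, 0, toks.length);
    double p = lm.tokenProbability(toks, 0, toks.length);  // == Math.pow(2.0, log2p)
}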
/**
 * Returns an array of collocations in order of confidence that
 * their token sequences are not independent.  The object contained
 * in the returned scored objects will be an instance of
 * <code>String[]</code> containing tokens.  The length of n-gram,
 * minimum count for a result and the maximum number of results
 * returned are all specified.  The confidence ordering is based on
 * the result of Pearson's &chi;<sup>2</sup> independence statistic
 * as computed by {@link #chiSquaredIndependence(int[])}.
 *
 * @param nGram Length of n-grams to search for collocations.
 * @param minCount Minimum count for a returned n-gram.
 * @param maxReturned Maximum number of results returned.
 * @return Array of collocations in confidence order.
 */
public ScoredObject<String[]>[] collocations(int nGram,
                                             int minCount,
                                             int maxReturned) {
    CollocationCollector collector = new CollocationCollector(maxReturned);
    mCounter.handleNGrams(nGram,minCount,collector);
    return collector.nGrams();
}
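// Illustrative sketch, not part of the original source: extracts the
// top-scoring bigram collocations after training.  Assumes LingPipe's
// ScoredObject accessors getObject() and score(), and the constructor
// assumed in the earlier examples.
static void exampleCollocations() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer beat the slow computer".toCharArray();
    lm.train(text, 0, text.length);
    // bigrams seen at least once, at most ten results, best first
    ScoredObject<String[]>[] top = lm.collocations(2, 1, 10);
    for (ScoredObject<String[]> so : top)
        System.out.println(java.util.Arrays.asList(so.getObject())
                           + " : " + so.score());
}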