📄 TokenizedLM.java
字号:
    /**
     * Returns an array of scored n-grams ordered by the significance
     * of the degree to which their counts in this model exceed their
     * expected counts in a specified background model.  The returned
     * scored object array contains {@link ScoredObject} instances
     * whose objects are terms represented as string arrays and whose
     * scores are the collocation score for the term.  For instance,
     * the new terms may be printed in order of significance by:
     *
     * <code><pre>
     * ScoredObject&lt;String[]&gt;[] terms = lm.newTerms(3,5,100,bgLM);
     * for (int i = 0; i &lt; terms.length; ++i) {
     *     String[] term = terms[i].getObject();
     *     double score = terms[i].score();
     *     ...
     * }
     * </pre></code>
     *
     * <P>The exact scoring used is the z-score as defined in {@link
     * BinomialDistribution#z(double,int,int)} with the success
     * probability defined by the n-gram's probability estimate in the
     * background model, the number of successes being the count of
     * the n-gram in this model and the number of trials being the
     * total count in this model.
     *
     * <p>See {@link #oldTerms(int,int,int,LanguageModel.Tokenized)}
     * for a method that returns the least significant terms in
     * this model relative to a background model.
     *
     * @param nGram Length of n-grams to search for significant new terms.
     * @param minCount Minimum count for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @param backgroundLM Background language model against which
     * significance is measured.
     * @return Array of new terms ordered by significance.
     */
    public ScoredObject<String[]>[] newTerms(int nGram, int minCount,
                                             int maxReturned,
                                             LanguageModel.Tokenized backgroundLM) {
        return sigTerms(nGram,minCount,maxReturned,backgroundLM,false);
    }

    /**
     * Returns an array of scored n-grams ordered in reverse order
     * of significance with respect to the background model.  In
     * other words, these are ones that occur less often in this
     * model than they would have been expected to given the
     * background model.
     *
     * <p>Note that only terms that exist in the foreground model are
     * considered.  By contrast, reversing the roles of the models in
     * the sister method {@link
     * #newTerms(int,int,int,LanguageModel.Tokenized)} considers
     * every n-gram in the background model and may return slightly
     * different results.
     *
     * @param nGram Length of n-grams to search for significant old terms.
     * @param minCount Minimum count in background model for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @param backgroundLM Background language model from which counts are
     * derived.
     * @return Array of old terms ordered by significance.
     */
    public ScoredObject<String[]>[] oldTerms(int nGram, int minCount,
                                             int maxReturned,
                                             LanguageModel.Tokenized backgroundLM) {
        return sigTerms(nGram,minCount,maxReturned,backgroundLM,true);
    }

    // Shared driver for newTerms/oldTerms: walks n-grams of the given
    // length with at least minCount occurrences and scores each against
    // the background model; reverse selects least-significant ordering.
    private ScoredObject<String[]>[] sigTerms(int nGram, int minCount,
                                              int maxReturned,
                                              LanguageModel.Tokenized backgroundLM,
                                              boolean reverse) {
        SigTermCollector collector
            = new SigTermCollector(maxReturned,backgroundLM,reverse);
        mCounter.handleNGrams(nGram,minCount,collector);
        return collector.nGrams();
    }

    /**
     * Returns the most frequent n-gram terms in the training data up
     * to the specified maximum number.  The terms are ordered by raw
     * counts and returned in order.  The scored objects in the return
     * array have objects that are the terms themselves and
     * scores based on count.
     *
     * <p>See {@link #infrequentTerms(int,int)} to retrieve the least
     * frequent terms.
     *
     * @param nGram Length of n-grams to search.
     * @param maxReturned Maximum number of results returned.
     */
    public ScoredObject<String[]>[] frequentTerms(int nGram, int maxReturned) {
        return freqTerms(nGram,maxReturned,false);
    }

    // Shared driver for frequentTerms/infrequentTerms: collects all
    // n-grams of the given length (minimum count 1), ordered by raw
    // count; reverse selects ascending (least frequent first) order.
    private ScoredObject<String[]>[] freqTerms(int nGram, int maxReturned,
                                               boolean reverse) {
        FreqTermCollector collector
            = new FreqTermCollector(maxReturned,reverse);
        mCounter.handleNGrams(nGram,1,collector);
        return collector.nGrams();
    }

    /**
     * Returns the least frequent n-gram terms in the training data up
     * to the specified maximum number.  The terms are ordered by raw
     * counts and returned in reverse order.  The scored objects in
     * the return array have objects that are the terms themselves and
     * scores based on count.
     *
     * <p>See {@link #frequentTerms(int,int)} to retrieve the most
     * frequent terms.
     *
     * @param nGram Length of n-grams to search.
     * @param maxReturned Maximum number of results returned.
     */
    public ScoredObject<String[]>[] infrequentTerms(int nGram, int maxReturned) {
        return freqTerms(nGram,maxReturned,true);
    }

    /**
     * Returns the maximum value of Pearson's X<sup>2</sup>
     * independence test statistic resulting from splitting the
     * specified n-gram in half to derive a contingency matrix.
     * Higher return values indicate more dependence among the terms
     * in the n-gram.
     *
     * <P>The input n-gram is split into two halves,
     * <code>Term<sub>1</sub></code> and
     * <code>Term<sub>2</sub></code>, each of which is a
     * non-empty sequence of integers.
     * <code>Term<sub>1</sub></code> consists of the tokens
     * indexed <code>0</code> to <code>mid-1</code> and
     * <code>Term<sub>2</sub></code> from <code>mid</code>
     * to <code>end-1</code>.
     *
     * <P>The contingency matrix for computing the independence
     * statistic is:
     *
     * <blockquote>
     * <table border='1' cellpadding='5'>
     * <tr><td>&nbsp;</td><td>+Term<sub>2</sub></td><td>-Term<sub>2</sub></td></tr>
     * <tr><td>+Term<sub>1</sub></td><td>Term(+,+)</td><td>Term(+,-)</td></tr>
     * <tr><td>-Term<sub>1</sub></td><td>Term(-,+)</td><td>Term(-,-)</td></tr>
     * </table>
     * </blockquote>
     *
     * where values for a specified integer sequence
     * <code>nGram</code> and midpoint <code>0 &lt; mid &lt; end</code> is:
     *
     * <blockquote><code>
     * Term(+,+) = count(nGram,0,end)
     * <br>
     * Term(+,-) = count(nGram,0,mid) - count(nGram,0,end)
     * <br>
     * Term(-,+) = count(nGram,mid,end) - count(nGram,0,end)
     * <br>
     * Term(-,-) = totalCount - Term(+,+) - Term(+,-) - Term(-,+)
     * </code></blockquote>
     *
     * Note that using the overall total count provides a slight
     * overapproximation of the count of appropriate-length n-grams.
     *
     * <P>For further information on the independence test, see the
     * documentation for {@link
     * Statistics#chiSquaredIndependence(double,double,double,double)}.
     *
     * @param nGram Array of integers whose independence
     * statistic is returned.
     * @return Maximum independence test statistic score for splits of
     * the n-gram.
     * @throws IllegalArgumentException If the specified n-gram is not at
     * least two elements long.
     */
    public double chiSquaredIndependence(int[] nGram) {
        if (nGram.length < 2) {
            String msg = "Require n-gram >= 2 for chi square independence."
                + " Found nGram length=" + nGram.length;
            throw new IllegalArgumentException(msg);
        }
        if (nGram.length == 2) {
            return chiSquaredSplit(nGram,1);
        }
        double bestScore = Double.NEGATIVE_INFINITY;
        // NOTE(review): the bound mid+1 < nGram.length stops at
        // mid = length-2, so the final split point mid = length-1 (last
        // token alone as Term2) is never tried, although the javadoc
        // allows 0 < mid < end and the length==2 special case above uses
        // exactly that split.  Looks like an off-by-one -- confirm intent
        // before changing, as it alters collocation scores.
        for (int mid = 1; mid+1 < nGram.length; ++mid)
            bestScore = Math.max(bestScore,
                                 chiSquaredSplit(nGram,mid));
        return bestScore;
    }

    /**
     * Returns the z-score of the specified n-gram with the specified
     * count out of a total sample count, as measured against the
     * expectation of this tokenized language model.  Negative
     * z-scores mean the sample n-gram count is lower than expected
     * and positive z-scores mean the sample n-gram count is higher
     * than expected.  Z-scores close to zero indicate the sample
     * count is in line with expectations according to this language
     * model.
     *
     * <P>Formulas for z-scores and an explanation of their scaling by
     * deviation is described in the documentation for the static
     * method {@link BinomialDistribution#z(double,int,int)}.
     *
     * @param nGram The n-gram to test.
     * @param nGramSampleCount The number of observations of the
     * n-gram in the sample.
     * @param totalSampleCount The total number of samples.
     * @return The z-score for the specified sample counts against the
     * expectations of this language model.
     */
    public double z(int[] nGram, int nGramSampleCount, int totalSampleCount) {
        // success probability = maximum-likelihood estimate of the
        // n-gram under this model: count(nGram) / total count
        double totalCount = mCounter.count(nGram,0,0);
        double nGramCount = mCounter.count(nGram,0,nGram.length);
        double successProbability = nGramCount / totalCount;
        return BinomialDistribution.z(successProbability,
                                      nGramSampleCount,
                                      totalSampleCount);
    }

    /**
     * Returns a string-based representation of the token
     * counts for this language model.
     *
     * @return A string-based representation of this model.
     */
    public String toString() {
        return mCounter.mRootNode.toString(mSymbolTable);
    }

    // Log (base 2) estimate of the token at position end-1 given the
    // context tokIds[start..end-2], interpolating across successively
    // longer contexts.  For each context, lambda weights the observed
    // counts against the shorter-context estimate accumulated so far
    // (lambda = extCount / (extCount + mLambdaFactor * numExtensions)).
    // An UNKNOWN_TOKEN outcome starts from estimate 1.0 and only picks
    // up the interpolation discounts (the count-based increment is
    // skipped via the continue).
    private double conditionalLog2TokenEstimate(int[] tokIds, int start, int end) {
        if (end < 1) return 0.0; // this can't get hit from current calls; end >= 1
        int maxLength = mCounter.maxLength();
        int contextEnd = end-1;
        double estimate = tokIds[end-1] == UNKNOWN_TOKEN ? 1.0 : 0.0;
        // widen the context one token at a time, newest-first, until it
        // exceeds the counter's maximum stored n-gram length or has no
        // observed extensions
        for (int contextStart = end-1;
             (contextStart >= start
              && (end-contextStart) <= maxLength);
             --contextStart) {
            int numExtensions = mCounter.numExtensions(tokIds,contextStart,contextEnd);
            if (numExtensions == 0) break;
            double extCountD = mCounter.extensionCount(tokIds,contextStart,contextEnd);
            double lambda = extCountD
                / (extCountD + mLambdaFactor * (double) numExtensions);
            estimate = estimate * (1.0 - lambda);
            if (tokIds[end-1] == UNKNOWN_TOKEN) continue;
            int count = mCounter.count(tokIds,contextStart,end);
            if (count > 0)
                estimate += (lambda * ((double) count))/extCountD;
        }
        return com.aliasi.util.Math.log2(estimate);
    }

    // Computes Pearson's chi-squared independence statistic for the
    // n-gram split at mid into a leading term (1_) and trailing term
    // (_2), using the contingency table below.
    private double chiSquaredSplit(int[] pair, int mid) {
        // contingency table & probabilities
        //      _2   _y
        // 1_   12   1y
        // x_   x2   xy
        long count12 = mCounter.count(pair,0,pair.length);
        long count1_ = mCounter.count(pair,0,mid);
        long count_2 = mCounter.count(pair,mid,pair.length);
        long n = (long) mCounter.extensionCount(pair,0,0);
        long countxy = n - count1_ - count_2 + count12;
        long countx2 = count_2 - count12;
        long count1y = count1_ - count12;
        return Statistics.chiSquaredIndependence(count12,count1y,countx2,countxy);
    }

    // Returns the breadth-first index of the last trie node that has
    // at least one extension, minus one to skip the root.  Used by the
    // compiler below to decide which nodes carry interpolation data.
    private int lastInternalNodeIndex() {
        int last = 1;
        LinkedList queue = new LinkedList();
        queue.add(mCounter.mRootNode);
        for (int i = 1; !queue.isEmpty(); ++i) {
            IntNode node = (IntNode) queue.removeFirst();
            if (node.numExtensions() > 0) last = i;
            node.addDaughters(queue);
        }
        return last-1;
    }

    // NOTE(review): original comment truncated in source ("this is also
    // value returned by ..."); UNKNOWN_TOKEN mirrors the symbol table's
    // id for unknown symbols -- confirm against SymbolTable docs.
    static final int UNKNOWN_TOKEN = SymbolTable.UNKNOWN_SYMBOL_ID;
    // sentinel id for sequence boundaries; never a real symbol id
    static final int BOUNDARY_TOKEN = -2;

    // Returns a new array equal to is with i appended.
    private static int[] concatenate(int[] is, int i) {
        int[] result = new int[is.length+1];
        System.arraycopy(is,0,result,0,is.length);
        result[is.length] = i;
        return result;
    }

    // Serialization proxy: writeExternal emits the compiled binary form
    // of the wrapped TokenizedLM; read reconstitutes it as a
    // CompiledTokenizedLM.  The wire format is, in order: tokenizer
    // factory class name (UTF), symbol table, compiled unknown-token
    // model, compiled whitespace model, n-gram order, node count, last
    // internal node index, then one record per trie node in
    // breadth-first order.
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 6135272620545804504L;
        // model being serialized; null only for the no-arg
        // deserialization constructor
        final TokenizedLM mLM;
        public Externalizer() {
            this(null);
        }
        public Externalizer(TokenizedLM lm) {
            mLM = lm;
        }
        public Object read(ObjectInput in) throws IOException {
            try {
                return new CompiledTokenizedLM(in);
            } catch (ClassNotFoundException e) {
                // preserve the original exception as the IOException cause
                throw Exceptions.toIO("TokenizedLM.Externalizer.read()",e);
            }
        }
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeUTF(mLM.mTokenizerFactory.getClass().getName());
            mLM.mSymbolTable.writeTo(objOut);
            ((LanguageModel.Dynamic) mLM.mUnknownTokenModel).compileTo(objOut);
            ((LanguageModel.Dynamic) mLM.mWhitespaceModel).compileTo(objOut);
            objOut.writeInt(mLM.mNGramOrder);
            int numNodes = mLM.mCounter.mRootNode.trieSize();
            objOut.writeInt(numNodes);
            int lastInternalNodeIndex = mLM.lastInternalNodeIndex();
            objOut.writeInt(lastInternalNodeIndex);
            // write root node (-int,-logP,-log(1-L),firstDtr)
            objOut.writeInt(Integer.MIN_VALUE); // root symbol unknown
            objOut.writeFloat(Float.NaN); // no estimate
            objOut.writeFloat((float) com.aliasi.util.Math
                              .log2(1.0-mLM.lambda(com.aliasi.util.Arrays
                                                   .EMPTY_INT_ARRAY)));
            objOut.writeInt(1); // first dtr = 1
            // breadth-first walk of the trie; queue holds full n-gram
            // paths so estimates can be recomputed per node
            LinkedList queue = new LinkedList();
            int[] outcomes = mLM.mCounter.mRootNode
                .integersFollowing(com.aliasi.util.Arrays.EMPTY_INT_ARRAY,0,0);
            for (int i = 0; i < outcomes.length; ++i)
                queue.add(new int[] { outcomes[i] });
            for (int i = 1; !queue.isEmpty(); ++i) {
                int[] is = (int[]) queue.removeFirst();
                // node record: symbol, log2 conditional estimate, and for
                // internal nodes only, log2(1-lambda) plus the index of
                // the node's first daughter
                objOut.writeInt(is[is.length-1]);
                objOut.writeFloat((float) mLM.conditionalLog2TokenEstimate(is,0,is.length));
                if (i <= lastInternalNodeIndex) {
                    objOut.writeFloat((float) com.aliasi.util.Math.log2(1.0-mLM.lambda(is)));
                    objOut.writeInt(i+queue.size()+1);
                }
                int[] followers = mLM.mCounter.mRootNode.integersFollowing(is,0,is.length);
                for (int j = 0; j < followers.length; ++j)
                    queue.add(concatenate(is,followers[j]));
            }
        }
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -