📄 TokenizedLM.java
字号:
    /**
     * Returns an array of scored n-grams ordered by the significance
     * of the degree to which their counts in this model exceed their
     * expected counts in a specified background model.  The returned
     * scored object array contains {@link ScoredObject} instances
     * whose objects are terms represented as string arrays and whose
     * scores are the collocation score for the term.  For instance,
     * the new terms may be printed in order of significance by:
     *
     * <code><pre>
     * ScoredObject&lt;String[]&gt;[] terms = lm.newTerms(3,5,100,bgLM);
     * for (int i = 0; i &lt; terms.length; ++i) {
     *     String[] term = terms[i].getObject();
     *     double score = terms[i].score();
     *     ...
     * }
     * </pre></code>
     *
     * <P>The exact scoring used is the z-score as defined in {@link
     * BinomialDistribution#z(double,int,int)} with the success
     * probability defined by the n-gram's probability estimate in the
     * background model, the number of successes being the count of
     * the n-gram in this model and the number of trials being the
     * total count in this model.
     *
     * <p>See {@link #oldTerms(int,int,int,LanguageModel.Tokenized)}
     * for a method that returns the least significant terms in
     * this model relative to a background model.
     *
     * @param nGram Length of n-grams to search for significant new terms.
     * @param minCount Minimum count for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @param backgroundLM Background language model against which
     * significance is measured.
     * @return Array of new terms ordered by significance.
     */
    public ScoredObject<String[]>[] newTerms(int nGram, int minCount,
                                             int maxReturned,
                                             LanguageModel.Tokenized backgroundLM) {
        return sigTerms(nGram,minCount,maxReturned,backgroundLM,false);
    }

    /**
     * Returns an array of scored n-grams ordered in reverse order
     * of significance with respect to the background model.  In
     * other words, these are ones that occur less often in this
     * model than they would have been expected to given the
     * background model.
     *
     * <p>Note that only terms that exist in the foreground model are
     * considered.  By contrast, reversing the roles of the models in
     * the sister method {@link
     * #newTerms(int,int,int,LanguageModel.Tokenized)} considers
     * every n-gram in the background model and may return slightly
     * different results.
     *
     * @param nGram Length of n-grams to search for significant old terms.
     * @param minCount Minimum count in background model for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @param backgroundLM Background language model from which counts are
     * derived.
     * @return Array of old terms ordered by significance.
     */
    public ScoredObject<String[]>[] oldTerms(int nGram, int minCount,
                                             int maxReturned,
                                             LanguageModel.Tokenized backgroundLM) {
        return sigTerms(nGram,minCount,maxReturned,backgroundLM,true);
    }

    // Shared driver for newTerms/oldTerms: walks n-grams of the given
    // length with at least minCount occurrences and scores each against
    // the background model; reverse selects least-significant ordering.
    private ScoredObject<String[]>[] sigTerms(int nGram, int minCount,
                                              int maxReturned,
                                              LanguageModel.Tokenized backgroundLM,
                                              boolean reverse) {
        SigTermCollector collector
            = new SigTermCollector(maxReturned,backgroundLM,reverse);
        mCounter.handleNGrams(nGram,minCount,collector);
        return collector.nGrams();
    }

    /**
     * Returns the most frequent n-gram terms in the training data up
     * to the specified maximum number.  The terms are ordered by raw
     * counts and returned in order.  The scored objects in the return
     * array have objects that are the terms themselves and
     * scores based on count.
     *
     * <p>See {@link #infrequentTerms(int,int)} to retrieve the least
     * frequent terms.
     *
     * @param nGram Length of n-grams to search.
     * @param maxReturned Maximum number of results returned.
     */
    public ScoredObject<String[]>[] frequentTerms(int nGram, int maxReturned) {
        return freqTerms(nGram,maxReturned,false);
    }

    // Shared driver for frequentTerms/infrequentTerms: collects all
    // n-grams of the given length (minimum count 1), ordered by raw
    // count; reverse selects ascending (least frequent first) order.
    private ScoredObject<String[]>[] freqTerms(int nGram, int maxReturned,
                                               boolean reverse) {
        FreqTermCollector collector
            = new FreqTermCollector(maxReturned,reverse);
        mCounter.handleNGrams(nGram,1,collector);
        return collector.nGrams();
    }

    /**
     * Returns the least frequent n-gram terms in the training data up
     * to the specified maximum number.  The terms are ordered by raw
     * counts and returned in reverse order.  The scored objects in
     * the return array have objects that are the terms themselves and
     * scores based on count.
     *
     * <p>See {@link #frequentTerms(int,int)} to retrieve the most
     * frequent terms.
     *
     * @param nGram Length of n-grams to search.
     * @param maxReturned Maximum number of results returned.
     */
    public ScoredObject<String[]>[] infrequentTerms(int nGram, int maxReturned) {
        return freqTerms(nGram,maxReturned,true);
    }

    /**
     * Returns the maximum value of Pearson's X<sup>2</sup>
     * independence test statistic resulting from splitting the
     * specified n-gram in half to derive a contingency matrix.
     * Higher return values indicate more dependence among the terms
     * in the n-gram.
     *
     * <P>The input n-gram is split into two halves,
     * <code>Term<sub>1</sub></code> and
     * <code>Term<sub>2</sub></code>, each of which is a
     * non-empty sequence of integers.
     * <code>Term<sub>1</sub></code> consists of the tokens
     * indexed <code>0</code> to <code>mid-1</code> and
     * <code>Term<sub>2</sub></code> from <code>mid</code>
     * to <code>end-1</code>.
     *
     * <P>The contingency matrix for computing the independence
     * statistic is:
     *
     * <blockquote>
     * <table border='1' cellpadding='5'>
     * <tr><td>&nbsp;</td><td>+Term<sub>2</sub></td><td>-Term<sub>2</sub></td></tr>
     * <tr><td>+Term<sub>1</sub></td><td>Term(+,+)</td><td>Term(+,-)</td></tr>
     * <tr><td>-Term<sub>1</sub></td><td>Term(-,+)</td><td>Term(-,-)</td></tr>
     * </table>
     * </blockquote>
     *
     * where values for a specified integer sequence
     * <code>nGram</code> and midpoint <code>0 &lt; mid &lt; end</code> is:
     *
     * <blockquote><code>
     * Term(+,+) = count(nGram,0,end)
     * <br>
     * Term(+,-) = count(nGram,0,mid) - count(nGram,0,end)
     * <br>
     * Term(-,+) = count(nGram,mid,end) - count(nGram,0,end)
     * <br>
     * Term(-,-) = totalCount - Term(+,+) - Term(+,-) - Term(-,+)
     * </code></blockquote>
     *
     * Note that using the overall total count provides a slight
     * overapproximation of the count of appropriate-length n-grams.
     *
     * <P>For further information on the independence test, see the
     * documentation for {@link
     * Statistics#chiSquaredIndependence(double,double,double,double)}.
     *
     * @param nGram Array of integers whose independence
     * statistic is returned.
     * @return Maximum independence test statistic score for splits of
     * the n-gram.
     * @throws IllegalArgumentException If the specified n-gram is not at
     * least two elements long.
     */
    public double chiSquaredIndependence(int[] nGram) {
        if (nGram.length < 2) {
            String msg = "Require n-gram >= 2 for chi square independence."
                + " Found nGram length=" + nGram.length;
            throw new IllegalArgumentException(msg);
        }
        if (nGram.length == 2) {
            return chiSquaredSplit(nGram,1);
        }
        double bestScore = Double.NEGATIVE_INFINITY;
        // NOTE(review): the bound mid+1 < nGram.length stops at
        // mid = length-2, so the final split point mid = length-1 (last
        // token alone as Term2) is never tried, although the javadoc
        // allows 0 < mid < end and the length==2 special case above uses
        // exactly that split.  Looks like an off-by-one -- confirm intent
        // before changing, as it alters collocation scores.
        for (int mid = 1; mid+1 < nGram.length; ++mid)
            bestScore = Math.max(bestScore,
                                 chiSquaredSplit(nGram,mid));
        return bestScore;
    }

    /**
     * Returns the z-score of the specified n-gram with the specified
     * count out of a total sample count, as measured against the
     * expectation of this tokenized language model.  Negative
     * z-scores mean the sample n-gram count is lower than expected
     * and positive z-scores mean the sample n-gram count is higher
     * than expected.  Z-scores close to zero indicate the sample
     * count is in line with expectations according to this language
     * model.
     *
     * <P>Formulas for z-scores and an explanation of their scaling by
     * deviation is described in the documentation for the static
     * method {@link BinomialDistribution#z(double,int,int)}.
     *
     * @param nGram The n-gram to test.
     * @param nGramSampleCount The number of observations of the
     * n-gram in the sample.
     * @param totalSampleCount The total number of samples.
     * @return The z-score for the specified sample counts against the
     * expectations of this language model.
     */
    public double z(int[] nGram, int nGramSampleCount, int totalSampleCount) {
        // success probability = maximum-likelihood estimate of the
        // n-gram under this model: count(nGram) / total count
        double totalCount = mCounter.count(nGram,0,0);
        double nGramCount = mCounter.count(nGram,0,nGram.length);
        double successProbability = nGramCount / totalCount;
        return BinomialDistribution.z(successProbability,
                                      nGramSampleCount,
                                      totalSampleCount);
    }

    /**
     * Returns a string-based representation of the token
     * counts for this language model.
     *
     * @return A string-based representation of this model.
     */
    public String toString() {
        return mCounter.mRootNode.toString(mSymbolTable);
    }

    // Log (base 2) estimate of the token at position end-1 given the
    // context tokIds[start..end-2], interpolating across successively
    // longer contexts.  For each context, lambda weights the observed
    // counts against the shorter-context estimate accumulated so far
    // (lambda = extCount / (extCount + mLambdaFactor * numExtensions)).
    // An UNKNOWN_TOKEN outcome starts from estimate 1.0 and only picks
    // up the interpolation discounts (the count-based increment is
    // skipped via the continue).
    private double conditionalLog2TokenEstimate(int[] tokIds, int start, int end) {
        if (end < 1) return 0.0; // this can't get hit from current calls; end >= 1
        int maxLength = mCounter.maxLength();
        int contextEnd = end-1;
        double estimate = tokIds[end-1] == UNKNOWN_TOKEN ? 1.0 : 0.0;
        // widen the context one token at a time, newest-first, until it
        // exceeds the counter's maximum stored n-gram length or has no
        // observed extensions
        for (int contextStart = end-1;
             (contextStart >= start
              && (end-contextStart) <= maxLength);
             --contextStart) {
            int numExtensions = mCounter.numExtensions(tokIds,contextStart,contextEnd);
            if (numExtensions == 0) break;
            double extCountD = mCounter.extensionCount(tokIds,contextStart,contextEnd);
            double lambda = extCountD
                / (extCountD + mLambdaFactor * (double) numExtensions);
            estimate = estimate * (1.0 - lambda);
            if (tokIds[end-1] == UNKNOWN_TOKEN) continue;
            int count = mCounter.count(tokIds,contextStart,end);
            if (count > 0)
                estimate += (lambda * ((double) count))/extCountD;
        }
        return com.aliasi.util.Math.log2(estimate);
    }

    // Computes Pearson's chi-squared independence statistic for the
    // n-gram split at mid into a leading term (1_) and trailing term
    // (_2), using the contingency table below.
    private double chiSquaredSplit(int[] pair, int mid) {
        // contingency table & probabilities
        //      _2   _y
        // 1_   12   1y
        // x_   x2   xy
        long count12 = mCounter.count(pair,0,pair.length);
        long count1_ = mCounter.count(pair,0,mid);
        long count_2 = mCounter.count(pair,mid,pair.length);
        long n = (long) mCounter.extensionCount(pair,0,0);
        long countxy = n - count1_ - count_2 + count12;
        long countx2 = count_2 - count12;
        long count1y = count1_ - count12;
        return Statistics.chiSquaredIndependence(count12,count1y,countx2,countxy);
    }

    // Returns the breadth-first index of the last trie node that has
    // at least one extension, minus one to skip the root.  Used by the
    // compiler below to decide which nodes carry interpolation data.
    private int lastInternalNodeIndex() {
        int last = 1;
        LinkedList queue = new LinkedList();
        queue.add(mCounter.mRootNode);
        for (int i = 1; !queue.isEmpty(); ++i) {
            IntNode node = (IntNode) queue.removeFirst();
            if (node.numExtensions() > 0) last = i;
            node.addDaughters(queue);
        }
        return last-1;
    }

    // NOTE(review): original comment truncated in source ("this is also
    // value returned by ..."); UNKNOWN_TOKEN mirrors the symbol table's
    // id for unknown symbols -- confirm against SymbolTable docs.
    static final int UNKNOWN_TOKEN = SymbolTable.UNKNOWN_SYMBOL_ID;
    // sentinel id for sequence boundaries; never a real symbol id
    static final int BOUNDARY_TOKEN = -2;

    // Returns a new array equal to is with i appended.
    private static int[] concatenate(int[] is, int i) {
        int[] result = new int[is.length+1];
        System.arraycopy(is,0,result,0,is.length);
        result[is.length] = i;
        return result;
    }

    // Serialization proxy: writeExternal emits the compiled binary form
    // of the wrapped TokenizedLM; read reconstitutes it as a
    // CompiledTokenizedLM.  The wire format is, in order: tokenizer
    // factory class name (UTF), symbol table, compiled unknown-token
    // model, compiled whitespace model, n-gram order, node count, last
    // internal node index, then one record per trie node in
    // breadth-first order.
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 6135272620545804504L;
        // model being serialized; null only for the no-arg
        // deserialization constructor
        final TokenizedLM mLM;
        public Externalizer() {
            this(null);
        }
        public Externalizer(TokenizedLM lm) {
            mLM = lm;
        }
        public Object read(ObjectInput in) throws IOException {
            try {
                return new CompiledTokenizedLM(in);
            } catch (ClassNotFoundException e) {
                // preserve the original exception as the IOException cause
                throw Exceptions.toIO("TokenizedLM.Externalizer.read()",e);
            }
        }
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeUTF(mLM.mTokenizerFactory.getClass().getName());
            mLM.mSymbolTable.writeTo(objOut);
            ((LanguageModel.Dynamic) mLM.mUnknownTokenModel).compileTo(objOut);
            ((LanguageModel.Dynamic) mLM.mWhitespaceModel).compileTo(objOut);
            objOut.writeInt(mLM.mNGramOrder);
            int numNodes = mLM.mCounter.mRootNode.trieSize();
            objOut.writeInt(numNodes);
            int lastInternalNodeIndex = mLM.lastInternalNodeIndex();
            objOut.writeInt(lastInternalNodeIndex);
            // write root node (-int,-logP,-log(1-L),firstDtr)
            objOut.writeInt(Integer.MIN_VALUE); // root symbol unknown
            objOut.writeFloat(Float.NaN); // no estimate
            objOut.writeFloat((float) com.aliasi.util.Math
                              .log2(1.0-mLM.lambda(com.aliasi.util.Arrays
                                                   .EMPTY_INT_ARRAY)));
            objOut.writeInt(1); // first dtr = 1
            // breadth-first walk of the trie; queue holds full n-gram
            // paths so estimates can be recomputed per node
            LinkedList queue = new LinkedList();
            int[] outcomes = mLM.mCounter.mRootNode
                .integersFollowing(com.aliasi.util.Arrays.EMPTY_INT_ARRAY,0,0);
            for (int i = 0; i < outcomes.length; ++i)
                queue.add(new int[] { outcomes[i] });
            for (int i = 1; !queue.isEmpty(); ++i) {
                int[] is = (int[]) queue.removeFirst();
                // node record: symbol, log2 conditional estimate, and for
                // internal nodes only, log2(1-lambda) plus the index of
                // the node's first daughter
                objOut.writeInt(is[is.length-1]);
                objOut.writeFloat((float) mLM.conditionalLog2TokenEstimate(is,0,is.length));
                if (i <= lastInternalNodeIndex) {
                    objOut.writeFloat((float) com.aliasi.util.Math.log2(1.0-mLM.lambda(is)));
                    objOut.writeInt(i+queue.size()+1);
                }
                int[] followers = mLM.mCounter.mRootNode.integersFollowing(is,0,is.length);
                for (int j = 0; j < followers.length; ++j)
                    queue.add(concatenate(is,followers[j]));
            }
        }
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -