📄 triecharseqcounter.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.lm;// import com.aliasi.util.Arrays;import com.aliasi.io.BitInput;import com.aliasi.io.BitOutput;import com.aliasi.util.ObjectToCounterMap;import com.aliasi.util.Strings;import java.io.InputStream;import java.io.IOException;import java.io.OutputStream;import java.util.ArrayList;import java.util.List;/** * A <code>TrieCharSeqCounter</code> stores counts for substrings of * strings.  When the counter is constructed, a maximum length is * specified, and counts are only stored for strings up to that * length.  For instance, an n-gram language model needs only counts * for strings up to length n. * * <P> Strings may be added to the counter using {@link * #incrementSubstrings(char[],int,int)}, which increments the counts * for all substrings of the specified character slice up to the * specified maximum length substring. The method {@link * #incrementPrefixes(char[],int,int)} increments only the prefixes of * the specified string.  All substrings are incremented by * incrementing prefixes for each suffix.  A substring counter may be * pruned using {@link #prune(int)}, which removes all substrings with * count below the specified threshold. * * <P>There are a wide range of reporting methods for trie-based * counters. * * <P><i>Implementation Note:</i> The trie counters are a heavily * unfolded implementation of a character-based Patricia (PAT) trie. * * @author  Bob Carpenter * @version 3.0 * @since   LingPipe2.0 */public class TrieCharSeqCounter implements CharSeqCounter {    Node mRootNode =  NodeFactory.createNode(0);    final int mMaxLength;    /**     * Construct a substring counter that stores substrings     * up to the specified maximum length.     *     * @param maxLength Maximum length of substrings stored by this     * counter.     * @throws IllegalArgumentException If the maximum length is     * negative.     */    public TrieCharSeqCounter(int maxLength) {        if (maxLength < 0) {            String msg = "Max length must be >= 0."                + " Found length=" + maxLength;            throw new IllegalArgumentException(msg);        }        mMaxLength = maxLength;    }    // following is CharSeqCounter interface w. inherited comments    public long count(char[] cs, int start, int end) {        Strings.checkArgsStartEnd(cs,start,end);        return mRootNode.count(cs,start,end);    }    public long extensionCount(char[] cs, int start, int end) {        Strings.checkArgsStartEnd(cs,start,end);        return mRootNode.contextCount(cs,start,end);    }    public char[] observedCharacters() {        return com.aliasi.util.Arrays.copy(mRootNode.outcomes(new char[] { },0,0));    }    public char[] charactersFollowing(char[] cs, int start, int end) {        Strings.checkArgsStartEnd(cs,start,end);        return com.aliasi.util.Arrays.copy(mRootNode.outcomes(cs,start,end));    }    public int numCharactersFollowing(char[] cs, int start, int end) {        Strings.checkArgsStartEnd(cs,start,end);        return mRootNode.numOutcomes(cs,start,end);    }    /**     * Returns the sum of counts for all non-empty character     * sequences.     *     * @return The sum of counts for all non-empty character     * sequences.     */    public long totalSequenceCount() {        long sum = 0l;        long[][] uniqueTotals = uniqueTotalNGramCount();        for (int i = 0; i < uniqueTotals.length; ++i)            sum += uniqueTotals[i][1];        return sum;    }    /**     * Returns the sum of the counts of all character sequences of     * the specified length.     *     * @return The sum of the counts of all character sequences of     * the specified length.     */    public long totalSequenceCount(int length) {        return mRootNode.totalNGramCount(length);    }    /**     * Returns the number of character sequences with non-zero counts,     * including the empty (zero length) character sequence.     *     * @return Number of character sequences with non-zero counts.     */    public long uniqueSequenceCount() {        return mRootNode.size();    }    /**     * Returns the number of character sequences of the specified length     * with non-zero counts.     *     * @return The number of character sequences of the specified     * length with non-zero counts.     */    public long uniqueSequenceCount(int nGramOrder) {        return mRootNode.uniqueNGramCount(nGramOrder);    }    /**     * Removes strings with counts below the specified minimum.     * Counts for remaining strings are not affected.  Pruning may be     * interleaved with updating counts in any order.     *     * @param minCount Minimum count required to retain a substring     * count.     * @throws IllegalArgumentException If the count is less than     * <code>1</code>.     */    public void prune(int minCount) {        if (minCount < 1) {            String msg = "Prune minimum count must be more than 1."                + " Found minCount=" + minCount;            throw new IllegalArgumentException(msg);        }        mRootNode = mRootNode.prune(minCount);        if (mRootNode == null)            mRootNode = NodeFactory.createNode(0);    }    /**     * Returns an array of frequency counts for n-grams of the     * specified n-gram order sorted in descending frequency order.     * This form of result is sometimes called a Zipf plot because     * of the sorting.     *     * @param nGramOrder Order of n-gram counted.     * @return Array of frequency counts, sorted in decreasing order     * of rank.     */    public int[] nGramFrequencies(int nGramOrder) {        List counts = countsList(nGramOrder);        int[] result = new int[counts.size()];        for (int i = 0; i < result.length; ++i)            result[i] = ((Long) counts.get(i)).intValue();        java.util.Arrays.sort(result);        for (int i = result.length/2; i >= 0; --i) {            int iOpp = result.length-i-1;            int tmp = result[i];            result[i] = result[iOpp];            result[iOpp] = tmp;        }        return result;    }    /**     * Returns the array of unique and total n-gram counts for each     * n-gram length.  The return array is indexed in the first     * position by n-gram length, and in the second position by     * <code>0</code> for unique counts and <code>1</code> for total     * counts.  Thus for <code>0&lt;=n&lt;=maxLength()</code>:     *     * <blockquote><code>     * uniqueTotalNGramCount()[n][0] == uniqueNGramCount(n)     * </code></blockquote>     *     * and     *     * <blockquote><code>     * uniqueTotalNGramCount()[n][1] == totalNGramCount(n)     * </code></blockquote>     *     * If unique and total counts are required for several     * n-gram depths, this method is much more efficient than     * calling all of the individual methods separately.     *     * @return The array of unique and total n-gram counts for     * each n-gram length.     */    public long[][] uniqueTotalNGramCount() {        long[][] result = new long[mMaxLength+1][2];        mRootNode.addNGramCounts(result,0);        return result;    }    /**     * Returns a counter of occurrences of the highest frequency     * n-grams of a specified n-gram order.  The actual n-grams are     * represented as strings in the result; recall that strings     * are instances of {@link CharSequence}.     *      * <p>The maximum number of results returned must be specified,     * because the entire set of n-grams is usually too large to     * return as a counter.     *     * @param nGramOrder Order of n-gram to count.     * @param maxReturn Maximum number of objects returned.     */    public ObjectToCounterMap<String> topNGrams(int nGramOrder, int maxReturn) {        NBestCounter counter = new NBestCounter(maxReturn,true);        mRootNode.topNGrams(counter,new char[nGramOrder],0,nGramOrder);        return counter.toObjectToCounter();    }    /**     * Returns the count in the training corpus for the specified     * sequence of characters.  The count returned may have been     * reduced from the raw counts in training cases by pruning.     *     * @param cSeq Character sequence.     * @return Count of character sequence in model.     */    public long count(CharSequence cSeq) {        return count(com.aliasi.util.Arrays.toArray(cSeq),0,cSeq.length());    }    /**     * Returns the sum of the counts of all character sequences one     * character longer than the specified character sequence.     *     * @param cSeq Character sequence.     * @return The sum of the counts of all character sequences one     * character longer than the specified character sequence.     */    public long extensionCount(CharSequence cSeq) {        return mRootNode.contextCount(com.aliasi.util.Arrays.toArray(cSeq),0,cSeq.length());    }    /**     * Increments the count of all substrings of the specified     * character array slice up to the maximum length specified in the     * constructor.     *     * @param cs Underlying character array.     * @param start Index of first character in slice.     * @param end Index of one past last character in slice.     * @throws IndexOutOfBoundsException If the specified start and one plus     * end point are not in the bounds of character sequence.     */    public void incrementSubstrings(char[] cs, int start, int end) {        incrementSubstrings(cs,start,end,1);    }    /**     * Increments by the specified count all substrings of the     * specified character array slice up to the maximum length     * specified in the constructor.     *     * @param cs Underlying character array.     * @param start Index of first character in slice.     * @param end Index of one past last character in slice.     * @param count Amount to increment.     * @throws IndexOutOfBoundsException If the specified start and one plus     * end point are not in the bounds of character sequence.     */    public void incrementSubstrings(char[] cs, int start, int end,                                    int count) {        Strings.checkArgsStartEnd(cs,start,end);        // increment maximal strings and prefixes        for (int i = start; i+mMaxLength <= end; ++i)            incrementPrefixes(cs,i,i+mMaxLength,count);        // increment short final strings and prefixes        for (int i = Math.max(start,end-mMaxLength+1); i < end; ++i)            incrementPrefixes(cs,i,end,count);    }    /**     * Increments the count of all substrings of the specified     * character sequence up to the maximum length specified in the     * constructor.     *     * @param cSeq Character sequence.     */    public void incrementSubstrings(CharSequence cSeq) {        incrementSubstrings(cSeq,1);    }
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -