📄 triecharseqcounter.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.lm;// import com.aliasi.util.Arrays;import com.aliasi.io.BitInput;import com.aliasi.io.BitOutput;import com.aliasi.util.ObjectToCounterMap;import com.aliasi.util.Strings;import java.io.InputStream;import java.io.IOException;import java.io.OutputStream;import java.util.ArrayList;import java.util.List;/** * A <code>TrieCharSeqCounter</code> stores counts for substrings of * strings. When the counter is constructed, a maximum length is * specified, and counts are only stored for strings up to that * length. For instance, an n-gram language model needs only counts * for strings up to length n. * * <P> Strings may be added to the counter using {@link * #incrementSubstrings(char[],int,int)}, which increments the counts * for all substrings of the specified character slice up to the * specified maximum length substring. The method {@link * #incrementPrefixes(char[],int,int)} increments only the prefixes of * the specified string. All substrings are incremented by * incrementing prefixes for each suffix. A substring counter may be * pruned using {@link #prune(int)}, which removes all substrings with * count below the specified threshold. * * <P>There are a wide range of reporting methods for trie-based * counters. * * <P><i>Implementation Note:</i> The trie counters are a heavily * unfolded implementation of a character-based Patricia (PAT) trie. * * @author Bob Carpenter * @version 3.0 * @since LingPipe2.0 */public class TrieCharSeqCounter implements CharSeqCounter { Node mRootNode = NodeFactory.createNode(0); final int mMaxLength; /** * Construct a substring counter that stores substrings * up to the specified maximum length. * * @param maxLength Maximum length of substrings stored by this * counter. * @throws IllegalArgumentException If the maximum length is * negative. */ public TrieCharSeqCounter(int maxLength) { if (maxLength < 0) { String msg = "Max length must be >= 0." + " Found length=" + maxLength; throw new IllegalArgumentException(msg); } mMaxLength = maxLength; } // following is CharSeqCounter interface w. inherited comments public long count(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs,start,end); return mRootNode.count(cs,start,end); } public long extensionCount(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs,start,end); return mRootNode.contextCount(cs,start,end); } public char[] observedCharacters() { return com.aliasi.util.Arrays.copy(mRootNode.outcomes(new char[] { },0,0)); } public char[] charactersFollowing(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs,start,end); return com.aliasi.util.Arrays.copy(mRootNode.outcomes(cs,start,end)); } public int numCharactersFollowing(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs,start,end); return mRootNode.numOutcomes(cs,start,end); } /** * Returns the sum of counts for all non-empty character * sequences. * * @return The sum of counts for all non-empty character * sequences. */ public long totalSequenceCount() { long sum = 0l; long[][] uniqueTotals = uniqueTotalNGramCount(); for (int i = 0; i < uniqueTotals.length; ++i) sum += uniqueTotals[i][1]; return sum; } /** * Returns the sum of the counts of all character sequences of * the specified length. * * @return The sum of the counts of all character sequences of * the specified length. */ public long totalSequenceCount(int length) { return mRootNode.totalNGramCount(length); } /** * Returns the number of character sequences with non-zero counts, * including the empty (zero length) character sequence. * * @return Number of character sequences with non-zero counts. */ public long uniqueSequenceCount() { return mRootNode.size(); } /** * Returns the number of character sequences of the specified length * with non-zero counts. * * @return The number of character sequences of the specified * length with non-zero counts. */ public long uniqueSequenceCount(int nGramOrder) { return mRootNode.uniqueNGramCount(nGramOrder); } /** * Removes strings with counts below the specified minimum. * Counts for remaining strings are not affected. Pruning may be * interleaved with updating counts in any order. * * @param minCount Minimum count required to retain a substring * count. * @throws IllegalArgumentException If the count is less than * <code>1</code>. */ public void prune(int minCount) { if (minCount < 1) { String msg = "Prune minimum count must be more than 1." + " Found minCount=" + minCount; throw new IllegalArgumentException(msg); } mRootNode = mRootNode.prune(minCount); if (mRootNode == null) mRootNode = NodeFactory.createNode(0); } /** * Returns an array of frequency counts for n-grams of the * specified n-gram order sorted in descending frequency order. * This form of result is sometimes called a Zipf plot because * of the sorting. * * @param nGramOrder Order of n-gram counted. * @return Array of frequency counts, sorted in decreasing order * of rank. */ public int[] nGramFrequencies(int nGramOrder) { List counts = countsList(nGramOrder); int[] result = new int[counts.size()]; for (int i = 0; i < result.length; ++i) result[i] = ((Long) counts.get(i)).intValue(); java.util.Arrays.sort(result); for (int i = result.length/2; i >= 0; --i) { int iOpp = result.length-i-1; int tmp = result[i]; result[i] = result[iOpp]; result[iOpp] = tmp; } return result; } /** * Returns the array of unique and total n-gram counts for each * n-gram length. The return array is indexed in the first * position by n-gram length, and in the second position by * <code>0</code> for unique counts and <code>1</code> for total * counts. Thus for <code>0<=n<=maxLength()</code>: * * <blockquote><code> * uniqueTotalNGramCount()[n][0] == uniqueNGramCount(n) * </code></blockquote> * * and * * <blockquote><code> * uniqueTotalNGramCount()[n][1] == totalNGramCount(n) * </code></blockquote> * * If unique and total counts are required for several * n-gram depths, this method is much more efficient than * calling all of the individual methods separately. * * @return The array of unique and total n-gram counts for * each n-gram length. */ public long[][] uniqueTotalNGramCount() { long[][] result = new long[mMaxLength+1][2]; mRootNode.addNGramCounts(result,0); return result; } /** * Returns a counter of occurrences of the highest frequency * n-grams of a specified n-gram order. The actual n-grams are * represented as strings in the result; recall that strings * are instances of {@link CharSequence}. * * <p>The maximum number of results returned must be specified, * because the entire set of n-grams is usually too large to * return as a counter. * * @param nGramOrder Order of n-gram to count. * @param maxReturn Maximum number of objects returned. */ public ObjectToCounterMap<String> topNGrams(int nGramOrder, int maxReturn) { NBestCounter counter = new NBestCounter(maxReturn,true); mRootNode.topNGrams(counter,new char[nGramOrder],0,nGramOrder); return counter.toObjectToCounter(); } /** * Returns the count in the training corpus for the specified * sequence of characters. The count returned may have been * reduced from the raw counts in training cases by pruning. * * @param cSeq Character sequence. * @return Count of character sequence in model. */ public long count(CharSequence cSeq) { return count(com.aliasi.util.Arrays.toArray(cSeq),0,cSeq.length()); } /** * Returns the sum of the counts of all character sequences one * character longer than the specified character sequence. * * @param cSeq Character sequence. * @return The sum of the counts of all character sequences one * character longer than the specified character sequence. */ public long extensionCount(CharSequence cSeq) { return mRootNode.contextCount(com.aliasi.util.Arrays.toArray(cSeq),0,cSeq.length()); } /** * Increments the count of all substrings of the specified * character array slice up to the maximum length specified in the * constructor. * * @param cs Underlying character array. * @param start Index of first character in slice. * @param end Index of one past last character in slice. * @throws IndexOutOfBoundsException If the specified start and one plus * end point are not in the bounds of character sequence. */ public void incrementSubstrings(char[] cs, int start, int end) { incrementSubstrings(cs,start,end,1); } /** * Increments by the specified count all substrings of the * specified character array slice up to the maximum length * specified in the constructor. * * @param cs Underlying character array. * @param start Index of first character in slice. * @param end Index of one past last character in slice. * @param count Amount to increment. * @throws IndexOutOfBoundsException If the specified start and one plus * end point are not in the bounds of character sequence. */ public void incrementSubstrings(char[] cs, int start, int end, int count) { Strings.checkArgsStartEnd(cs,start,end); // increment maximal strings and prefixes for (int i = start; i+mMaxLength <= end; ++i) incrementPrefixes(cs,i,i+mMaxLength,count); // increment short final strings and prefixes for (int i = Math.max(start,end-mMaxLength+1); i < end; ++i) incrementPrefixes(cs,i,end,count); } /** * Increments the count of all substrings of the specified * character sequence up to the maximum length specified in the * constructor. * * @param cSeq Character sequence. */ public void incrementSubstrings(CharSequence cSeq) { incrementSubstrings(cSeq,1); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -