📄 triecharseqcounter.java
字号:
/** * Increments by the specified count all substrings of the * specified character sequence up to the maximum length specified * in the constructor. * * @param cSeq Character sequence. * @param count Amount to increment. */ public void incrementSubstrings(CharSequence cSeq, int count) { incrementSubstrings(com.aliasi.util.Arrays.toArray(cSeq), 0,cSeq.length(),count); } /** * Increments the count of all prefixes of the specified * character sequence up to the maximum length specified in the * constructor. * * @param cs Underlying character array. * @param start Index of first character in slice. * @param end Index of one past last character in slice. * @throws IndexOutOfBoundsException If the specified start and one plus * end point are not in the bounds of character sequence. */ public void incrementPrefixes(char[] cs, int start, int end) { incrementPrefixes(cs,start,end,1); } /** * Increments the count of all prefixes of the specified * character sequence up to the maximum length specified in the * constructor. * * @param cs Underlying character array. * @param start Index of first character in slice. * @param end Index of one past last character in slice. * @param count Amount to increment. * @throws IndexOutOfBoundsException If the specified start and one plus * end point are not in the bounds of character sequence. */ public void incrementPrefixes(char[] cs, int start, int end, int count) { Strings.checkArgsStartEnd(cs,start,end); mRootNode = mRootNode.increment(cs,start,end,count); } /** * Decrements all of the substrings of the specified character * slice by one. This method may be used in conjunction with * {@link #incrementSubstrings(char[],int,int)} to implement * counts for conditional probability estimates without affecting * underlying estimates. For example, the following code: * * <blockquote><pre> * char[] cs = "abcdefghi".toCharArray(); * counter.incrementSubstrings(cs,3,7); * counter.decrementSubstrings(cs,3,5); * </pre></blockquote> * * will increment the substrings of <code>"defg"</code> * and then decrement the substrings of <code>"de"</code>, * causing the net effect of incrementing the counts of substrings * <code>"defg"</code>, * <code>"efg"</code>, * <code>"fg"</code>, * <code>"g"</code>, * <code>"def"</code>, * <code>"ef"</code>, and * <code>"f"</code>. This has the effect of increasing * the estimate of <code>g</code> given <code>def</code>, without * increasing the estimate of <code>d</code> in an empty context. * * @param cs Underlying array of characters in slice. * @param start Index of first character in slice. * @param end Index of one past last character in slice. * @throws IllegalArgumentException If the array slice is valid. */ public void decrementSubstrings(char[] cs, int start, int end) { Strings.checkArgsStartEnd(cs,start,end); for (int i = start; i < end; ++i) for (int j = i; j <= end; ++j) mRootNode = mRootNode.decrement(cs,i,j); } /** * Returns a string representation of the trie structure of counts * underlying this counter. * * <P><b>Warning:</b> The resulting string will be very large if * the number of substrings is large. To avoid blowing out * memory, do not call this method for large counters. * * @return String representation of this counter. */ public String toString() { return mRootNode.toString(); } void toStringBuffer(StringBuffer sb) { mRootNode.toString(sb,0); } /** * Decrements the unigram count for the specified character. This * method is useful for training conditional probabilities, even * though it is not powerful enough to do it in full generality. * * @param c Decrement the unigram count for the specified * character. */ public void decrementUnigram(char c) { decrementUnigram(c,1); } /** * Decrements the unigram count by the specified amount for the * specified character. This * method is useful for training conditional probabilities, even * though it is not powerful enough to do it in full generality. * * @param c Decrement the unigram count for the specified * character. * @param count Amount to decrement. */ public void decrementUnigram(char c, int count) { mRootNode = mRootNode.decrement(new char[] { c }, 0, 1, count); } private List countsList(int nGramOrder) { ArrayList accum = new ArrayList(); mRootNode.addCounts(accum,nGramOrder); return accum; } /** * Writes an encoding of this counter to the specified output * stream. It may be read back in using {@link * #readFrom(InputStream)}. * * <p>The output is produced using a {@link BitTrieWriter} wrapped * around a {@link BitOutput} wrapped around the specified * underlying output stream. First, the bit output is used to * delta-code the maximum n-gram plus 1. Then, the trie is * encoded as described in {@link BitTrieWriter}. Finally, the * bit output is flushed. The underlying output stream is neither * flushed nor closed, allowing them to be used for other pruposes * after this counter is written. * * <p>If necessary for efficiency, streams should be buffered * before being passed to this method. * * @param out Underlying output stream for writing. * @throws IOException If there is an underlying I/O error. */ public void writeTo(OutputStream out) throws IOException { BitOutput bitOut = new BitOutput(out); bitOut.writeDelta(mMaxLength+1L); TrieWriter writer = new BitTrieWriter(bitOut); writeCounter(this,writer,mMaxLength); bitOut.flush(); } /** * Writes the specified sequence counter to the specified trie * writer, restricting output to n-grams not longer than the * specified maximum. * * @param counter Counter to write. * @param writer Trie writer to which counter is written. * @param maxNGram Maximum length n-gram written. * @throws IOException If there is an underlying I/O error. */ public static void writeCounter(CharSeqCounter counter, TrieWriter writer, int maxNGram) throws IOException { writeCounter(new char[maxNGram],0,counter,writer); } /** * Reads a trie character sequence counter from the specified * input stream. * * <p>The expected encoding is described in {@link * #writeTo(OutputStream)}. * * <p>If necessary for efficiency, streams should be buffered * before being passed to this method. * * @param in Underlying input stream for reading. * @throws IOException If there is an underlying I/O error. */ public static TrieCharSeqCounter readFrom(InputStream in) throws IOException { BitInput bitIn = new BitInput(in); int maxNGram = (int) (bitIn.readDelta() - 1L); BitTrieReader reader = new BitTrieReader(bitIn); return readCounter(reader,maxNGram); } /** * Reads a trie character sequence counter from the specified * trie reader, restricting the result to the specified maximum * n-gram. * * @param reader Reader from which to read the trie. * @param maxNGram Maximum length n-gram to read. * @return The counter read from the reader. * @throws IOException If there is an underlying I/O error. */ public static TrieCharSeqCounter readCounter(TrieReader reader, int maxNGram) throws IOException { TrieCharSeqCounter counter = new TrieCharSeqCounter(maxNGram); counter.mRootNode = readNode(reader,0,maxNGram); return counter; } static void writeCounter(char[] cs, int pos, CharSeqCounter counter, TrieWriter writer) throws IOException { long count = counter.count(cs,0,pos); writer.writeCount(count); if (pos < cs.length) { // daughters within n-gram bound char[] csNext = counter.charactersFollowing(cs,0,pos); for (int i = 0; i < csNext.length; ++i) { writer.writeSymbol(csNext[i]); cs[pos] = csNext[i]; writeCounter(cs,pos+1,counter,writer); } } writer.writeSymbol(-1L); // end of daughters } private static void skipNode(TrieReader reader) throws IOException { reader.readCount(); while (reader.readSymbol() != -1) skipNode(reader); } private static Node readNode(TrieReader reader, int depth, int maxDepth) throws IOException { if (depth > maxDepth) { skipNode(reader); return null; } long count = reader.readCount(); int depthPlus1 = depth + 1; long sym1 = reader.readSymbol(); // 0+ daughters if (sym1 == -1L) return NodeFactory.createNode(count); // 1+ daughters Node node1 = readNode(reader,depthPlus1,maxDepth); long sym2 = reader.readSymbol(); if (sym2 == -1L) return NodeFactory.createNodeFold((char)sym1,node1, count); Node node2 = readNode(reader,depthPlus1,maxDepth); long sym3 = reader.readSymbol(); if (sym3 == -1L) return NodeFactory.createNode((char)sym1,node1, (char)sym2,node2, count); Node node3 = readNode(reader,depthPlus1,maxDepth); long sym4 = reader.readSymbol(); if (sym4 == -1L) return NodeFactory.createNode((char)sym1,node1, (char)sym2,node2, (char)sym3,node3, count); Node node4 = readNode(reader,depthPlus1,maxDepth); // 4+ daughters StringBuffer cBuf = new StringBuffer(); cBuf.append((char)sym1); cBuf.append((char)sym2); cBuf.append((char)sym3); cBuf.append((char)sym4); List nodeList = new ArrayList(); nodeList.add(node1); nodeList.add(node2); nodeList.add(node3); nodeList.add(node4); long sym; while ((sym = reader.readSymbol()) != -1L) { cBuf.append((char)sym); nodeList.add(readNode(reader,depthPlus1,maxDepth)); } Node[] nodes = new Node[nodeList.size()]; nodeList.toArray(nodes); char[] cs = Strings.toCharArray(cBuf); return NodeFactory.createNode(cs,nodes,count); // > 3 daughters }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -