📄 triecharseqcounter.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
    /**     * Increments by the specified count all substrings of the     * specified character sequence up to the maximum length specified     * in the constructor.     *     * @param cSeq Character sequence.     * @param count Amount to increment.     */    public void incrementSubstrings(CharSequence cSeq, int count) {        incrementSubstrings(com.aliasi.util.Arrays.toArray(cSeq),                            0,cSeq.length(),count);    }    /**     * Increments the count of all prefixes of the specified     * character sequence up to the maximum length specified in the     * constructor.     *     * @param cs Underlying character array.     * @param start Index of first character in slice.     * @param end Index of one past last character in slice.     * @throws IndexOutOfBoundsException If the specified start and one plus     * end point are not in the bounds of character sequence.     */    public void incrementPrefixes(char[] cs, int start, int end) {        incrementPrefixes(cs,start,end,1);    }    /**     * Increments the count of all prefixes of the specified     * character sequence up to the maximum length specified in the     * constructor.     *     * @param cs Underlying character array.     * @param start Index of first character in slice.     * @param end Index of one past last character in slice.     * @param count Amount to increment.     * @throws IndexOutOfBoundsException If the specified start and one plus     * end point are not in the bounds of character sequence.     */    public void incrementPrefixes(char[] cs, int start, int end,                                  int count) {        Strings.checkArgsStartEnd(cs,start,end);        mRootNode = mRootNode.increment(cs,start,end,count);    }    /**     * Decrements all of the substrings of the specified character     * slice by one.  This method may be used in conjunction with     * {@link #incrementSubstrings(char[],int,int)} to implement     * counts for conditional probability estimates without affecting     * underlying estimates.  For example, the following code:     *      * <blockquote><pre>     * char[] cs = &quot;abcdefghi&quot;.toCharArray();     * counter.incrementSubstrings(cs,3,7);     * counter.decrementSubstrings(cs,3,5);     * </pre></blockquote>     *     * will increment the substrings of <code>&quot;defg&quot;</code>     * and then decrement the substrings of <code>&quot;de&quot;</code>,     * causing the net effect of incrementing the counts of substrings     * <code>&quot;defg&quot;</code>,     * <code>&quot;efg&quot;</code>,     * <code>&quot;fg&quot;</code>,     * <code>&quot;g&quot;</code>,     * <code>&quot;def&quot;</code>,     * <code>&quot;ef&quot;</code>, and     * <code>&quot;f&quot;</code>.  This has the effect of increasing     * the estimate of <code>g</code> given <code>def</code>, without     * increasing the estimate of <code>d</code> in an empty context.     *     * @param cs Underlying array of characters in slice.     * @param start Index of first character in slice.     * @param end Index of one past last character in slice.     * @throws IllegalArgumentException If the array slice is valid.     */    public void decrementSubstrings(char[] cs, int start, int end) {        Strings.checkArgsStartEnd(cs,start,end);        for (int i = start; i < end; ++i)            for (int j = i; j <= end; ++j)                 mRootNode = mRootNode.decrement(cs,i,j);    }    /**     * Returns a string representation of the trie structure of counts     * underlying this counter.     *     * <P><b>Warning:</b> The resulting string will be very large if     * the number of substrings is large.  To avoid blowing out     * memory, do not call this method for large counters.     *     * @return String representation of this counter.     */    public String toString() {        return mRootNode.toString();    }    void toStringBuffer(StringBuffer sb) {        mRootNode.toString(sb,0);    }    /**     * Decrements the unigram count for the specified character.  This     * method is useful for training conditional probabilities, even     * though it is not powerful enough to do it in full generality.     *     * @param c Decrement the unigram count for the specified     * character.     */    public void decrementUnigram(char c) {        decrementUnigram(c,1);    }    /**     * Decrements the unigram count by the specified amount for the     * specified character.  This     * method is useful for training conditional probabilities, even     * though it is not powerful enough to do it in full generality.     *     * @param c Decrement the unigram count for the specified     * character.     * @param count Amount to decrement.     */    public void decrementUnigram(char c, int count) {        mRootNode = mRootNode.decrement(new char[] { c }, 0, 1, count);    }    private List countsList(int nGramOrder) {        ArrayList accum = new ArrayList();        mRootNode.addCounts(accum,nGramOrder);        return accum;    }    /**     * Writes an encoding of this counter to the specified output     * stream.  It may be read back in using {@link     * #readFrom(InputStream)}.     *     * <p>The output is produced using a {@link BitTrieWriter} wrapped     * around a {@link BitOutput} wrapped around the specified     * underlying output stream.  First, the bit output is used to     * delta-code the maximum n-gram plus 1.  Then, the trie is     * encoded as described in {@link BitTrieWriter}.  Finally, the     * bit output is flushed.  The underlying output stream is neither     * flushed nor closed, allowing them to be used for other pruposes     * after this counter is written.     *     * <p>If necessary for efficiency, streams should be buffered     * before being passed to this method.     *     * @param out Underlying output stream for writing.     * @throws IOException If there is an underlying I/O error.     */    public void writeTo(OutputStream out) throws IOException {        BitOutput bitOut = new BitOutput(out);        bitOut.writeDelta(mMaxLength+1L);        TrieWriter writer = new BitTrieWriter(bitOut);        writeCounter(this,writer,mMaxLength);        bitOut.flush();    }    /**     * Writes the specified sequence counter to the specified trie     * writer, restricting output to n-grams not longer than the     * specified maximum.     *     * @param counter Counter to write.     * @param writer Trie writer to which counter is written.     * @param maxNGram Maximum length n-gram written.     * @throws IOException If there is an underlying I/O error.     */    public static void writeCounter(CharSeqCounter counter,                                    TrieWriter writer,                                    int maxNGram)         throws IOException {        writeCounter(new char[maxNGram],0,counter,writer);    }    /**     * Reads a trie character sequence counter from the specified     * input stream.       *     * <p>The expected encoding is described in {@link     * #writeTo(OutputStream)}.     *     * <p>If necessary for efficiency, streams should be buffered     * before being passed to this method.     *     * @param in Underlying input stream for reading.     * @throws IOException If there is an underlying I/O error.     */    public static TrieCharSeqCounter readFrom(InputStream in)         throws IOException {        BitInput bitIn = new BitInput(in);        int maxNGram = (int) (bitIn.readDelta() - 1L);        BitTrieReader reader = new BitTrieReader(bitIn);        return readCounter(reader,maxNGram);    }    /**     * Reads a trie character sequence counter from the specified     * trie reader, restricting the result to the specified maximum     * n-gram.     *     * @param reader Reader from which to read the trie.     * @param maxNGram Maximum length n-gram to read.     * @return The counter read from the reader.     * @throws IOException If there is an underlying I/O error.     */    public static TrieCharSeqCounter readCounter(TrieReader reader,                                                 int maxNGram)         throws IOException {        TrieCharSeqCounter counter = new TrieCharSeqCounter(maxNGram);        counter.mRootNode = readNode(reader,0,maxNGram);        return counter;    }    static void writeCounter(char[] cs, int pos,                              CharSeqCounter counter,                             TrieWriter writer)        throws IOException {        long count = counter.count(cs,0,pos);        writer.writeCount(count);        if (pos < cs.length) { // daughters within n-gram bound            char[] csNext = counter.charactersFollowing(cs,0,pos);            for (int i = 0; i < csNext.length; ++i) {                writer.writeSymbol(csNext[i]);                cs[pos] = csNext[i];                writeCounter(cs,pos+1,counter,writer);            }        }        writer.writeSymbol(-1L);  // end of daughters    }    private static void skipNode(TrieReader reader)         throws IOException {        reader.readCount();        while (reader.readSymbol() != -1)            skipNode(reader);    }    private static Node readNode(TrieReader reader, int depth, int maxDepth)        throws IOException {            if (depth > maxDepth) {            skipNode(reader);            return null;        }        long count = reader.readCount();        int depthPlus1 = depth + 1;        long sym1 = reader.readSymbol();        // 0+ daughters        if (sym1 == -1L)            return NodeFactory.createNode(count);        // 1+ daughters        Node node1 = readNode(reader,depthPlus1,maxDepth);        long sym2 = reader.readSymbol();        if (sym2 == -1L)            return NodeFactory.createNodeFold((char)sym1,node1,                                              count);        Node node2 = readNode(reader,depthPlus1,maxDepth);        long sym3 = reader.readSymbol();        if (sym3 == -1L)            return NodeFactory.createNode((char)sym1,node1,                                          (char)sym2,node2,                                          count);        Node node3 = readNode(reader,depthPlus1,maxDepth);        long sym4 = reader.readSymbol();        if (sym4 == -1L)             return NodeFactory.createNode((char)sym1,node1,                                          (char)sym2,node2,                                          (char)sym3,node3,                                          count);        Node node4 = readNode(reader,depthPlus1,maxDepth);            // 4+ daughters        StringBuffer cBuf = new StringBuffer();        cBuf.append((char)sym1);        cBuf.append((char)sym2);        cBuf.append((char)sym3);        cBuf.append((char)sym4);            List nodeList = new ArrayList();        nodeList.add(node1);        nodeList.add(node2);        nodeList.add(node3);        nodeList.add(node4);        long sym;        while ((sym = reader.readSymbol()) != -1L) {            cBuf.append((char)sym);            nodeList.add(readNode(reader,depthPlus1,maxDepth));        }        Node[] nodes = new Node[nodeList.size()];        nodeList.toArray(nodes);        char[] cs = Strings.toCharArray(cBuf);        return NodeFactory.createNode(cs,nodes,count);  // > 3 daughters    }}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -