/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lm;

import com.aliasi.stats.Model;
import com.aliasi.util.Strings;

import java.io.ObjectInput;
import java.io.IOException;

import java.util.Arrays;

/**
 * A <code>CompiledNGramProcessLM</code> implements a conditional
 * process language model.  Instances are constructed by reading a
 * serialized version of an instance of {@link NGramProcessLM} through
 * data input.
 *
 * <P>Compiled models contain precomputed estimates and smoothing
 * parameters.  For instance, consider a 3-gram process language model
 * trained on the string "abracadabra", which yields the following
 * count trie, with each character indented under its context and
 * followed by its count (the root count 11 is the total number of
 * characters):
 *
 * <blockquote>
 * <pre>
 * 11
 *   a 5
 *     b 2
 *       r 2
 *     c 1
 *       a 1
 *     d 1
 *       a 1
 *   b 2
 *     r 2
 *       a 2
 *   c 1
 *     a 1
 *       d 1
 *   d 1
 *     a 1
 *       b 1
 *   r 2
 *     a 2
 *       c 1
 * </pre>
 * </blockquote>
 *
 * For instance, <code>count("")=11</code>,
 * <code>count("a")=5</code>,
 * <code>count("ab")=2</code>,
 * <code>count("abr")=2</code>,
 * <code>count("br")=2</code>,
 * <code>count("bra")=2</code>,
 * <code>count("r")=2</code>,
 * <code>count("ra")=2</code>, and
 * <code>count("rac")=1</code>.
 *
 * Assuming a lambda factor hyperparameter set to 4.0, the compiled
 * model consists of the following five parallel arrays:
 *
 * <blockquote>
 * <table border='1' cellpadding='5'>
 * <tr><td>&nbsp;</td>
 *     <td>&nbsp;</td>
 *     <td><tt>char (2)</tt></td>
 *     <td><tt>int (4)</tt></td>
 *     <td><tt>float (4)</tt></td>
 *     <td><tt>float (4)</tt></td>
 *     <td><tt>int (4)</tt></td></tr>
 * <tr><td><i>Index</i></td>
 *     <td><i>Context</i></td>
 *     <td><i>Char</i></td>
 *     <td><i>Suffix</i></td>
 *     <td><i>log<sub>2</sub> P(char|context)</i></td>
 *     <td><i>log<sub>2</sub> (1 - λ(context.char))</i></td>
 *     <td><i>First Dtr</i></td></tr>
 * <tr><td>0</td><td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td><td>-0.6322682</td><td>1</td></tr>
 * <tr><td>1</td><td>""</td><td>a</td><td>0</td><td>-2.6098135</td><td>-0.4150375</td><td>6</td></tr>
 * <tr><td>2</td><td>""</td><td>b</td><td>0</td><td>-3.8987012</td><td>-0.5849625</td><td>9</td></tr>
 * <tr><td>3</td><td>""</td><td>c</td><td>0</td><td>-4.845262</td><td>-0.32192808</td><td>10</td></tr>
 * <tr><td>4</td><td>""</td><td>d</td><td>0</td><td>-4.845262</td><td>-0.32192808</td><td>11</td></tr>
 * <tr><td>5</td><td>""</td><td>r</td><td>0</td><td>-3.8987012</td><td>-0.5849625</td><td>12</td></tr>
 * <tr><td>6</td><td>"a"</td><td>b</td><td>2</td><td>-2.5122285</td><td>-0.5849625</td><td>13</td></tr>
 * <tr><td>7</td><td>"a"</td><td>c</td><td>3</td><td>-3.4966948</td><td>-0.32192808</td><td>14</td></tr>
 * <tr><td>8</td><td>"a"</td><td>d</td><td>4</td><td>-3.4966948</td><td>-0.32192808</td><td>15</td></tr>
 * <tr><td>9</td><td>"b"</td><td>r</td><td>5</td><td>-1.4034244</td><td>-0.5849625</td><td>16</td></tr>
 * <tr><td>10</td><td>"c"</td><td>a</td><td>1</td><td>-1.5948515</td><td>-0.32192808</td><td>17</td></tr>
 * <tr><td>11</td><td>"d"</td><td>a</td><td>1</td><td>-1.5948515</td><td>-0.32192808</td><td>18</td></tr>
 * <tr><td>12</td><td>"r"</td><td>a</td><td>1</td><td>-1.1760978</td><td>-0.32192808</td><td>19</td></tr>
 * <tr><td>13</td><td>"ab"</td><td>r</td><td>9</td><td>-0.77261907</td>
 *     <td colspan='2' rowspan='7'><i><small>This space intentionally
 *     left blank: maximum length n-grams are not
 *     contexts.</small></i></td></tr>
 * <tr><td>14</td><td>"ac"</td><td>a</td><td>10</td><td>-1.1051782</td></tr>
 * <tr><td>15</td><td>"ad"</td><td>a</td><td>11</td><td>-1.1051782</td></tr>
 * <tr><td>16</td><td>"br"</td><td>a</td><td>12</td><td>-0.6703262</td></tr>
 * <tr><td>17</td><td>"ca"</td><td>d</td><td>8</td><td>-1.8843123</td></tr>
 * <tr><td>18</td><td>"da"</td><td>b</td><td>6</td><td>-1.5554274</td></tr>
 * <tr><td>19</td><td>"ra"</td><td>c</td><td>7</td><td>-1.8843123</td></tr>
 * </table>
 * </blockquote>
 *
 * The actual data is contained in the last five columns of the table.
 * Thus terminal nodes require 10 bytes and internal nodes require 18
 * bytes.  The indices in the first column and the implicit context
 * represented by the node are for convenience.  Each of these indices
 * picks out a row in the table that corresponds to a node in the trie
 * structure representing the counts.  The nodes are arranged
 * according to a unicode-order breadth-first traversal of the count
 * trie.  Each non-terminal node stores a character, the index of the
 * suffix node (as in a suffix tree), a probability estimate for the
 * character given the context of characters that led to the
 * character, an interpolation value for the context represented by
 * the context leading to the character plus the character itself, and
 * finally a pointer to the first daughter of the node in the array.
 * Note that the last daughter of a node is thus picked out by the
 * first daughter of the next node minus one.  For instance, the
 * daughters of node 1 are 6, 7 and 8, and the only daughter of node 9
 * is node 16.  The terminal nodes have no daughters; note that memory
 * is not allocated for these terminal cells.
 *
 * <P>The second column indicates the context derived by walking
 * from the root node (index 0) down to the node.  For instance, the
 * path "bra" leads from the root node 0 ("") to node 2 ("b"), to
 * node 9 ("br"), and finally to node 16 ("bra").  The tree is
 * traversed by starting at the root node and looking up daughters by
 * binary search.
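 *
 * <P>For illustration only (this helper is hypothetical, not part of
 * the class's API), a daughter lookup over the parallel arrays might
 * be sketched as follows.  Daughters of an internal node
 * <code>i</code> occupy the index range
 * <code>[firstChild[i], firstChild[i+1])</code> and are stored in
 * unicode order, so binary search applies:
 *
 * <blockquote><pre>
 * // Sketch: return the index of the daughter of (internal) node
 * // holding character c, or -1 if the character was not observed
 * // in that context.
 * static int daughter(char[] chars, int[] firstChild, int node, char c) {
 *     int low = firstChild[node];
 *     int high = firstChild[node + 1] - 1; // last daughter
 *     while (high >= low) {
 *         int mid = (low + high) >>> 1;
 *         if (chars[mid] == c) return mid;
 *         if (c > chars[mid]) low = mid + 1;
 *         else high = mid - 1;
 *     }
 *     return -1;
 * }
 * </pre></blockquote>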
 *
 * <P>If a full context and node are in the tree, the estimate can
 * simply be looked up.  For instance,
 * <code>log<sub>2</sub> P(b) = -3.898</code>,
 * <code>log<sub>2</sub> P(r|b) = -1.403</code>, and
 * <code>log<sub>2</sub> P(a|br) = -0.670</code>.  If the
 * context is available, but not the outcome, the result is just the
 * addition of the log one minus lambda estimates down to the first
 * context for which the outcome exists.  The contexts to explore are
 * all suffixes of the context currently being explored and may be
 * looked up in the suffix array.  For instance, consider:
 *
 * <blockquote><code>
 * P(c|br)
 * = λ(br) P<sub>ML</sub>(c|br)
 * + (1-λ(br)) P(c|r)
 * </code></blockquote>
 *
 * In this case, the outcome <code>c</code> is not available for the
 * context <code>br</code> (the only daughter is <code>a</code>),
 * hence the maximum likelihood estimate is zero, and the result
 * reduces to the second term:
 *
 * <blockquote><code>
 * P(c|br) = (1-λ(br)) P(c|r)
 * </code></blockquote>
 *
 * and hence
 *
 * <blockquote><code>
 * log<sub>2</sub> P(c|br)
 * = log<sub>2</sub>((1-λ(br)) P(c|r))
 * = log<sub>2</sub>(1-λ(br))
 * + log<sub>2</sub> P(c|r)
 * </code></blockquote>
 *
 * This continues until the outcome is found.  In this case,
 * we continue with the next term in the same way:
 *
 * <blockquote><code>
 * = log<sub>2</sub> (1-λ(br))
 * + log<sub>2</sub> (1-λ(r))
 * + log<sub>2</sub> P(c)
 * <br>
 * = -0.585 + -0.585 + -4.845
 * </code></blockquote>
 *
 * Because 3-grams are the upper bound on context length, and terminal
 * nodes have no daughters, we conserve space at the end of the
 * smoothing and daughter arrays.
 *
 * <P>In practice, this smoothing may require going all the way
 * down to the uniform model.  The uniform model estimate is
 * stored separately.  The interpolation parameter for the root
 * node works as for any other node.
 *
 * <P>For estimates of sequences, the final node used will act
 * as the first potential context for the next estimate.  This
 * replaces a number of lookups equal to the n-gram length by
 * binary character search with a simple array lookup.
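 *
 * <P>For illustration only (hypothetical helper, not this class's
 * API; it assumes the walk visits only internal nodes and that the
 * root's suffix index is a <code>-1</code> sentinel), the backoff
 * walk just described might be sketched as:
 *
 * <blockquote><pre>
 * // Sketch: log2 P(c | context), where contextNode is the node for
 * // the longest available context.  Each time the outcome is
 * // missing, add log2(1 - lambda(context)) and back off to the
 * // suffix context; fall back to the uniform estimate at the end.
 * static double log2Estimate(int contextNode, char c,
 *                            char[] chars, int[] firstChild,
 *                            int[] suffix, float[] logProbs,
 *                            float[] logOneMinusLambdas,
 *                            float logUniformEstimate) {
 *     double sum = 0.0;
 *     for (int node = contextNode; node >= 0; node = suffix[node]) {
 *         int dtr = daughter(chars, firstChild, node, c); // see above
 *         if (dtr >= 0) return sum + logProbs[dtr];  // outcome found
 *         sum += logOneMinusLambdas[node];           // interpolate
 *     }
 *     return sum + logUniformEstimate; // backed off past the root
 * }
 * </pre></blockquote>
 *
 * On the example table, this walk from node 9 ("br") for outcome
 * <code>c</code> adds -0.585 for context "br" and -0.585 for context
 * "r", then finds <code>c</code> under the root at node 3 with
 * estimate -4.845.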
 *
 * <P><i>Implementation Note:</i> The suffix indices are not included
 * in the binary serialization format.  Instead, they are initialized
 * right after the binary data is read in.  This requires walking the
 * trie structure and computing each node's suffix by doing a walk
 * from the root.  For instance, a million node 8-gram model will
 * require at most 8 million binary character searches during
 * initialization.
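 *
 * <P>For illustration only (a hypothetical recursion, not this
 * class's actual <code>compileSuffixes</code> implementation; the
 * root's suffix is given a <code>-1</code> sentinel here), the
 * suffix initialization might be sketched as:
 *
 * <blockquote><pre>
 * // Sketch: the suffix of the node reached by path c1..cn is the
 * // node reached by c2..cn, resolved by walking from the root.
 * static void compileSuffixes(String path, int node, char[] chars,
 *                             int[] firstChild, int[] suffix,
 *                             int lastInternalNode) {
 *     int suffixNode = -1;          // sentinel for the root itself
 *     if (path.length() > 0) {
 *         suffixNode = 0;           // start at the root ...
 *         for (int i = 1; path.length() > i; ++i) // ... drop c1
 *             suffixNode = daughter(chars, firstChild,
 *                                   suffixNode, path.charAt(i));
 *     }
 *     suffix[node] = suffixNode;
 *     if (node > lastInternalNode) return; // terminal: no daughters
 *     for (int dtr = firstChild[node]; firstChild[node+1] > dtr; ++dtr)
 *         compileSuffixes(path + chars[dtr], dtr, chars,
 *                         firstChild, suffix, lastInternalNode);
 * }
 * </pre></blockquote>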
 *
 * @author  Bob Carpenter
 * @version 3.6
 * @since   LingPipe2.0
 */
public class CompiledNGramProcessLM
    implements LanguageModel.Process,
               LanguageModel.Conditional,
               Model<CharSequence> {

    // index of the root node in the parallel arrays
    static final int ROOT_NODE_INDEX = 0;

    // sentinel marking a suffix index as not yet computed
    static final int CACHE_NOT_COMPUTED_VALUE = -1;

    private final int mMaxNGram;
    private final float mLogUniformEstimate;
    private final char[] mChars;
    private final float[] mLogProbs;
    private final float[] mLogOneMinusLambdas;
    private final int[] mFirstChild;
    private final int[] mSuffix;
    private final int mLastContextIndex;

    // Data Format
    // -------------------------------------------------
    // maxNGram:int
    // logUniformEstimate:float
    // numTotalNodes:int
    // numInternalNodes:int
    // (c:char,
    //  logProb:float,
    //  logOneMinusLambda:float,
    //  firstChild:int)^numInternalNodes
    // (c:char,
    //  logProb:float)^(numTotalNodes-numInternalNodes)

    /**
     * Construct a compiled n-gram process language model by reading
     * it from the specified data input.  The data should have been
     * constructed by serializing an instance of {@link NGramProcessLM}.
     *
     * @param dataIn Data input from which to read the model.
     * @throws IOException If there is an exception reading from the
     * input.
     */
    CompiledNGramProcessLM(ObjectInput dataIn) throws IOException {
        mMaxNGram = dataIn.readInt();
        mLogUniformEstimate = dataIn.readFloat();
        int numTotalNodes = dataIn.readInt();
        int lastInternalNodeIndex = dataIn.readInt();
        mLastContextIndex = lastInternalNodeIndex;
        mChars = new char[numTotalNodes];
        mLogProbs = new float[numTotalNodes];
        mSuffix = new int[numTotalNodes];
        Arrays.fill(mSuffix,CACHE_NOT_COMPUTED_VALUE);
        mLogOneMinusLambdas = new float[lastInternalNodeIndex+1];
        mFirstChild = new int[lastInternalNodeIndex+2];
        mFirstChild[lastInternalNodeIndex+1] = numTotalNodes;
        for (int i = 0; i <= lastInternalNodeIndex; ++i) {
            mChars[i] = dataIn.readChar();
            mLogProbs[i] = dataIn.readFloat();
            mLogOneMinusLambdas[i] = dataIn.readFloat();
            mFirstChild[i] = dataIn.readInt();
        }
        for (int i = lastInternalNodeIndex+1; i < numTotalNodes; ++i) {
            mChars[i] = dataIn.readChar();
            mLogProbs[i] = dataIn.readFloat();
        }
        compileSuffixes("",ROOT_NODE_INDEX);
    }

    /**
     * Returns the array of characters that have been observed for this
     * language model.  These are returned in increasing unicode order.
     *
     * <P>It is assumed that the observed characters are those with
     * unigram probabilities stored in the trie.
     *
     * @return The array of characters that have been observed
     * for this language model.
     */
    public char[] observedCharacters() {
        if (mFirstChild.length < 2) return new char[0];
        char[] result = new char[mFirstChild[1]-1];
        for (int i = 0; i < result.length; ++i)
            result[i] = mChars[i+1];
        return result;
    }

}
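/*
 * Usage sketch (hypothetical variable names; assumes LingPipe's
 * com.aliasi.util.AbstractExternalizable compilation utility, which
 * serializes a Compilable object and reads back the compiled form):
 *
 *     NGramProcessLM lm = new NGramProcessLM(3);  // 3-gram model
 *     lm.train("abracadabra");
 *     CompiledNGramProcessLM compiledLm
 *         = (CompiledNGramProcessLM) AbstractExternalizable.compile(lm);
 *     double log2Prob = compiledLm.log2Estimate("abracadabra");
 */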