📄 hmmdecoder.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
            return emitProbs;        }        emitProbs = computeEmitProbs(emission);        mEmissionCache.put(emission,emitProbs);        return emitProbs;    }        double[] computeEmitProbs(String emission) {        int numTags = mHmm.stateSymbolTable().numSymbols();        double[] emitProbs = new double[numTags];        for (int i = 0; i < numTags; ++i)            emitProbs[i] = mHmm.emitProb(i,emission);        return emitProbs;    }    double[] emitProbs(String emission) {        return (mEmissionCache == null)            ? computeEmitProbs(emission)            : cachedEmitProbs(emission);    }    double[] cachedEmitLog2Probs(String emission) {        double[] emitLog2Probs = mEmissionLog2Cache.get(emission);        if (emitLog2Probs != null) {            return emitLog2Probs;        }        emitLog2Probs = computeEmitLog2Probs(emission);        mEmissionLog2Cache.put(emission,emitLog2Probs);        return emitLog2Probs;    }    double[] computeEmitLog2Probs(String emission) {        int numTags = mHmm.stateSymbolTable().numSymbols();        double[] emitLog2Probs = new double[numTags];        for (int i = 0; i < numTags; ++i)            emitLog2Probs[i] = mHmm.emitLog2Prob(i,emission);        additiveBeamPrune(emitLog2Probs,mLog2EmissionBeam);        return emitLog2Probs;    }    static void additiveBeamPrune(double[] emitLog2Probs, double beam) {        if (beam == Double.POSITIVE_INFINITY) return; // no pruning        double best = emitLog2Probs[0];        for (int i = 1; i < emitLog2Probs.length; ++i)            if (emitLog2Probs[i] > best)                 best = emitLog2Probs[i];        for (int i = 1; i < emitLog2Probs.length; ++i)            if (emitLog2Probs[i] + beam < best)                emitLog2Probs[i] = Double.NEGATIVE_INFINITY;    }    double[] emitLog2Probs(String emission) {        return (mEmissionLog2Cache == null)            ? computeEmitLog2Probs(emission)            : cachedEmitLog2Probs(emission);    }    /**     * Return a complete tag-word lattice for the specified array of     * string emissions.  Lattices provide forward and backward     * values.     *     * <P><i>Implementation Note:</i> This method is implemented by     * the standard forward-backward dynamic programming algorithm.     * The estimates     * <code>P<sub><sub>&alpha;</sub></sub>(n,state)</code> are of     * the probability of a derivation starting at the beginning and     * arriving at the specified state after the specified number of     * tokens.     *      * <blockquote><code>     * P<sub><sub>&alpha;</sub></sub>(0,state)      *   = P<sub><sub>start</sub></sub>(state)     *     * P<sub><sub>emit</sub></sub>(emissions[0],state)     * </code></blockquote>     * <blockquote><code>     * P<sub><sub>&alpha;</sub></sub>(n,state)     * <br> &nbsp; &nbsp;      * = <big><big>&Sigma;</big></big><sub><sub>sourceState</sub></sub>     *     P<sub><sub>&alpha;</sub></sub>(n-1,sourceState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>transit</sub></sub>(state|sourceState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>emit</sub></sub>(emissions[n]|state)     * </code></blockquote>     *     * Note that the forward probabilities up to a token position     * include the emission probability for that token.     *     * <P>The backward values are defined dually as the probability of     * a derivation ending at a specified state being continued to     * the end of the analysis.       *      * <blockquote><code>     * P<sub><sub>&beta;</sub></sub>(|emissions|-1,state)      *   = P<sub><sub>end</sub></sub>(state)     * </code></blockquote>     *     * <blockquote><code>     * P<sub><sub>&beta;</sub></sub>(n,state)     * <br> &nbsp; &nbsp;      * = <big><big>&Sigma;</big></big><sub><sub>targetState</sub></sub>     *     P<sub><sub>&beta;</sub></sub>(n+1,targetState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>transit</sub></sub>(targetState|state)     * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>emit</sub></sub>(emissions[n+1]|targetState)     * </code></blockquote>     *     * Note that token emission probabilities for a given state are not     * included in the backward score; they are computed with target     * states in a way that matches the forward algorithm's     * definition.  This asymmetry is so that the forward-backward     * estimates <code>P<sub><sub>&gamma;</sub></sub>(n,state)</code>     * correspond to the probability of a derivation being in a given     * state for a given position given the input:     *     * <blockquote><code>     * P<sub><sub>&gamma;</sub></sub>(n,state)     * = P<sub><sub>&alpha;</sub></sub>(n,state)     * * P<sub><sub>&beta;</sub></sub>(n,state)     * </code></blockquote>     *     * These values include the token emission probabilities,     * but may be normalized in the usual Bayesian fashion by     * dividing by the marginal <code>P(emissions)</code>:     *     * <blockquote><code>     * P(n,state|emissions)      * = P<sub><sub>&gamma;</sub></sub>(n,state) / P(emissions)</code>     * </code></blockquote>     *     * where the marginal is just the sum of all forward-backward     * values for any token position <code>i</code>:     *      * <blockquote><code>     * P(emissions)     * = <big><big>&Sigma;</big></big><sub><sub>state</sub></sub>     *     P<sub><sub>&gamma;</sub></sub>(i,state)     * </code></blockquote>     *     * Most of the computation is carried out by the constructor     * {@link TagWordLattice#TagWordLattice(String[],SymbolTable,double[],double[],double[][][])} with the following start, end and transition arrays:     *     * <blockquote><code>     * start(state) = P<sub><sub>start</sub></sub>(0,state)     *                * P<sub><sub>emit</sub></sub>(emissions[0],state)     * </code></blockquote>     *      * <blockquote><code>     * end(state) = P<sub><sub>end</sub></sub>(state)     * </code></blockquote>     *      * <blockquote><code>     * transition(i,sourceState,targetState)     * <br> &nbsp; &nbsp;      * = P<sub><sub>transit</sub></sub>(targetState|sourceState)     * <br> &nbsp; &nbsp; &nbsp;      *   * P<sub><sub>emit</sub></sub>(emissions[i]|targetState)     * </code></blockquote>     *      * @param emissions Array of strings emitted.     * @return Full tag-word lattice for specified emissions.     */    public TagWordLattice lattice(String[] emissions) {        int numTokens = emissions.length;        int numTags = mHmm.stateSymbolTable().numSymbols();        if (numTokens == 0)             return new TagWordLattice(emissions,mHmm.stateSymbolTable(),                                      new double[numTags],                                      new double[numTags],                                      new double[0][numTags][numTags]);        double[] starts = new double[numTags];        double[] emitProbs = emitProbs(emissions[0]);        for (int tagId = 0; tagId < numTags; ++tagId)            starts[tagId]                = mHmm.startProb(tagId) * emitProbs[tagId];            double[][][] transitions = new double[numTokens][][];        for (int i = 1; i < numTokens; ++i) {            double[][] transitionsI = new double[numTags][];            transitions[i] = transitionsI;            double[] emitProbs2 = emitProbs(emissions[i]);            for (int prevTagId = 0; prevTagId < numTags; ++prevTagId) {                double[] transitionsIPrevTag = new double[numTags];                transitions[i][prevTagId] = transitionsIPrevTag;                for (int tagId = 0; tagId < numTags; ++tagId) {                    double transitEstimate = mHmm.transitProb(prevTagId,tagId);                    transitionsIPrevTag[tagId]                         = transitEstimate * emitProbs2[tagId];                }            }        }        double[] ends = new double[numTags];        for (int tagId = 0; tagId < numTags; ++tagId)            ends[tagId] = mHmm.endProb(tagId);        return new TagWordLattice(emissions,mHmm.stateSymbolTable(),                                  starts,ends,transitions);    }                /**     * Returns an array consisting of the states with the highest     * likelihood to emit the specifed array of strings.       *     * <P><i>Implementation Note:</i> This method is implemented with     * the Viterbi algorithm.  The Viterbi algorithm uses dynamic     * programming (memoization) to compute the maximum probability of      * arriving in a state <code>state</code> after consuming     * inputs <code>emissions[0],...,emissions[n-1]</code>:     *      * <blockquote><code>     * P<sub><sub>best</sub></sub>(0,state)      * <br> &nbsp; &nbsp;     *   = P<sub><sub>start</sub></sub>(state)     * <br> &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>emit</sub></sub>(emissions[0],state)     * </code></blockquote>     *     * <blockquote><code>     * P<sub><sub>best</sub></sub>(n,state)     * <br> &nbsp; &nbsp;     * = MAX<sub><sub>prevState</sub></sub>     *     P<sub><sub>best</sub></sub>(n-1,prevState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>transit</sub></sub>(state|prevState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;     *     * P<sub><sub>emit</sub></sub>(emissions[n]|state)     * </code></blockquote>     *     * <blockquote><code>     * P<sub><sub>best</sub></sub>(last,state)      *    *= P<sub><sub>end</sub></sub>(state)     * </code></blockquote>     *     * Note that the initial condition uses the start probability     * rather than the transition times the previous best probability.     * The notation in the last line is meant to indicate that     * the last index has the probability of a state being the     * last state multiplied in.  As usual, we use logarithms     * and additions rather than multiplication.     *      * <P>As usual, the algorithm employs an array of backpointers     * from a state at a given input to the last state along     * the best path.  This is computed by simply recording the     * state maximizing the above equation:     *     * <blockquote><code>     * backPtr(0,state) = null     * </code></blockquote>     *     * <blockquote><code>     * backPtr(n,state)      * <br> &nbsp; &nbsp;     * = ARGMAX<sub><sub>prevState</sub></sub>     *       P<sub><sub>best</sub></sub>(n-1,prevState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;      *      &nbsp; &nbsp;     *       * P<sub><sub>transit</sub></sub>(state|prevState)      * <br> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;      *      &nbsp; &nbsp;     *       * P<sub><sub>emit</sub></sub>(emissions[n]|state)     * </code></blockquote>     *     * By tracing the array of backpointers from the best final     * state, the best path can be recovered.     *     * @param emissions Array of strings emitted.     * @return Array of states most likely to have emitted the     * specified strings.     */    public String[] firstBest(String[] emissions) {        if (emissions.length == 0)             return EMPTY_STRING_ARRAY;        return new Viterbi(emissions).bestStates();    }    /**     * Returns a best-first iterator of {@link ScoredObject} instances     * consisting of arrays of tags and log (base 2) joint likelihoods     * of the tags and emissions with respect to the underlying HMM.     * Only analyses with non-zero probability estimates are returned.     *     * <P><i>Implementation Note:</i> This method is implemented by     * doing a Viterbi search to provide exact A<sup>*</sup> bounds
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -