📄 tagwordlattice.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.hmm;import com.aliasi.symbol.SymbolTable;import com.aliasi.util.ScoredObject;import java.util.Arrays;/** * A <code>TagWordLattice</code> encodes a lattice resulting from * decoding a hidden Markov model (HMM).  The lattice encodes the * tokens used as input and the tag symbol table, as well as matrices * for transition, forward and backward scores. * * <P>The lattice probabilities are factored into start, transition, * and end probabilities.  In general, the start, transition and * forward probabilities include the emission probabilities of their * destination tag.  The backward probabilities include all emissions * up to, but not including, the indexed node. * * @author  Bob Carpenter * @version 3.0 * @since   LingPipe2.1 */public class TagWordLattice {    final double[][][] mTransitions;    final double[][] mForwards;    final double[] mForwardExps;    final double[][] mBacks;    final double[] mBackExps;    final double[] mStarts;    final double[] mEnds;    final String[] mTokens;    final SymbolTable mTagSymbolTable;    double mTotal = Double.NaN;    double mLog2Total = Double.NaN;    /**     * Construct a tag-word lattice for the specified token inputs and     * the specified tag symbol table with the specified estimates.     
* This constructor also allocates the forward and backward arrays     * which are the size of the number of tokens times the number of     * tags.     *     * <P>There are a number of consistency conditions on the input:     *     * <UL>     *     * <LI> The tag symbol table should not be empty.       *     * <LI> Start and end probability arrays must be the same length     * as the number of tags in the symbol table.  Start tags include     * the start tag estimate and the emission estimate for the first     * token.  End tags only include the end transition probabilities.     *     * <LI> Transit probability array should be dimension of number of     * tokens times number of tags times number of tags.  They should     * contain estimates of transition from the first tag to the     * second tag and emitting the token.     *     * <LI> Values of the transit probabilities for the first token are     * ignored because they are computed by the start probs without     * previous states.     *     * </UL>     *     * @param tokens Array of input tokens.     * @param tagSymbolTable Symbol table for tags.     * @param startProbs Array of start probabilities.     * @param endProbs Array of end probabilities.     * @param transitProbs Array of transition probabilities.     * @throws IllegalArgumentException If any of the probabilities are     * not between 0.0 and 1.0 inclusive.     
*/    public TagWordLattice(String[] tokens,                          SymbolTable tagSymbolTable,                          double[] startProbs,                          double[] endProbs,                          double[][][] transitProbs) {        for (int i = 0; i < startProbs.length; ++i) {            if (startProbs[i] < 0.0 || startProbs[i] > 1.0) {                String msg = "startProbs[" + i + "]=" + startProbs[i];                throw new IllegalArgumentException(msg);            }        }        for (int i = 0; i < endProbs.length; ++i) {            if (endProbs[i] < 0.0 || endProbs[i] > 1.0) {                String msg = "endProbs[" + i + "]=" + endProbs[i];                throw new IllegalArgumentException(msg);            }        }        for (int i = 1; i < transitProbs.length; ++i) {            for (int j = 0; j < transitProbs[i].length; ++j) {                for (int k = 0; k < transitProbs[i][j].length; ++k) {                    if (transitProbs[i][j][k] < 0.0 || transitProbs[i][j][k] > 1.0) {                        String msg = "transitProbs[" + i + "][" + j + "][" + k + "]="                             + transitProbs[i][j][k];                        throw new IllegalArgumentException(msg);                    }                }            }        }        int numTags = tagSymbolTable.numSymbols();        int numTokens = tokens.length;        mStarts = startProbs;        mEnds = endProbs;        mTransitions = transitProbs;        mTokens = tokens;        mTagSymbolTable = tagSymbolTable;        mForwards = new double[numTokens][numTags];        mForwardExps = new double[numTokens];        // Arrays.fill(mForwardExps,0.0);        mBacks = new double[numTokens][numTags];        mBackExps = new double[numTokens];        // Arrays.fill(mBackExps,0.0);        computeAll();    }    /**     * Returns the array of tokens underlying this tag-word lattice.     *      * @return The array of tokens for this lattice.     
*/
    public String[] tokens() {
        return this.mTokens;
    }

    /**
     * Returns the symbol table used by this lattice to convert
     * between tag identifiers and tag names.
     *
     * @return The tag symbol table of this lattice.
     */
    public SymbolTable tagSymbolTable() {
        return this.mTagSymbolTable;
    }

    /**
     * Returns one scored object per tag for the token at the specified
     * index, ordered by descending score.  Each score is the log
     * (base 2) conditional probability of the tag being assigned to
     * that token given the whole token sequence, computed as the log
     * forward-backward score minus the log total for the lattice.
     *
     * <P>Scores are normalized before being returned: a value above
     * zero is replaced by zero, and a value that is not a number or
     * is infinite is replaced by the log (base 2) of
     * <code>Double.MIN_VALUE</code>.
     *
     * @param tokenIndex Token index whose tags are returned.
     * @return Array of scored tags for the specified index.
     */
    public ScoredObject<String>[] log2ConditionalTags(int tokenIndex) {
        final double normalizer = log2Total();
        final SymbolTable symbols = mTagSymbolTable;
        final int tagCount = symbols.numSymbols();
        ScoredObject<String>[] result
            = (ScoredObject<String>[]) new ScoredObject[tagCount];
        int id = 0;
        while (id < tagCount) {
            String tagName = symbols.idToSymbol(id);
            double raw = log2ForwardBackward(tokenIndex,id) - normalizer;
            // Positive values (including +infinity) clamp to 0.0; the
            // remaining non-finite values fall back to the log floor.
            double score = (raw > 0.0)
                ? 0.0
                : ((Double.isNaN(raw) || Double.isInfinite(raw))
                   ? com.aliasi.util.Math.log2(Double.MIN_VALUE)
                   : raw);
            result[id] = new ScoredObject<String>(tagName,score);
            ++id;
        }
        Arrays.sort(result,ScoredObject.REVERSE_SCORE_COMPARATOR);
        return result;
    }

    /**
     * Returns the array of tags with the best forward-backward
     * probabilities for each token position.
     *
     * <P><i>Note:</i> This is the independent optimization of each
     * position and is not guaranteed to yield the sequence of states
     * that has the highest probability.
     *
     * @return Array of tags with the best forward-backward scores.
*/
    public String[] bestForwardBackward() {
        final String[] winners = new String[mTokens.length];
        final int tagCount = mTagSymbolTable.numSymbols();
        for (int pos = 0; pos < winners.length; ++pos) {
            // Tag 0 is the initial champion; later tags must be strictly
            // better to replace it, so ties keep the lowest identifier.
            int argMax = 0;
            double max = forwardBackward(pos,0);
            int candidate = 1;
            while (candidate < tagCount) {
                double score = forwardBackward(pos,candidate);
                if (score > max) {
                    argMax = candidate;
                    max = score;
                }
                ++candidate;
            }
            winners[pos] = mTagSymbolTable.idToSymbol(argMax);
        }
        return winners;
    }

    /**
     * Returns the start probability stored for the tag with the
     * specified identifier.  Per the constructor contract this covers
     * both starting in the tag and emitting the first input token.
     *
     * @param tagId Identifier for the tag in the symbol table.
     * @return Start probability.
     * @throws IndexOutOfBoundsException If the tagId is out of bounds.
     */
    public double start(int tagId) {
        final double[] starts = mStarts;
        return starts[tagId];
    }

    /**
     * Returns the log (base 2) of the start probability for the tag
     * with the specified identifier.  See {@link #start(int)} for
     * what the start probability covers.
     *
     * @param tagId Identifier for the tag in the symbol table.
     * @return Log start probability.
     * @throws IndexOutOfBoundsException If the tagId is out of bounds.
     */
    public double log2Start(int tagId) {
        final double startProb = start(tagId);
        return com.aliasi.util.Math.log2(startProb);
    }

    /**
     * Return the probability of the lattice ending with the specified
     * tag.  Note that this does not include the probability of
     * emitting the final token.
     *
     * @param tagId Identifier for the tag in the symbol table.
     * @return End probability.
     * @throws IndexOutOfBoundsException If the tag identifier is out
     * of bounds.
*/
    public double end(int tagId) {
        final double[] ends = mEnds;
        return ends[tagId];
    }

    /**
     * Returns the log (base 2) of the end probability for the
     * specified tag.  See {@link #end(int)} for what the end
     * probability covers.
     *
     * @param tagId Identifier for the tag in the symbol table.
     * @return Log end probability.
     * @throws IndexOutOfBoundsException If the tag identifier is out
     * of bounds.
     */
    public double log2End(int tagId) {
        final double endProb = end(tagId);
        return com.aliasi.util.Math.log2(endProb);
    }

    /**
     * Returns the transition probability for the specified token index
     * and source and target tag identifiers.  This transition
     * probability includes the transition from the source tag
     * to the target tag times the probability of the target tag
     * emitting the token at the specified index.
     *
     * <P>Note that the token index cannot be zero here, as it is the
     * index of the target of a transition.
     *
     * @param tokenIndex Index of token.
     * @param sourceTagId Identifier for source tag in symbol table.
     * @param targetTagId Identifier for target tag in symbol table.
     * @return Transition score from source tag to target tag arriving
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -