📄 compiledestimator.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
字号:
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.chunk;

import com.aliasi.symbol.SymbolTable;
import com.aliasi.tokenizer.TokenCategorizer;

import java.io.IOException;
import java.io.ObjectInput;

import java.util.ArrayList;

/**
 * A compiled estimator is constructed by reading a binary model
 * compiled by a trainable estimator from an object input.  The
 * estimator may then be used to estimate instances of <code>log
 * P(Tag,Token|Tag-1,Token-1,Token-2)</code> using the {@link
 * #estimate(int,int,int,int,int)} method, where the integer values
 * are identifiers of the associated symbols in the appropriate (tag or
 * token) symbol table.  The symbol tables are stored in the compiled
 * estimator.  Various operations on tags as identifiers are
 * precomputed and supplied by methods in this class.
 *
 * <p> The components of a compiled estimator are read in the
 * following order.
 *
 * <br/><br/>
 * <table cellpadding="5" border="1">
 * <tr><td width="25%"><b>Variable</b></td>
 *     <td width="25%"><b>Type</b></td>
 *     <td width="50%"><b>Description</b></td></tr>
 * <tr><td><code>tokenCategorizer</code></td>
 *     <td><code>TokenCategorizer</code></td>
 *     <td>Categorizer for tokens.</td></tr>
 * <tr><td><code>tagSymbolTable</code></td>
 *     <td><code>SymbolTable</code></td>
 *     <td>Symbol table for tags.</td></tr>
 * <tr><td><code>tokenSymbolTable</code></td>
 *     <td><code>SymbolTable</code></td>
 *     <td>Symbol table for tokens.</td></tr>
 * <tr><td><code>tagTrie</code></td>
 *     <td><code>EstimatorTrie</code></td>
 *     <td>Estimator trie for tags.</td></tr>
 * <tr><td><code>tokenTrie</code></td>
 *     <td><code>EstimatorTrie</code></td>
 *     <td>Estimator trie for tokens.</td></tr>
 * <tr><td><code>logVocabEstimate</code></td>
 *     <td><code>double</code></td>
 *     <td>Estimate of log likelihood of a token.</td></tr>
 * </table>
 * <br/>
 *
 * @author  Bob Carpenter
 * @version 2.3
 * @since   LingPipe1.0
 */
final class CompiledEstimator {

    /**
     * A trie of estimates and backoffs for <code>log
     * P(Tag|Tag-1,Token-1,Token-2)</code>.
     */
    private final EstimatorTrie mTagTrie;

    /**
     * A trie of estimates and backoffs for <code>log
     * P(Token|Tag,Tag-1,Token-1)</code>.
     */
    private final EstimatorTrie mTokenTrie;

    /**
     * A symbol table for tags.
     */
    private final SymbolTable mTagSymbolTable;

    /**
     * The symbol table for tokens.
     */
    private final SymbolTable mTokenSymbolTable;

    /**
     * <code>mCannotFollow[tagID][tagMinus1ID]</code> is true
     * if and only if a tag with identifier <code>tagID</code> cannot
     * follow a tag of identifier <code>tagMinus1ID</code>.
     */
    private final boolean[][] mCannotFollow;

    /**
     * <code>mConvertToInterior[tagID]</code> is the interior
     * tag ID with the same base tag as <code>tagID</code>.
     */
    private final int[] mConvertToInterior;

    /**
     * Array of start tag identifiers; tags are all prefixed by "ST_".
     * Does not include "OUT" tag.
     */
    private final int[] mStart;

    /**
     * Array of interior tag identifiers; tags not prefixed by "ST_".
     */
    private final int[] mInterior;

    /**
     * Natural log of the uniform vocabulary estimate for this
     * estimator.
     */
    private final double mLogUniformVocabEstimate;

    /**
     * Categorizer to provide token categories for smoothed
     * estimates and unknown estimates.
     */
    private final TokenCategorizer mTokenCategorizer;

    /**
     * Construct a compiled estimator by reading, in order, the token
     * categorizer, the tag and token symbol tables, the tag and token
     * estimator tries, and the log estimate of the uniform vocabulary
     * likelihood used for smoothing the token model.  The tag
     * compatibility table and the start/interior tag arrays are then
     * precomputed from the tag symbol table.
     *
     * @param in Object input from which to read the estimator.
     *
     * @throws ClassNotFoundException If the class of a serialized
     * object read from the input cannot be found.
     * @throws IOException If there is an I/O exception reading from
     * the object input.
     */
    public CompiledEstimator(ObjectInput in)
        throws ClassNotFoundException, IOException {

        // the read order below is the serialization format; do not reorder
        mTokenCategorizer = (TokenCategorizer) in.readObject();
        mTagSymbolTable = (SymbolTable) in.readObject();
        mTokenSymbolTable = (SymbolTable) in.readObject();

        // read from model & put in training
        mTagTrie = new EstimatorTrie(in);
        mTokenTrie = new EstimatorTrie(in);
        mLogUniformVocabEstimate = in.readDouble();

        int numTags = mTagSymbolTable.numSymbols();
        mConvertToInterior = new int[numTags];
        mCannotFollow = new boolean[numTags][numTags];

        ArrayList<Integer> starts = new ArrayList<Integer>();
        ArrayList<Integer> interiors = new ArrayList<Integer>();
        for (int tagID = 0; tagID < numTags; ++tagID) {
            String tag = idToTag(tagID);
            mConvertToInterior[tagID] = tagToInteriorID(tag);
            // a tag whose interior form differs from itself is a start
            // tag; record the pair at the same index in both lists so
            // the resulting arrays stay aligned
            if (tagID != mConvertToInterior[tagID]) {
                interiors.add(Integer.valueOf(mConvertToInterior[tagID]));
                starts.add(Integer.valueOf(tagID));
            }
            for (int tagMinus1ID = 0; tagMinus1ID < numTags; ++tagMinus1ID) {
                mCannotFollow[tagID][tagMinus1ID]
                    = Tags.illegalSequence(idToTag(tagMinus1ID), tag);
            }
        }
        mStart = convertToIntArray(starts);
        mInterior = convertToIntArray(interiors);
    }

    /**
     * Returns the array of start tag IDs.  The array returned is
     * aligned with the interior tag IDs returned by {@link
     * #interiorTagIDs()}.  The returned array is the estimator's
     * internal array, not a copy, and must not be modified.
     *
     * @return Array of identifiers for start tags.
     */
    public int[] startTagIDs() {
        return mStart;
    }

    /**
     * Returns the array of interior tag IDs.  The array returned is
     * aligned with the start tag IDs returned by {@link
     * #startTagIDs()}.  The returned array is the estimator's
     * internal array, not a copy, and must not be modified.
     *
     * @return Array of identifiers for interior tags.
     */
    public int[] interiorTagIDs() {
        return mInterior;
    }

    /**
     * Returns number of possible tags produced by this estimator,
     * including both base and start forms of tags and the
     * distinguished out tag.
     *
     * @return Number of possible tags produced by this estimator.
     */
    public int numTags() {
        return mTagSymbolTable.numSymbols();
    }

    /**
     * Maps a tag to its integer identifier or <code>-1</code> if it
     * is not in the table.
     *
     * @param tag Name of tag.
     * @return Integer identifier for the specified tag or
     * <code>-1</code> if it is not in the table.
     */
    public int tagToID(String tag) {
        return mTagSymbolTable.symbolToID(tag);
    }

    /**
     * Maps a tag identifier to the name of that tag.  Throws an array out of
     * bounds exception if the identifier does not exist in the table.
     *
     * @param id Identifier of the tag to return.
     * @return Name of the tag identified by the specified identifier.
     */
    public String idToTag(int id) {
        return mTagSymbolTable.idToSymbol(id);
    }

    /**
     * Maps a token to its integer identifier or <code>-1</code> if it
     * is not in the table.
     *
     * @param token Name of token.
     * @return Integer identifier for the specified token or
     * <code>-1</code> if it is not in the table.
     */
    public int tokenToID(String token) {
        return mTokenSymbolTable.symbolToID(token);
    }

    /**
     * Maps a token to its integer identifier if it is in the symbol
     * table, or to the identifier of its token category.  If the
     * category is not in the symbol table either, a message is
     * printed to standard error and the negative identifier is
     * returned.
     *
     * @param token Token to compute ID for.
     * @return Identifier of token if it exists, or identifier of its
     * category if nothing is known about the token.
     */
    public int tokenOrCategoryToID(String token) {
        int id = tokenToID(token);
        if (id < 0) {
            id = tokenToID(mTokenCategorizer.categorize(token));
            if (id < 0) {
                System.err.println("No id for token category: " + token);
            }
        }
        return id;
    }

    /**
     * Maps an integer identifier to the token it represents
     * in the token symbol table.
     *
     * @param id Identifier of the token.
     * @return Token with specified identifier in the token symbol
     * table.
     */
    public String idToToken(int id) {
        return mTokenSymbolTable.idToSymbol(id);
    }

    /**
     * Returns <code>true</code> if the tag identified by the first
     * identifier cannot follow the tag identified by the second
     * identifier.
     *
     * @param tagID Identifier of tag.
     * @param tagMinus1ID Identifier of preceding tag.
     * @return <code>true</code> if the tag for <code>tagID</code>
     * cannot follow the tag for <code>tagMinus1ID</code>.
     */
    public boolean cannotFollow(int tagID, int tagMinus1ID) {
        return mCannotFollow[tagID][tagMinus1ID];
    }

    /**
     * Returns the identifier for the base tag of
     * the tag picked out by the specified identifier.
     *
     * @param tagID Identifier of tag to convert to base form.
     * @return Identifier of the base form of the tag picked out by
     * the specified identifier.
     */
    private int idToInteriorID(int tagID) {
        return mConvertToInterior[tagID];
    }

    /**
     * Returns <code>log P(tag,token|tag-1,token-1,token-2)</code>,
     * where information about the tags and tokens are supplied
     * through symbol table identifiers.  The previous tag is
     * converted to its interior form before being used as context.
     *
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tokenID Identifier of outcome token to estimate along
     * with the tag.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @param tokenMinus2ID Token two back from token.
     * @return <code>log P(tag,token|tag-1,token-1,token-2)</code>, or
     * <code>Double.NaN</code> if the tag cannot follow the previous
     * tag or if either the tag or token estimate is
     * <code>Double.NaN</code>.
     */
    public double estimate(int tagID, int tokenID,
                           int tagMinus1ID,
                           int tokenMinus1ID,
                           int tokenMinus2ID) {
        if (cannotFollow(tagID, tagMinus1ID)) {
            return Double.NaN;
        }
        int tagMinus1IDInterior = idToInteriorID(tagMinus1ID);
        return estimateTag(tagID, tagMinus1IDInterior,
                           tokenMinus1ID, tokenMinus2ID)
            + estimateToken(tokenID, tagID, tagMinus1IDInterior,
                            tokenMinus1ID);
    }

    /**
     * Return <code>log P(tag|tag-1,token-1,token-2)</code>.  Returns
     * <code>Double.NaN</code> when nothing is known about
     * <code>tag-1</code>.
     *
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @param tokenMinus2ID Token two back from token.
     * @return <code>log P(tag|tag-1,token-1,token-2)</code>.
     */
    private double estimateTag(int tagID,
                               int tagMinus1ID,
                               int tokenMinus1ID,
                               int tokenMinus2ID) {
        // find most specific node matching context,
        // then lookup estimate from there
        // estimating from node follows backoffs,
        // adding 1-lambda from current context as necessary
        int nodeTag1Index = mTagTrie.lookupChild(tagMinus1ID, 0);
        if (nodeTag1Index == -1) {
            // no outcomes for simple tag -- really an error
            return Double.NaN;
        }
        int nodeTag1W1Index
            = mTagTrie.lookupChild(tokenMinus1ID, nodeTag1Index);
        if (nodeTag1W1Index == -1) {
            return mTagTrie.estimateFromNode(tagID, nodeTag1Index);
        }
        int nodeTag1W1W2Index
            = mTagTrie.lookupChild(tokenMinus2ID, nodeTag1W1Index);
        if (nodeTag1W1W2Index == -1) {
            return mTagTrie.estimateFromNode(tagID, nodeTag1W1Index);
        }
        return mTagTrie.estimateFromNode(tagID, nodeTag1W1W2Index);
    }

    /**
     * Return <code>log P(token|tag,tag-1,token-1)</code>, where
     * information about the tags and tokens are supplied through
     * symbol table identifiers.  Return <code>Double.NaN</code> if
     * nothing is known about <code>tag</code>.
     *
     * @param tokenID Identifier of outcome token to estimate along
     * with the tag.
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @return  <code>log P(token|tag,tag-1,token-1)</code>.
     */
    private double estimateToken(int tokenID,
                                 int tagID, int tagMinus1ID,
                                 int tokenMinus1ID) {
        int nodeTagIndex = mTokenTrie.lookupChild(tagID, 0);
        if (nodeTagIndex == -1) {
            return Double.NaN;
        }
        int nodeTagTag1Index
            = mTokenTrie.lookupChild(tagMinus1ID, nodeTagIndex);
        if (nodeTagTag1Index == -1) {
            // no node for (tag, tag-1): estimate from the tag node
            return mTokenTrie
                .estimateFromNodeUniform(tokenID,
                                         nodeTagIndex,
                                         mLogUniformVocabEstimate);
        }
        int nodeTagTag1W1Index
            = mTokenTrie.lookupChild(tokenMinus1ID, nodeTagTag1Index);
        if (nodeTagTag1W1Index != -1) {
            // most specific context (tag, tag-1, token-1) is available
            return mTokenTrie
                .estimateFromNodeUniform(tokenID,
                                         nodeTagTag1W1Index,
                                         mLogUniformVocabEstimate);
        }
        return mTokenTrie
            .estimateFromNodeUniform(tokenID,
                                     nodeTagTag1Index,
                                     mLogUniformVocabEstimate);
    }

    /**
     * Return the identifier for the base tag corresponding
     * to the specified tag.
     *
     * @param tag Tag whose base tag ID is returned.
     * @return Identifier for base tag of specified tag.
     */
    private int tagToInteriorID(String tag) {
        return tagToID(Tags.toInnerTag(tag));
    }

    /**
     * Convert the array list of <code>Integer</code> objects to an
     * array of their integer values, preserving order.
     *
     * @param xs Array list of Integer objects.
     * @return Array of integer values for the specified list of
     * objects.
     */
    private static int[] convertToIntArray(ArrayList<Integer> xs) {
        int[] result = new int[xs.size()];
        for (int i = 0; i < result.length; ++i) {
            result[i] = xs.get(i).intValue();
        }
        return result;
    }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -