📄 trainableestimator.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.chunk;import com.aliasi.symbol.SymbolTableCompiler;import com.aliasi.tokenizer.TokenCategorizer;import com.aliasi.util.AbstractExternalizable;import com.aliasi.util.Compilable;import java.io.IOException;import java.io.ObjectInput;import java.io.ObjectOutput;import java.util.Iterator;import java.util.LinkedList;import java.util.TreeSet;// used only by TrainTokenShapeChunkerfinal class TrainableEstimator implements Compilable {    /**     * Root of the trie representing next tag contexts and outcomes.     */    private Node mRootTagNode;    /**     * Root of the trie representing next token contexts and outcomes.     */    private Node mRootTokenNode;    /**     * The symbol table used for tokens.     */    private final SymbolTableCompiler mTokenSymbolTable         = new SymbolTableCompiler();    /**     * The symbol table used for tags.     */    private final SymbolTableCompiler mTagSymbolTable         = new SymbolTableCompiler();    /**     * The value of the lambda factor.     */    private double mLambdaFactor;    /**     * The natural log of the estimate of the likelihood of a lexical     * item.     */    private double mLogUniformVocabEstimate;    /**     * The token categorizer for this estimator to compute estimates     * for unknown tokens.     
*/    private final TokenCategorizer mTokenCategorizer;    /**     * Construct a trainable estimator with the specified     * lambda factor and log uniform vocabulary estimate.     *     * @param lambdaFactor Lambda factor to use for this estimator.     * @param logUniformVocabEstimate Natural log of the uniform     * vocabulary estimate for smoothing.     * @param categorizer Token categorizer to categorize unknown tokens.     */    public TrainableEstimator(double lambdaFactor,                              double logUniformVocabEstimate,                              TokenCategorizer categorizer) {        mLambdaFactor = lambdaFactor;        mLogUniformVocabEstimate = logUniformVocabEstimate;        mTokenCategorizer = categorizer;        mRootTagNode = new Node(null,mTagSymbolTable,null);        mRootTokenNode = new Node(null,mTokenSymbolTable,null);        mTagSymbolTable.addSymbol(Tags.OUT_TAG);    }    /**     * Construct a trainable estimator with default values for the     * lambda factor of <code>4.0</code> and for the log uniform     * vocabulary estimate of <code>Math.log(1.0/1000000.0)</code>.     * These may be set before writing the estimator to file with     * {@link #setLambdaFactor(double)} and {@link     * #setLogUniformVocabularyEstimate(double)}.     *     * @param categorizer Token categorizer to categorize unknown tokens.     */    public TrainableEstimator(TokenCategorizer categorizer) {        this (4.0,              Math.log(1.0/1000000.0),              categorizer);    }    /**     * Sets the lambda factor to the specified value, which must be a     * non-negative, non-infinite double.     *     * @param lambdaFactor Lambda factor to set.     * @throws IllegalArgumentException If the specified factor is     * negative, infinite, or not a number.     
*/
    public void setLambdaFactor(double lambdaFactor) {
        // Zero is accepted; only negatives, NaN and infinities are rejected.
        if (lambdaFactor < 0.0
            || Double.isNaN(lambdaFactor)
            || Double.isInfinite(lambdaFactor)) {
            throw new
                IllegalArgumentException("Lambda factor must be finite and >= 0."
                                         + " Was=" + lambdaFactor);
        }
        mLambdaFactor = lambdaFactor;
    }

    /**
     * Sets the log uniform vocabulary estimate to the specified
     * value, which must be a negative, non-infinite number.
     *
     * @param estimate Log uniform vocabulary estimate to set.
     * @throws IllegalArgumentException If the specified estimate is
     * not a number or is positive, zero or infinite.
     */
    public void setLogUniformVocabularyEstimate(double estimate) {
        // Zero and positive values, NaN and infinities are all rejected.
        if (estimate >= 0.0
            || Double.isNaN(estimate)
            || Double.isInfinite(estimate)) {
            throw new
                IllegalArgumentException("Log vocab estimate must be finite and < 0."
                                         + " Was=" + estimate);
        }
        mLogUniformVocabEstimate = estimate;
    }

    /**
     * Train the estimator based on the specified parallel arrays of
     * tokens and tags.
     *
     * @param tokens Array of tokens on which to train.
     * @param tags Array of tags on which to train.
*/    public void handle(String[] tokens, String[] tags) {        // System.out.println("tokens(" + java.util.Arrays.asList(tokens) + ")");        // System.out.println("tags(" + java.util.Arrays.asList(tags) + ")");        // train first token/tag pair given dummy starts        if (tokens.length < 1) return;        trainOutcome(tokens[0],tags[0],                     Tags.START_TAG,                     Tags.START_TOKEN,Tags.START_TOKEN);        if (tokens.length < 2) {            // train final token/tag pair            trainOutcome(Tags.START_TOKEN,Tags.START_TAG,                         tags[0],                         tokens[0], Tags.START_TOKEN);            return;        }        // train second token/tag pair w extensions        trainOutcome(tokens[1],tags[1],                     tags[0],                     tokens[0],Tags.START_TOKEN);        // train rest of pairs given        for (int i = 2; i < tokens.length; ++i)            trainOutcome(tokens[i],tags[i],                         tags[i-1],                         tokens[i-1],tokens[i-2]);        // train final token/tag pair beyond the last        trainOutcome(Tags.START_TOKEN, Tags.START_TAG,                     tags[tags.length-1],                     tokens[tokens.length-1], tokens[tokens.length-2]);    }    /**     * Write a compiled version of this estimator to the specified     * data output stream.  Caller is responsible for closing the     * stream.     *     * @param out Data output stream to which to write a compiled     * version of this estimator.     * @throws IOException If there is an exception in the writing to     * data output stream.     
*/    public void compileTo(ObjectOutput out) throws IOException {        out.writeObject(new Externalizer(this));    }    static class Externalizer extends AbstractExternalizable {        private static final long serialVersionUID = 4179100933315980535L;        final TrainableEstimator mEstimator;        public Externalizer() {             this(null);        }        public Externalizer(TrainableEstimator estimator) {            mEstimator = estimator;        }        public Object read(ObjectInput in)             throws ClassNotFoundException, IOException {                    return new CompiledEstimator(in);        }        public void writeExternal(ObjectOutput objOut) throws IOException {            ((Compilable) mEstimator.mTokenCategorizer).compileTo(objOut);            mEstimator.generateSymbols();            mEstimator.mTagSymbolTable.compileTo(objOut);            mEstimator.mTokenSymbolTable.compileTo(objOut);            mEstimator.writeEstimator(mEstimator.mRootTagNode,objOut);            mEstimator.writeEstimator(mEstimator.mRootTokenNode,objOut);            objOut.writeDouble(mEstimator.mLogUniformVocabEstimate);        }    }        /**     * Train the estimator for a specific token and tag outcome     * given a context.  If specified arguments are <code>null</code>,     * only the non-<code>null</code> outcomes and contexts are     * used for training.     *     * @param token Token outcome.     * @param tag Tag outcome.     * @param tagMinus1 Tag assigned to previous token.     * @param tokenMinus1 Previous token.     * @param tokenMinus2 Token occurring two tokens back.     */    public void trainOutcome(String token, String tag,                             String tagMinus1,                             String tokenMinus1, String tokenMinus2) {        mTagSymbolTable.addSymbol(tag);        mTokenSymbolTable.addSymbol(token);        String tagMinus1Interior            = (tagMinus1 == null)            ? 
null             : Tags.toInnerTag(tagMinus1);        trainTokenModel(token,tag,tagMinus1Interior,tokenMinus1);        trainTagModel(tag,tagMinus1Interior,tokenMinus1,tokenMinus2);    }    /**     * Generates the symbol tables from the trie structures     * representing the counts.     */    private void generateSymbols() {        mRootTagNode.generateSymbols();        // mRootTagNode.printSymbols();        mRootTokenNode.generateSymbols();        // mRootTokenNode.printSymbols();        // make sure all token category symbols have token ids        String[] tokenCategories = mTokenCategorizer.categories();        for (int i = 0; i < tokenCategories.length; ++i)            mTokenSymbolTable.addSymbol(tokenCategories[i]);    }    /**     * Train the token half of the model.  If specified tags or     * tokens are <code>null</code>, only the non-<code>null</code>     * events and contexts are used for training.     *     * @param token Token outcome.     * @param tag Tag outcome.     * @param tagMinus1 Tag assigned to previous token.     * @param tokenMinus1 Previous token.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -