📄 trainableestimator.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.chunk;import com.aliasi.symbol.SymbolTableCompiler;import com.aliasi.tokenizer.TokenCategorizer;import com.aliasi.util.AbstractExternalizable;import com.aliasi.util.Compilable;import java.io.IOException;import java.io.ObjectInput;import java.io.ObjectOutput;import java.util.Iterator;import java.util.LinkedList;import java.util.TreeSet;// used only by TrainTokenShapeChunkerfinal class TrainableEstimator implements Compilable { /** * Root of the trie representing next tag contexts and outcomes. */ private Node mRootTagNode; /** * Root of the trie representing next token contexts and outcomes. */ private Node mRootTokenNode; /** * The symbol table used for tokens. */ private final SymbolTableCompiler mTokenSymbolTable = new SymbolTableCompiler(); /** * The symbol table used for tags. */ private final SymbolTableCompiler mTagSymbolTable = new SymbolTableCompiler(); /** * The value of the lambda factor. */ private double mLambdaFactor; /** * The natural log of the estimate of the likelihood of a lexical * item. */ private double mLogUniformVocabEstimate; /** * The token categorizer for this estimator to compute estimates * for unknown tokens. */ private final TokenCategorizer mTokenCategorizer; /** * Construct a trainable estimator with the specified * lambda factor and log uniform vocabulary estimate. * * @param lambdaFactor Lambda factor to use for this estimator. * @param logUniformVocabEstimate Natural log of the uniform * vocabulary estimate for smoothing. * @param categorizer Token categorizer to categorize unknown tokens. */ public TrainableEstimator(double lambdaFactor, double logUniformVocabEstimate, TokenCategorizer categorizer) { mLambdaFactor = lambdaFactor; mLogUniformVocabEstimate = logUniformVocabEstimate; mTokenCategorizer = categorizer; mRootTagNode = new Node(null,mTagSymbolTable,null); mRootTokenNode = new Node(null,mTokenSymbolTable,null); mTagSymbolTable.addSymbol(Tags.OUT_TAG); } /** * Construct a trainable estimator with default values for the * lambda factor of <code>4.0</code> and for the log uniform * vocabulary estimate of <code>Math.log(1.0/1000000.0)</code>. * These may be set before writing the estimator to file with * {@link #setLambdaFactor(double)} and {@link * #setLogUniformVocabularyEstimate(double)}. * * @param categorizer Token categorizer to categorize unknown tokens. */ public TrainableEstimator(TokenCategorizer categorizer) { this (4.0, Math.log(1.0/1000000.0), categorizer); } /** * Sets the lambda factor to the specified value, which must be a * non-negative, non-infinite double. * * @param lambdaFactor Lambda factor to set. * @throws IllegalArgumentException If the specified factor is * negative, infinite, or not a number. */ public void setLambdaFactor(double lambdaFactor) { if (lambdaFactor < 0.0 || Double.isNaN(lambdaFactor) || Double.isInfinite(lambdaFactor)) throw new IllegalArgumentException("Lambda factor must be > 0." + " Was=" + lambdaFactor); mLambdaFactor = lambdaFactor; } /** * Sets the log uniform vocabulary estimate to the specified * value, which must be a negative, non-infinite number. * * @param estimate Log Uniform vocabulary estimate to set. * @throws IllegalArgumentException If the specified factor is not * a number or is positive, zero or infinite. */ public void setLogUniformVocabularyEstimate(double estimate) { if (estimate >= 0.0 || Double.isNaN(estimate) || Double.isInfinite(estimate)) throw new IllegalArgumentException("Log vocab estimate must be < 0." + " Was=" + estimate); mLogUniformVocabEstimate = estimate; } /** * Train the estimator based on the specified parallel arrays of * tokens and tags. * * @param tokens Array of tokens on which to train. * @param tags Array of tags on which to train. */ public void handle(String[] tokens, String[] tags) { // System.out.println("tokens(" + java.util.Arrays.asList(tokens) + ")"); // System.out.println("tags(" + java.util.Arrays.asList(tags) + ")"); // train first token/tag pair given dummy starts if (tokens.length < 1) return; trainOutcome(tokens[0],tags[0], Tags.START_TAG, Tags.START_TOKEN,Tags.START_TOKEN); if (tokens.length < 2) { // train final token/tag pair trainOutcome(Tags.START_TOKEN,Tags.START_TAG, tags[0], tokens[0], Tags.START_TOKEN); return; } // train second token/tag pair w extensions trainOutcome(tokens[1],tags[1], tags[0], tokens[0],Tags.START_TOKEN); // train rest of pairs given for (int i = 2; i < tokens.length; ++i) trainOutcome(tokens[i],tags[i], tags[i-1], tokens[i-1],tokens[i-2]); // train final token/tag pair beyond the last trainOutcome(Tags.START_TOKEN, Tags.START_TAG, tags[tags.length-1], tokens[tokens.length-1], tokens[tokens.length-2]); } /** * Write a compiled version of this estimator to the specified * data output stream. Caller is responsible for closing the * stream. * * @param out Data output stream to which to write a compiled * version of this estimator. * @throws IOException If there is an exception in the writing to * data output stream. */ public void compileTo(ObjectOutput out) throws IOException { out.writeObject(new Externalizer(this)); } static class Externalizer extends AbstractExternalizable { private static final long serialVersionUID = 4179100933315980535L; final TrainableEstimator mEstimator; public Externalizer() { this(null); } public Externalizer(TrainableEstimator estimator) { mEstimator = estimator; } public Object read(ObjectInput in) throws ClassNotFoundException, IOException { return new CompiledEstimator(in); } public void writeExternal(ObjectOutput objOut) throws IOException { ((Compilable) mEstimator.mTokenCategorizer).compileTo(objOut); mEstimator.generateSymbols(); mEstimator.mTagSymbolTable.compileTo(objOut); mEstimator.mTokenSymbolTable.compileTo(objOut); mEstimator.writeEstimator(mEstimator.mRootTagNode,objOut); mEstimator.writeEstimator(mEstimator.mRootTokenNode,objOut); objOut.writeDouble(mEstimator.mLogUniformVocabEstimate); } } /** * Train the estimator for a specific token and tag outcome * given a context. If specified arguments are <code>null</code>, * only the non-<code>null</code> outcomes and contexts are * used for training. * * @param token Token outcome. * @param tag Tag outcome. * @param tagMinus1 Tag assigned to previous token. * @param tokenMinus1 Previous token. * @param tokenMinus2 Token occurring two tokens back. */ public void trainOutcome(String token, String tag, String tagMinus1, String tokenMinus1, String tokenMinus2) { mTagSymbolTable.addSymbol(tag); mTokenSymbolTable.addSymbol(token); String tagMinus1Interior = (tagMinus1 == null) ? null : Tags.toInnerTag(tagMinus1); trainTokenModel(token,tag,tagMinus1Interior,tokenMinus1); trainTagModel(tag,tagMinus1Interior,tokenMinus1,tokenMinus2); } /** * Generates the symbol tables from the trie structures * representing the counts. */ private void generateSymbols() { mRootTagNode.generateSymbols(); // mRootTagNode.printSymbols(); mRootTokenNode.generateSymbols(); // mRootTokenNode.printSymbols(); // make sure all token category symbols have token ids String[] tokenCategories = mTokenCategorizer.categories(); for (int i = 0; i < tokenCategories.length; ++i) mTokenSymbolTable.addSymbol(tokenCategories[i]); } /** * Train the token half of the model. If specified tags or * tokens are <code>null</code>, only the non-<code>null</code> * events and contexts are used for training. * * @param token Token outcome. * @param tag Tag outcome. * @param tagMinus1 Tag assigned to previous token. * @param tokenMinus1 Previous token.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -