CharLmRescoringChunker.java
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.chunk;

import com.aliasi.corpus.ChunkHandler;
import com.aliasi.corpus.TagHandler;
import com.aliasi.corpus.ChunkTagHandlerAdapter;

import com.aliasi.hmm.HmmCharLmEstimator;

import com.aliasi.lm.LanguageModel;
import com.aliasi.lm.NGramBoundaryLM;
import com.aliasi.lm.NGramProcessLM;

import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

import java.util.HashMap;
import java.util.Iterator;

/**
 * A <code>CharLmRescoringChunker</code> provides a long-distance
 * character language model-based chunker that operates by rescoring
 * the output of a contained character language model HMM chunker.
 *
 * <h3>The Underlying Chunker</h3>
 *
 * <p>This model performs rescoring over an underlying chunker.  The
 * underlying chunker is an instance of {@link CharLmHmmChunker},
 * configured with the tokenizer factory, n-gram length, number of
 * characters, and interpolation ratio provided in the constructor.
 * The underlying chunker may be configured after retrieving it
 * through the superclass's {@link #baseChunker()} method.  The
 * typical use of this is to configure caching.
 *
 * <h3>The Rescoring Model</h3>
 *
 * <p>The rescoring model used by this chunker is based on a bounded
 * character language model per chunk type, with an additional
 * process character language model for text not in chunks.  The
 * remaining details are described in the class documentation for the
 * superclass {@link AbstractCharLmRescoringChunker}.
 *
 * <h3>Training and Compilation</h3>
 *
 * <p>This chunker is trained in the usual way through calls to the
 * appropriate <code>handle()</code> method.  The method {@link
 * #handle(Chunking)} implements the {@link ChunkHandler} interface
 * and allows training through chunking examples.  The method {@link
 * #handle(String[],String[],String[])} implements the {@link
 * TagHandler} interface, allowing training through BIO-encoded chunk
 * taggings.  A model is compiled by calling the {@link Compilable}
 * interface method {@link #compileTo(ObjectOutput)}.  The compiled
 * model is an instance of <code>AbstractCharLmRescoringChunker</code>,
 * and its underlying chunker may be recovered from it.
 *
 * <h3>Runtime Configuration</h3>
 *
 * <p>The underlying chunker is recoverable as a character language
 * model HMM chunker through {@link #baseChunker()}.  The non-chunk
 * process n-gram character language model is returned by {@link
 * #outLM()}, whereas the chunk models are returned by {@link
 * #chunkLM(String)}.
 *
 * <p>The components of a character LM rescoring chunker are
 * accessible in their training format through methods on this class,
 * as described above.
 *
 * <p>The compiled models are instances of {@link RescoringChunker},
 * which allow their underlying chunker to be retrieved through
 * {@link #baseChunker()} and then configured.
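 *
 * <p>As an illustration only (this sketch is not from the original
 * documentation; the tokenizer factory choice, the parameter values,
 * the training <code>chunking</code>, and <code>modelFile</code> are
 * all assumptions for the example), a chunker might be constructed,
 * trained, and compiled as follows:
 *
 * <pre>
 * TokenizerFactory factory
 *     = new IndoEuropeanTokenizerFactory();
 * CharLmRescoringChunker chunker
 *     = new CharLmRescoringChunker(factory,
 *                                  64,    // chunkings rescored
 *                                  5,     // n-gram length
 *                                  128,   // num characters
 *                                  5.0);  // interpolation ratio
 * chunker.handle(chunking);               // a training Chunking
 * AbstractExternalizable.compileTo(chunker,modelFile);
 * </pre>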
 *
 * <p>The other run-time models may be retrieved through the
 * superclass's accessors {@link #outLM()} and {@link
 * #chunkLM(String)}.
 *
 * <h3>Reserved Tag</h3>
 *
 * <p>The tag <code>BOS</code> is reserved for use by the system for
 * encoding document start/end positions.  See {@link HmmChunker} for
 * more information.
 *
 * @author  Bob Carpenter
 * @version 3.0
 * @since   LingPipe 2.3
 */
public class CharLmRescoringChunker
    extends AbstractCharLmRescoringChunker<CharLmHmmChunker,
                                           NGramProcessLM,
                                           NGramBoundaryLM>
    implements ChunkHandler, Compilable, TagHandler {

    final int mNGram;
    final int mNumChars;
    final double mInterpolationRatio;

    char mNextCodeChar = (char) (BOS_CHAR - 2);

    /**
     * Construct a character language model rescoring chunker based
     * on the specified components.  Tags in the underlying model are
     * not smoothed by default (see the full constructor's
     * documentation: {@link
     * #CharLmRescoringChunker(TokenizerFactory,int,int,int,double,boolean)}).
     *
     * @param tokenizerFactory Tokenizer factory for boundaries.
     * @param numChunkingsRescored Number of underlying chunkings rescored.
     * @param nGram N-gram length for all models.
     * @param numChars Number of characters in the training and
     * run-time character sets.
     * @param interpolationRatio Interpolation ratio for the
     * underlying language models.
     */
    public CharLmRescoringChunker(TokenizerFactory tokenizerFactory,
                                  int numChunkingsRescored,
                                  int nGram,
                                  int numChars,
                                  double interpolationRatio) {
        super(new CharLmHmmChunker(tokenizerFactory,
                                   new HmmCharLmEstimator(nGram,
                                                          numChars,
                                                          interpolationRatio)),
              numChunkingsRescored,
              new NGramProcessLM(nGram,numChars,interpolationRatio),
              new HashMap<String,Character>(),
              new HashMap<String,NGramBoundaryLM>());
        mNGram = nGram;
        mNumChars = numChars;
        mInterpolationRatio = interpolationRatio;
    }

    /**
     * Construct a character language model rescoring chunker based
     * on the specified components.
     *
     * <p>Whether tags are smoothed in the underlying model is
     * determined by the flag in the constructor.  See {@link
     * CharLmHmmChunker}'s class documentation for more information
     * on the effects of smoothing.
     *
     * @param tokenizerFactory Tokenizer factory for boundaries.
     * @param numChunkingsRescored Number of underlying chunkings rescored.
     * @param nGram N-gram length for all models.
     * @param numChars Number of characters in the training and
     * run-time character sets.
     * @param interpolationRatio Interpolation ratio for the
     * underlying language models.
     * @param smoothTags Set to <code>true</code> to smooth tags in
     * the underlying chunker.
     */
    public CharLmRescoringChunker(TokenizerFactory tokenizerFactory,
                                  int numChunkingsRescored,
                                  int nGram,
                                  int numChars,
                                  double interpolationRatio,
                                  boolean smoothTags) {
        super(new CharLmHmmChunker(tokenizerFactory,
                                   new HmmCharLmEstimator(nGram,
                                                          numChars,
                                                          interpolationRatio),
                                   smoothTags),
              numChunkingsRescored,
              new NGramProcessLM(nGram,numChars,interpolationRatio),
              new HashMap<String,Character>(),
              new HashMap<String,NGramBoundaryLM>());
        mNGram = nGram;
        mNumChars = numChars;
        mInterpolationRatio = interpolationRatio;
    }

    /**
     * Trains this chunker with the specified chunking.
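     *
     * <p>A sketch of building a training example (the sentence and
     * entity spans are invented for illustration; {@link
     * ChunkingImpl} and {@link ChunkFactory} are this package's
     * standard chunk implementations):
     *
     * <pre>
     * ChunkingImpl chunking
     *     = new ChunkingImpl("Washington visited London.");
     * chunking.add(ChunkFactory.createChunk(0,10,"PERSON"));
     * chunking.add(ChunkFactory.createChunk(19,25,"LOCATION"));
     * chunker.handle(chunking);
     * </pre>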
     *
     * @param chunking Training data.
     */
    public void handle(Chunking chunking) {
        // train underlying
        baseChunker().handle(chunking);

        // train rescorer
        String text = chunking.charSequence().toString();
        Iterator chunkIt = orderedIterator(chunking);
        char prevTagChar = BOS_CHAR;
        int pos = 0;
        while (chunkIt.hasNext()) {
            Chunk chunk = (Chunk) chunkIt.next();
            int start = chunk.start();
            int end = chunk.end();
            String chunkType = chunk.type();
            createTypeIfNecessary(chunkType);
            char tagChar = typeToChar(chunkType);
            trainOutLM(text.substring(pos,start),
                       prevTagChar,tagChar);
            trainTypeLM(chunkType,text.substring(start,end));
            pos = end;
            prevTagChar = tagChar;
        }
        trainOutLM(text.substring(pos),
                   prevTagChar,EOS_CHAR);
    }

    /**
     * Compiles this model to the specified object output stream.
     * The model may then be read back in using {@link
     * java.io.ObjectInput#readObject()}; the resulting object will
     * be an instance of {@link AbstractCharLmRescoringChunker}.
     *
     * @param objOut Object output stream to which the model is written.
     * @throws IOException If there is an I/O error during the write.
     * @throws IllegalArgumentException If the tokenizer factory
     * supplied to the constructor of this class is not compilable.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Trains this chunker with the specified BIO-encoded chunk
     * tagging.  For information on the external BIO format, as well
     * as the internal tagging format, see {@link HmmChunker}.
     *
     * @param toks Tokens of training data.
     * @param whitespaces Whitespaces in training data.
     * @param tags Tags for training data.
     */
    public void handle(String[] toks, String[] whitespaces, String[] tags) {
        TagHandler handler = new ChunkTagHandlerAdapter(this);
        handler.handle(toks,whitespaces,tags);
    }

    /**
     * Provides the specified character sequence data as training
     * data for the language model of the specified type.  This
     * method calls the method of the same signature on the trainable
     * base chunker.  The language model for the specified type will
     * be created if it has not been seen previously.
     *
     * <p><i>Warning:</i> It is not sufficient to train a model using
     * only this method.  Annotated data with a representative
     * balance of entities and non-entity text is required to train
     * the overall likelihood of entities and the contexts in which
     * they occur.  Use of this method will <i>not</i> bias the
     * likelihoods of entities occurring, but it might cause the
     * common entities in the training data to be overwhelmed if a
     * large dictionary is used.  One possibility is to train on the
     * basic data multiple times relative to the dictionary (or
     * vice-versa).
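     *
     * <p>For instance (an invented sketch, not from the original
     * documentation; the names and the <code>PERSON</code> type are
     * assumptions, and <code>chunker</code> is an instance of this
     * class):
     *
     * <pre>
     * String[] names = { "John Smith", "Jane Doe" };
     * for (int i = 0; i &lt; names.length; ++i)
     *     chunker.trainDictionary(names[i],"PERSON");
     * </pre>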
     *
     * @param cSeq Character sequence for training.
     * @param type Type of character sequence.
     */
    public void trainDictionary(CharSequence cSeq, String type) {
        baseChunker().trainDictionary(cSeq,type);
        trainTypeLM(type,cSeq);
    }

    /**
     * Trains the language model for non-entities using the specified
     * character sequence.
     *
     * <p><i>Warning</i>: Training using this method biases the
     * likelihood of entities downward, because it does not train the
     * likelihood of a non-entity character sequence ending and being
     * followed by an entity of a specified type.  Thus this method
     * is best used to seed a dictionary of common words that are
     * relatively few in number relative to the entity-annotated
     * training data.
     *
     * @param cSeq Data to train the non-entity (out) model.
     */
    public void trainOut(CharSequence cSeq) {
        outLM().train(cSeq);
    }

    void createTypeIfNecessary(String chunkType) {
        if (mTypeToChar.containsKey(chunkType)) return;
        Character c = new Character(mNextCodeChar--);
        mTypeToChar.put(chunkType,c);
        NGramBoundaryLM lm
            = new NGramBoundaryLM(mNGram,mNumChars,mInterpolationRatio,
                                  (char) 0xFFFF);
        mTypeToLM.put(chunkType,lm);
    }

    void trainTypeLM(String type, CharSequence text) {
        createTypeIfNecessary(type);
        NGramBoundaryLM lm = (NGramBoundaryLM) mTypeToLM.get(type);
        lm.train(text);
    }

    void trainOutLM(String text,
                    char prevTagChar, char nextTagChar) {
        String trainSeq = prevTagChar + text + nextTagChar;
        outLM().train(trainSeq);
        outLM().substringCounter().decrementUnigram(prevTagChar);
    }

    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 3555143657918695241L;
        final CharLmRescoringChunker mChunker;
        public Externalizer() {
            this(null);
        }
        public Externalizer(CharLmRescoringChunker chunker) {
            mChunker = chunker;
        }
        // serialized format:
        //     baseChunker:NBestChunker (compiled)
        //     numChunkingsRescored:int
        //     numTypes:int
        //     (type:UTF, codeChar:char,
        //      ngramBoundaryLM:LanguageModel.Sequence)**numTypes
        //     outLM:LanguageModel.Process
        public void writeExternal(ObjectOutput objOut) throws IOException {
            mChunker.baseChunker().compileTo(objOut);
            objOut.writeInt(mChunker.numChunkingsRescored());
            String[] types
                = (String[]) mChunker.mTypeToLM.keySet()
                                     .toArray(new String[0]);
            objOut.writeInt(types.length);
            for (int i = 0; i < types.length; ++i) {
                objOut.writeUTF(types[i]);
                objOut.writeChar(mChunker.typeToChar(types[i]));
                NGramBoundaryLM lm
                    = (NGramBoundaryLM) mChunker.mTypeToLM.get(types[i]);
                lm.compileTo(objOut);
            }
            mChunker.outLM().compileTo(objOut);
        }
        public Object read(ObjectInput in)
            throws ClassNotFoundException, IOException {

            NBestChunker baseChunker = (NBestChunker) in.readObject();
            int numChunkingsRescored = in.readInt();
            int numTypes = in.readInt();
            HashMap typeToChar = new HashMap();
            HashMap typeToLM = new HashMap();
            for (int i = 0; i < numTypes; ++i) {
                String type = in.readUTF();
                char c = in.readChar();
                LanguageModel.Sequence lm
                    = (LanguageModel.Sequence) in.readObject();
                typeToChar.put(type,new Character(c));
                typeToLM.put(type,lm);
            }
            LanguageModel.Process outLM
                = (LanguageModel.Process) in.readObject();
            return new AbstractCharLmRescoringChunker(baseChunker,
                                                      numChunkingsRescored,
                                                      outLM,
                                                      typeToChar,
                                                      typeToLM);
        }
    }

}
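// Illustration only (not part of the original source): a model compiled
// via compileTo() may be read back from a file and applied to text.  The
// model file, input sentence, and use of the Chunker interface for the
// compiled object are assumptions for this sketch.
//
//     Chunker compiled
//         = (Chunker) AbstractExternalizable.readObject(modelFile);
//     Chunking result = compiled.chunk("Washington visited London.");
//     for (Chunk chunk : result.chunkSet())
//         System.out.println(chunk.type() + " " + chunk.start()
//                            + "-" + chunk.end());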