📄 charlmhmmchunker.java
字号:
* a dictionary. They require regular training data in order * to train the contexts in which dictionary items show up. * Attempting to train with only a dictionary will lead to * null pointer exceptions when attempting to decode. * * @param cSeq Character sequence on which to train. * @param type Type of chunk. */ public void trainDictionary(CharSequence cSeq, String type) { char[] cs = Strings.toCharArray(cSeq); Tokenizer tokenizer = getTokenizerFactory().tokenizer(cs,0,cs.length); String[] tokens = tokenizer.tokenize(); if (tokens.length < 1) { String msg = "Did not find any tokens in entry." + "Char sequence=" + cSeq; throw new IllegalArgumentException(msg); } AbstractHmmEstimator estimator = getHmmEstimator(); SymbolTable table = estimator.stateSymbolTable(); smoothBaseTag(type,table,estimator); if (tokens.length == 1) { estimator.trainEmit("W_" + type, tokens[0]); return; } String initialTag = "B_" + type; estimator.trainEmit(initialTag, tokens[0]); String prevTag = initialTag; for (int i = 1; i+1 < tokens.length; ++i) { String tag = "M_" + type; estimator.trainEmit(tag, tokens[i]); estimator.trainTransit(prevTag,tag); prevTag = tag; } String finalTag = "E_" + type; estimator.trainEmit(finalTag, tokens[tokens.length-1]); estimator.trainTransit(prevTag,finalTag); } /** * Handle the specified chunking by tokenizing it, assigning tags * and training the underlying hidden Markov model. For a description * of how chunkings are broken down into taggings, see the parent * class documentation in {@link HmmChunker}. * * @param chunking Chunking to use for training. */ public void handle(Chunking chunking) { ChunkHandlerAdapter adapter = new ChunkHandlerAdapter(this, getTokenizerFactory(), false); // could make errors class param adapter.handle(chunking); } /** * Handle the specified tokens, whitespaces and tags by using them * (after conversion) to train the underlying hidden Markov model. 
* The description of tag format is given in the class documentation * above; this format is converted into the underlying format used * by the underlying HMM as described in {@link HmmChunker}. * * @param tokens Array of tokens. * @param whitespaces Array of whitespaces; unused and may be * <code>null</code>. * @param tags Array of tags in format described in class documentation. * @throws IllegalArgumentException If the token and tag arrays are not * the same length, or if the whitespaces array is non-null and not one * longer than the array of tokens. */ public void handle(String[] tokens, String[] whitespaces, String[] tags) { getHmmEstimator().handle(tokens,whitespaces,trainNormalize(tags)); smoothTags(tags); } /** * Compiles this model to the specified object output stream. The * model may then be read back in using {@link * java.io.ObjectInput#readObject()}; the resulting object will be * instance of {@link HmmChunker}. See the class documentation * above for information on setting the cache for a compiled * model. * * @throws IOException If there is an I/O error during the write. * @throws IllegalArgumentException If the tokenizer factory supplied to * the constructor of this class is not compilable. */ public void compileTo(ObjectOutput objOut) throws IOException { if (!(mTokenizerFactory instanceof Compilable)) { String msg = "Tokenizer factory must implement class=" + " com.aliasi.util.Compilable " + " Found class=" + mTokenizerFactory.getClass(); throw new IllegalArgumentException(msg); } objOut.writeObject(new Externalizer(this)); } /** * Returns a string representation of the complete topology of the * underlying HMM with log2 transition probabilities. Note that this * output does not represent the emission probabilities per category. * * @return String-based representation of this chunker. 
*/
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();

        // Sorted set of every state name to report: the fixed boundary
        // states plus the states derived from each base tag seen so far.
        java.util.Set<String> expandedTagSet = new java.util.TreeSet<String>();
        expandedTagSet.add("MM_O");
        expandedTagSet.add("WW_O_BOS");
        expandedTagSet.add("BB_O_BOS");
        expandedTagSet.add("EE_O_BOS");
        for (Object baseTag : mTagSet) {
            String x = baseTag.toString();
            expandedTagSet.add("B_" + x);
            expandedTagSet.add("M_" + x);
            expandedTagSet.add("E_" + x);
            expandedTagSet.add("W_" + x);
            expandedTagSet.add("BB_O_" + x);
            expandedTagSet.add("EE_O_" + x);
            expandedTagSet.add("WW_O_" + x);
        }

        // One start line, one end line, then one line per outgoing transition.
        for (String tag0 : expandedTagSet) {
            sb.append("\n");
            sb.append("start(").append(tag0).append(")=")
                .append(mHmmEstimator.startLog2Prob(tag0));
            sb.append("\n");
            sb.append(" end(").append(tag0).append(")=")
                .append(mHmmEstimator.endLog2Prob(tag0));
            sb.append("\n");
            for (String tag1 : expandedTagSet) {
                sb.append("trans(").append(tag0).append(",").append(tag1)
                    .append(")=")
                    .append(mHmmEstimator.transitLog2Prob(tag0,tag1));
                sb.append("\n");
            }
        }
        return sb.toString();
    }

    /**
     * Adds the four boundary states (BB_O_BOS, MM_O, EE_O_BOS, WW_O_BOS)
     * to the state symbol table and trains one start, end or transition
     * count for each of the boundary configurations listed below.
     */
    void smoothBoundaries() {
        // mTagSet.add("BOS"); // BOS used for begin and end
        AbstractHmmEstimator hmmEstimator = getHmmEstimator();
        SymbolTable table = hmmEstimator.stateSymbolTable();

        String bbO = "BB_O_BOS";
        String mmO = "MM_O";
        String eeO = "EE_O_BOS";
        String wwO = "WW_O_BOS";

        // Symbols are registered in this fixed order.
        table.getOrAddSymbol(bbO);
        table.getOrAddSymbol(mmO);
        table.getOrAddSymbol(eeO);
        table.getOrAddSymbol(wwO);

        hmmEstimator.trainStart(bbO);
        hmmEstimator.trainStart(wwO);

        hmmEstimator.trainEnd(eeO);
        hmmEstimator.trainEnd(wwO);

        hmmEstimator.trainTransit(bbO,mmO);
        hmmEstimator.trainTransit(bbO,eeO);
        hmmEstimator.trainTransit(mmO,mmO);
        hmmEstimator.trainTransit(mmO,eeO);
    }

    /**
     * Smooths every tag in the specified array, provided tag smoothing
     * is enabled for this chunker; otherwise does nothing.
     *
     * @param tags Tags to smooth.
     */
    void smoothTags(String[] tags) {
        if (!mSmoothTags) {
            return;
        }
        AbstractHmmEstimator hmmEstimator = getHmmEstimator();
        SymbolTable table = hmmEstimator.stateSymbolTable();
        for (String tag : tags) {
            smoothTag(tag,table,hmmEstimator);
        }
    }

    /**
     * Smooths the base tag that {@link HmmChunker#baseTag(String)}
     * derives from the specified tag.
     *
     * @param tag Tag whose base tag is smoothed.
     * @param table State symbol table of the estimator.
     * @param hmmEstimator Estimator receiving the smoothing counts.
     */
    void smoothTag(String tag, SymbolTable table,
                   AbstractHmmEstimator hmmEstimator) {
        smoothBaseTag(HmmChunker.baseTag(tag), table, hmmEstimator);
    }

    /**
     * Registers the states derived from the specified base tag and trains
     * one count for each start, end and transition listed below, including
     * the transitions between the new tag's states and those of every tag
     * already recorded.  Does nothing beyond recording the tag if the tag
     * was already recorded or is <code>"O"</code>.
     *
     * @param baseTag Base tag (chunk type) to smooth.
     * @param table State symbol table of the estimator.
     * @param hmmEstimator Estimator receiving the smoothing counts.
     */
    void smoothBaseTag(String baseTag, SymbolTable table,
                       AbstractHmmEstimator hmmEstimator) {
        if (!mTagSet.add(baseTag)) return; // already added
        if ("O".equals(baseTag)) return; // constructor + other tags smooth "O"

        String b_x = "B_" + baseTag;
        String m_x = "M_" + baseTag;
        String e_x = "E_" + baseTag;
        String w_x = "W_" + baseTag;

        String bb_o_x = "BB_O_" + baseTag;
        // String mm_o = "MM_O"; // no tag modifier, just constant
        String ee_o_x = "EE_O_" + baseTag;
        String ww_o_x = "WW_O_" + baseTag;

        // Symbols are registered in this fixed order.
        table.getOrAddSymbol(b_x);
        table.getOrAddSymbol(m_x);
        table.getOrAddSymbol(e_x);
        table.getOrAddSymbol(w_x);

        table.getOrAddSymbol(bb_o_x);
        // table.getOrAddSymbol("MM_O"); // in constructor
        table.getOrAddSymbol(ee_o_x);
        table.getOrAddSymbol(ww_o_x);

        hmmEstimator.trainStart(b_x);
        hmmEstimator.trainTransit(b_x,m_x);
        hmmEstimator.trainTransit(b_x,e_x);

        hmmEstimator.trainTransit(m_x,m_x);
        hmmEstimator.trainTransit(m_x,e_x);

        hmmEstimator.trainEnd(e_x);
        hmmEstimator.trainTransit(e_x,bb_o_x);

        hmmEstimator.trainStart(w_x);
        hmmEstimator.trainEnd(w_x);
        hmmEstimator.trainTransit(w_x,bb_o_x);

        hmmEstimator.trainTransit(bb_o_x,"MM_O");

        hmmEstimator.trainTransit("MM_O",ee_o_x); // handles all MM_O to ends

        hmmEstimator.trainTransit(ee_o_x,b_x);
        hmmEstimator.trainTransit(ee_o_x,w_x);

        hmmEstimator.trainStart(ww_o_x);
        hmmEstimator.trainTransit(ww_o_x,b_x);
        hmmEstimator.trainTransit(ww_o_x,w_x);

        hmmEstimator.trainTransit(e_x,"WW_O_BOS");
        hmmEstimator.trainTransit(w_x,"WW_O_BOS");
        hmmEstimator.trainTransit(bb_o_x,"EE_O_BOS");
        hmmEstimator.trainTransit("BB_O_BOS",ee_o_x);

        // Cross-type transitions against every recorded tag.  The set
        // already contains baseTag at this point, so the new tag is also
        // paired with itself.
        for (Object typeObj : mTagSet) {
            String type = typeObj.toString();
            if ("O".equals(type)) continue;
            if ("BOS".equals(type)) continue;

            String bb_o_y = "BB_O_" + type;
            String ww_o_y = "WW_O_" + type;
            String ee_o_y = "EE_O_" + type;
            String b_y = "B_" + type;
            String w_y = "W_" + type;
            String e_y = "E_" + type;

            hmmEstimator.trainTransit(e_x,ww_o_y);
            hmmEstimator.trainTransit(e_x,b_y);
            hmmEstimator.trainTransit(e_x,w_y);

            hmmEstimator.trainTransit(w_x,ww_o_y);
            hmmEstimator.trainTransit(w_x,b_y);
            hmmEstimator.trainTransit(w_x,w_y);

            hmmEstimator.trainTransit(e_y,b_x);
            hmmEstimator.trainTransit(e_y,w_x);
            hmmEstimator.trainTransit(e_y,ww_o_x);

            hmmEstimator.trainTransit(w_y,b_x);
            hmmEstimator.trainTransit(w_y,w_x);
            hmmEstimator.trainTransit(w_y,ww_o_x);

            hmmEstimator.trainTransit(bb_o_x,ee_o_y);
            hmmEstimator.trainTransit(bb_o_y,ee_o_x);
        }
    }

    /**
     * Serialization helper that writes the chunker's tokenizer factory
     * followed by its HMM estimator, and reads them back in that order
     * to build an {@link HmmChunker}.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 4630707998932521821L;

        /** Chunker to write; <code>null</code> when built with no arguments. */
        final CharLmHmmChunker mChunker;

        // NOTE(review): the no-arg constructor is presumably the one used
        // when reading an instance back in -- confirm against
        // AbstractExternalizable.
        public Externalizer() {
            this(null);
        }

        public Externalizer(CharLmHmmChunker chunker) {
            mChunker = chunker;
        }

        /**
         * Reads a tokenizer factory and then a hidden Markov model, and
         * returns an {@link HmmChunker} decoding with that model.
         *
         * @param in Object input from which to read.
         * @return The reconstituted chunker.
         * @throws ClassNotFoundException If a class read from the input
         * cannot be found.
         * @throws IOException If there is an I/O error during the read.
         */
        public Object read(ObjectInput in)
            throws ClassNotFoundException, IOException {

            // Read order must mirror the write order in writeExternal.
            TokenizerFactory tokenizerFactory
                = (TokenizerFactory) in.readObject();
            HiddenMarkovModel hmm
                = (HiddenMarkovModel) in.readObject();
            HmmDecoder decoder = new HmmDecoder(hmm);
            return new HmmChunker(tokenizerFactory,decoder);
        }

        /**
         * Compiles the tokenizer factory and then the HMM estimator of
         * the wrapped chunker to the specified output.
         *
         * @param objOut Object output to which to write.
         * @throws IOException If there is an I/O error during the write.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            Compilable tokenizerFactory
                = (Compilable) mChunker.getTokenizerFactory();
            tokenizerFactory.compileTo(objOut);
            mChunker.getHmmEstimator().compileTo(objOut);
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -