📄 charlmhmmchunker.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
     * a dictionary.  They require regular training data in order     * to train the contexts in which dictionary items show up.     * Attempting to train with only a dictionary will lead to     * null pointer exceptions when attempting to decode.     *     * @param cSeq Character sequence on which to train.     * @param type Type of chunk.     */    public void trainDictionary(CharSequence cSeq, String type) {        char[] cs = Strings.toCharArray(cSeq);        Tokenizer tokenizer = getTokenizerFactory().tokenizer(cs,0,cs.length);        String[] tokens = tokenizer.tokenize();        if (tokens.length < 1) {            String msg = "Did not find any tokens in entry."                + "Char sequence=" + cSeq;            throw new IllegalArgumentException(msg);        }        AbstractHmmEstimator estimator = getHmmEstimator();        SymbolTable table = estimator.stateSymbolTable();        smoothBaseTag(type,table,estimator);        if (tokens.length == 1) {            estimator.trainEmit("W_" + type, tokens[0]);            return;        }        String initialTag = "B_" + type;        estimator.trainEmit(initialTag, tokens[0]);        String prevTag = initialTag;        for (int i = 1; i+1 < tokens.length; ++i) {            String tag = "M_" + type;            estimator.trainEmit(tag, tokens[i]);            estimator.trainTransit(prevTag,tag);            prevTag = tag;        }        String finalTag = "E_" + type;        estimator.trainEmit(finalTag, tokens[tokens.length-1]);        estimator.trainTransit(prevTag,finalTag);    }    /**     * Handle the specified chunking by tokenizing it, assigning tags     * and training the underlying hidden Markov model.  For a description     * of how chunkings are broken down into taggings, see the parent     * class documentation in {@link HmmChunker}.     *     * @param chunking Chunking to use for training.     
*/    public void handle(Chunking chunking) {        ChunkHandlerAdapter adapter            = new ChunkHandlerAdapter(this,                                      getTokenizerFactory(),                                      false); // could make errors class param        adapter.handle(chunking);    }    /**     * Handle the specified tokens, whitespaces and tags by using them     * (after conversion) to train the underlying hidden Markov model.     * The description of tag format is given in the class documentation     * above; this format is converted into the underlying format used     * by the underlying HMM as described in {@link HmmChunker}.     *     * @param tokens Array of tokens.     * @param whitespaces Array of whitespaces; unused and may be     * <code>null</code>.     * @param tags Array of tags in format described in class documentation.     * @throws IllegalArgumentException If the token and tag arrays are not     * the same length, or if the whitespaces array is non-null and not one     * longer than the array of tokens.     */    public void handle(String[] tokens, String[] whitespaces, String[] tags) {        getHmmEstimator().handle(tokens,whitespaces,trainNormalize(tags));        smoothTags(tags);    }    /**     * Compiles this model to the specified object output stream.  The     * model may then be read back in using {@link     * java.io.ObjectInput#readObject()}; the resulting object will be     * instance of {@link HmmChunker}.  See the class documentation     * above for information on setting the cache for a compiled     * model.     *     * @throws IOException If there is an I/O error during the write.     * @throws IllegalArgumentException If the tokenizer factory supplied to     * the constructor of this class is not compilable.     
*/
    public void compileTo(ObjectOutput objOut) throws IOException {
        if (!(mTokenizerFactory instanceof Compilable)) {
            // A null factory also fails the instanceof test; guard the
            // getClass() call so the documented IllegalArgumentException
            // is raised rather than a NullPointerException.
            Object foundClass = (mTokenizerFactory == null)
                ? null
                : mTokenizerFactory.getClass();
            String msg = "Tokenizer factory must implement class="
                + " com.aliasi.util.Compilable "
                + " Found class=" + foundClass;
            throw new IllegalArgumentException(msg);
        }
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Returns a string representation of the complete topology of the
     * underlying HMM with log2 transition probabilities.  Note that this
     * output does not represent the emission probabilities per category.
     *
     * <p>For every state in the expanded tag set (the four boundary
     * states plus the seven states derived from each tag in the tag
     * set) the output lists the start and end log2 probabilities
     * followed by the log2 transition probability to every state.
     * States are listed in sorted order.
     *
     * @return String-based representation of this chunker.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();

        // Sorted set of every state name to report on.
        java.util.Set<String> expandedTagSet = new java.util.TreeSet<String>();
        expandedTagSet.add("MM_O");
        expandedTagSet.add("WW_O_BOS");
        expandedTagSet.add("BB_O_BOS");
        expandedTagSet.add("EE_O_BOS");
        for (Object baseTagObj : mTagSet) {
            String x = baseTagObj.toString();
            expandedTagSet.add("B_" + x);
            expandedTagSet.add("M_" + x);
            expandedTagSet.add("E_" + x);
            expandedTagSet.add("W_" + x);
            expandedTagSet.add("BB_O_" + x);
            expandedTagSet.add("EE_O_" + x);
            expandedTagSet.add("WW_O_" + x);
        }

        for (String tag0 : expandedTagSet) {
            sb.append("\n");
            sb.append("start(").append(tag0).append(")=")
                .append(mHmmEstimator.startLog2Prob(tag0));
            sb.append("\n");
            sb.append("  end(").append(tag0).append(")=")
                .append(mHmmEstimator.endLog2Prob(tag0));
            sb.append("\n");
            for (String tag1 : expandedTagSet) {
                sb.append("trans(").append(tag0).append(",").append(tag1)
                    .append(")=")
                    .append(mHmmEstimator.transitLog2Prob(tag0,tag1));
                sb.append("\n");
            }
        }
        return sb.toString();
    }

    /**
     * Registers the four boundary states (<code>BB_O_BOS</code>,
     * <code>MM_O</code>, <code>EE_O_BOS</code>, <code>WW_O_BOS</code>)
     * in the estimator's state symbol table and trains the start, end
     * and transition events among them.
     */
    void smoothBoundaries() {
        // "BOS" is used for both begin and end; it is deliberately not
        // added to mTagSet here.
        AbstractHmmEstimator hmmEstimator = getHmmEstimator();
        SymbolTable table = hmmEstimator.stateSymbolTable();
        String bbO = "BB_O_BOS";
        String mmO = "MM_O";
        String eeO = "EE_O_BOS";
        String wwO = "WW_O_BOS";

        table.getOrAddSymbol(bbO);
        table.getOrAddSymbol(mmO);
        table.getOrAddSymbol(eeO);
        table.getOrAddSymbol(wwO);

        hmmEstimator.trainStart(bbO);
        hmmEstimator.trainStart(wwO);

        hmmEstimator.trainEnd(eeO);
        hmmEstimator.trainEnd(wwO);

        hmmEstimator.trainTransit(bbO,mmO);
        hmmEstimator.trainTransit(bbO,eeO);
        hmmEstimator.trainTransit(mmO,mmO);
        hmmEstimator.trainTransit(mmO,eeO);
    }

    /**
     * Smooths every tag in the specified array, provided tag smoothing
     * is enabled via <code>mSmoothTags</code>; otherwise does nothing.
     *
     * @param tags Tags whose base tags are to be smoothed.
     */
    void smoothTags(String[] tags) {
        if (!mSmoothTags) {
            return;
        }
        AbstractHmmEstimator hmmEstimator = getHmmEstimator();
        SymbolTable table = hmmEstimator.stateSymbolTable();
        for (String tag : tags) {
            smoothTag(tag,table,hmmEstimator);
        }
    }

    /**
     * Smooths the base tag of the specified tag, as computed by
     * {@link HmmChunker#baseTag(String)}.
     *
     * @param tag Tag whose base tag is smoothed.
     * @param table State symbol table of the estimator.
     * @param hmmEstimator Estimator to train.
     */
    void smoothTag(String tag, SymbolTable table,
                   AbstractHmmEstimator hmmEstimator) {
        smoothBaseTag(HmmChunker.baseTag(tag), table, hmmEstimator);
    }

    /**
     * Adds the specified base tag to the tag set and, if it was not
     * already present and is not <code>"O"</code>, registers its seven
     * derived states (<code>B_</code>, <code>M_</code>, <code>E_</code>,
     * <code>W_</code>, <code>BB_O_</code>, <code>EE_O_</code>,
     * <code>WW_O_</code>) in the symbol table and trains start, end and
     * transition events connecting them to each other, to the boundary
     * states, and to the states of every tag currently in the tag set
     * (including the new tag itself).
     *
     * @param baseTag Base tag (chunk type) to smooth.
     * @param table State symbol table of the estimator.
     * @param hmmEstimator Estimator to train.
     */
    void smoothBaseTag(String baseTag, SymbolTable table,
                       AbstractHmmEstimator hmmEstimator) {
        if (!mTagSet.add(baseTag)) return; // already added
        if ("O".equals(baseTag)) return;  // constructor + other tags smooth "O"

        String b_x = "B_" + baseTag;
        String m_x = "M_" + baseTag;
        String e_x = "E_" + baseTag;
        String w_x = "W_" + baseTag;

        String bb_o_x = "BB_O_" + baseTag;
        // "MM_O" carries no tag modifier; it is used as a constant below.
        String ee_o_x = "EE_O_" + baseTag;
        String ww_o_x = "WW_O_" + baseTag;

        table.getOrAddSymbol(b_x);
        table.getOrAddSymbol(m_x);
        table.getOrAddSymbol(e_x);
        table.getOrAddSymbol(w_x);

        table.getOrAddSymbol(bb_o_x);
        // "MM_O" is not added here (original note: added in constructor).
        table.getOrAddSymbol(ee_o_x);
        table.getOrAddSymbol(ww_o_x);

        // States inside a chunk of this type.
        hmmEstimator.trainStart(b_x);
        hmmEstimator.trainTransit(b_x,m_x);
        hmmEstimator.trainTransit(b_x,e_x);

        hmmEstimator.trainTransit(m_x,m_x);
        hmmEstimator.trainTransit(m_x,e_x);

        hmmEstimator.trainEnd(e_x);
        hmmEstimator.trainTransit(e_x,bb_o_x);

        hmmEstimator.trainStart(w_x);
        hmmEstimator.trainEnd(w_x);
        hmmEstimator.trainTransit(w_x,bb_o_x);

        // Out-states following / preceding a chunk of this type.
        hmmEstimator.trainTransit(bb_o_x,"MM_O");

        hmmEstimator.trainTransit("MM_O",ee_o_x); // handles all MM_O to ends

        hmmEstimator.trainTransit(ee_o_x,b_x);
        hmmEstimator.trainTransit(ee_o_x,w_x);

        hmmEstimator.trainStart(ww_o_x);
        hmmEstimator.trainTransit(ww_o_x,b_x);
        hmmEstimator.trainTransit(ww_o_x,w_x);

        // Links to the boundary (BOS) states.
        hmmEstimator.trainTransit(e_x,"WW_O_BOS");
        hmmEstimator.trainTransit(w_x,"WW_O_BOS");
        hmmEstimator.trainTransit(bb_o_x,"EE_O_BOS");
        hmmEstimator.trainTransit("BB_O_BOS",ee_o_x);

        // Cross-type links in both directions between this type (x) and
        // every type (y) in the tag set; baseTag was added above, so the
        // loop also pairs the new type with itself.
        for (Object typeObj : mTagSet) {
            String type = typeObj.toString();
            if ("O".equals(type)) continue;
            if ("BOS".equals(type)) continue;
            String bb_o_y = "BB_O_" + type;
            String ww_o_y = "WW_O_" + type;
            String ee_o_y = "EE_O_" + type;
            String b_y = "B_" + type;
            String w_y = "W_" + type;
            String e_y = "E_" + type;
            hmmEstimator.trainTransit(e_x,ww_o_y);
            hmmEstimator.trainTransit(e_x,b_y);
            hmmEstimator.trainTransit(e_x,w_y);
            hmmEstimator.trainTransit(w_x,ww_o_y);
            hmmEstimator.trainTransit(w_x,b_y);
            hmmEstimator.trainTransit(w_x,w_y);
            hmmEstimator.trainTransit(e_y,b_x);
            hmmEstimator.trainTransit(e_y,w_x);
            hmmEstimator.trainTransit(e_y,ww_o_x);
            hmmEstimator.trainTransit(w_y,b_x);
            hmmEstimator.trainTransit(w_y,w_x);
            hmmEstimator.trainTransit(w_y,ww_o_x);
            hmmEstimator.trainTransit(bb_o_x,ee_o_y);
            hmmEstimator.trainTransit(bb_o_y,ee_o_x);
        }
    }

    /**
     * Object written by <code>compileTo</code>.  Writing compiles the
     * chunker's tokenizer factory and then its HMM estimator to the
     * output; reading takes a tokenizer factory and a hidden Markov
     * model from the input, in that order, and returns an
     * {@link HmmChunker} built from them.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 4630707998932521821L;
        final CharLmHmmChunker mChunker;

        /**
         * Constructs an externalizer with no chunker.
         * NOTE(review): presumably required for deserialization; confirm
         * against AbstractExternalizable.
         */
        public Externalizer() {
            this(null);
        }

        /**
         * Constructs an externalizer for the specified chunker.
         *
         * @param chunker Chunker to be written.
         */
        public Externalizer(CharLmHmmChunker chunker) {
            mChunker = chunker;
        }

        /**
         * Reads a tokenizer factory followed by a hidden Markov model and
         * returns an {@link HmmChunker} decoding with that model.
         *
         * @param in Input from which to read.
         * @return The reconstituted chunker.
         * @throws ClassNotFoundException If a class read from the input
         * cannot be found.
         * @throws IOException If there is an I/O error during the read.
         */
        public Object read(ObjectInput in)
            throws ClassNotFoundException, IOException {

            TokenizerFactory tokenizerFactory
                = (TokenizerFactory) in.readObject();
            HiddenMarkovModel hmm
                = (HiddenMarkovModel) in.readObject();
            HmmDecoder decoder = new HmmDecoder(hmm);
            return new HmmChunker(tokenizerFactory,decoder);
        }

        /**
         * Compiles the chunker's tokenizer factory and then its HMM
         * estimator to the specified output.
         *
         * @param objOut Output to which the model is written.
         * @throws IOException If there is an I/O error during the write.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            // compileTo has already verified the factory is Compilable.
            Compilable tokenizerFactory
                = (Compilable) mChunker.getTokenizerFactory();
            tokenizerFactory.compileTo(objOut);
            mChunker.getHmmEstimator().compileTo(objOut);
        }
    }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -