crfextractor.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 247 行

JAVA
247
字号
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.base.extract;import edu.umass.cs.mallet.base.fst.CRF4;import edu.umass.cs.mallet.base.pipe.Noop;import edu.umass.cs.mallet.base.pipe.Pipe;import edu.umass.cs.mallet.base.pipe.SerialPipes;import edu.umass.cs.mallet.base.pipe.iterator.InstanceListIterator;import edu.umass.cs.mallet.base.pipe.iterator.PipeInputIterator;import edu.umass.cs.mallet.base.types.*;import java.io.*;import java.util.ArrayList;/** * Created: Oct 12, 2004 * * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A> * @version $Id: CRFExtractor.java,v 1.12 2005/11/08 23:01:01 culotta Exp $ */public class CRFExtractor implements Extractor {  private CRF4 crf;  private Pipe tokenizationPipe;  private Pipe featurePipe;  private String backgroundTag;  private TokenizationFilter filter;  public CRFExtractor (CRF4 crf) {    this (crf, new Noop ());  }  public CRFExtractor (File crfFile) throws IOException {    this (loadCrf(crfFile), new Noop ());  }  public CRFExtractor (CRF4 crf, Pipe tokpipe) {    this (crf, tokpipe, new BIOTokenizationFilter ());  }  public CRFExtractor (CRF4 crf, Pipe tokpipe, TokenizationFilter filter) {    this (crf, tokpipe, filter, "O");  }  public CRFExtractor (CRF4 crf, Pipe tokpipe, TokenizationFilter filter, String backgroundTag) {    this.crf = crf;    tokenizationPipe = tokpipe;    featurePipe = crf.getInputPipe ();    this.filter = filter;    this.backgroundTag = backgroundTag;   }  private static CRF4 loadCrf (File crfFile) throws IOException  {     ObjectInputStream ois = new ObjectInputStream( new FileInputStream( crfFile ) );    CRF4 crf = null;    // We shouldn't run into a ClassNotFound exception...    try {      crf = (CRF4)ois.readObject();    } catch (ClassNotFoundException e) {      System.err.println ("Internal MALLET error: Could not read CRF from file "+crfFile+"\n"+e);      e.printStackTrace ();      throw new RuntimeException (e);    }    ois.close();    return crf;  }  public Extraction extract (Object o)  {    // I don't think there's a polymorphic way to do this. b/c Java sucks. -cas    if (o instanceof Tokenization) {      return extract ((Tokenization) o);    } else  {      return extract (doTokenize (o));    }  }  private Tokenization doTokenize (Object obj)  {    Instance toked = new Instance (obj, null, null, null);    tokenizationPipe.pipe (toked);    return (Tokenization) toked.getData ();  }  public Extraction extract (Tokenization spans)  {    // We assume the input is unpiped.    Instance carrier = new Instance (spans, null, null, null, featurePipe);    Sequence output = crf.transduce ((Sequence) carrier.getData ());    Extraction extraction = new Extraction (this, getTargetAlphabet());    DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(),                                                         spans,                                                        output, null, backgroundTag,                                                        filter);    extraction.addDocumentExtraction (docseq);    return extraction;  }  public InstanceList pipeInstances (PipeInputIterator source)  {    // I think that pipes should be associated neither with InstanceLists, nor    //  with Instances. -cas    InstanceList toked = new InstanceList (tokenizationPipe);    toked.add (source);    InstanceList piped = new InstanceList (getFeaturePipe ());    piped.add (new InstanceListIterator (toked));    return piped;  }  public Extraction extract (PipeInputIterator source)  {    Extraction extraction = new Extraction (this, getTargetAlphabet ());    // Put all the instances through both pipes, then get viterbi path    InstanceList tokedList = new InstanceList (tokenizationPipe);    tokedList.add (source);    InstanceList pipedList = new InstanceList (getFeaturePipe ());    pipedList.add (new InstanceListIterator (tokedList));    InstanceList.Iterator it1 = tokedList.iterator ();    InstanceList.Iterator it2 = pipedList.iterator ();    while (it1.hasNext()) {      Instance toked = it1.nextInstance();      Instance piped = it2.nextInstance ();      Tokenization tok = (Tokenization) toked.getData();      String name = piped.getName().toString();      Sequence input = (Sequence) piped.getData ();      Sequence target = (Sequence) piped.getTarget ();      Sequence output = crf.transduce (input);      DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok,                                                          output, target, backgroundTag,                                                          filter);      extraction.addDocumentExtraction (docseq);    }    return extraction;  }  public Pipe getTokenizationPipe ()  {    return tokenizationPipe;  }  public void setTokenizationPipe (Pipe tokenizationPipe)  {    this.tokenizationPipe = tokenizationPipe;  }  public Pipe getFeaturePipe ()  {    return featurePipe;  }  //xxx This method is inherent dangerous!!! Should check that pipe.alphabet equals crf.alphabet  public void setFeaturePipe (Pipe featurePipe)  {    this.featurePipe = featurePipe;  }  public Alphabet getInputAlphabet ()  {    return crf.getInputAlphabet ();  }  public LabelAlphabet getTargetAlphabet ()  {    return (LabelAlphabet) crf.getOutputAlphabet ();  }  public CRF4 getCrf ()  {    return crf;  }  /**   * Transfer some Pipes from the feature pipe to the tokenization pipe.   *  The feature pipe must be a SerialPipes.  This will destructively modify the CRF object of the extractor.   *   This is useful if you have a CRF hat has been trained from a single pipe, which you need to split up   *    int feature and tokenization pipes   */  public void slicePipes (int num)  {    Pipe fpipe = getFeaturePipe ();    if (!(fpipe instanceof SerialPipes))      throw new IllegalArgumentException ("slicePipes: FeaturePipe must be a SerialPipes.");    SerialPipes sp = (SerialPipes) fpipe;    ArrayList pipes = new ArrayList ();    for (int i = 0; i < num; i++) {      pipes.add (sp.getPipe (0));      sp.removePipe (0);    }    setTokenizationPipe (sp);  }  // Java serialization nonsense  // Serial version 0:  Initial version  // Serial version 1:  Add featurePipe  // Serial version 2:  Add filter  private static final int CURRENT_SERIAL_VERSION = 2;  private static final long serialVersionUID = 1;  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException  {    in.defaultReadObject ();    int version = in.readInt ();    if ((version == 0) || (featurePipe == null)) {      featurePipe = crf.getInputPipe ();    }    if (version < 2) {      filter = new BIOTokenizationFilter ();    }  }  private void writeObject (ObjectOutputStream out) throws IOException  {    out.defaultWriteObject ();    out.writeInt (CURRENT_SERIAL_VERSION);  }  public Sequence pipeInput (Object input)  {    InstanceList all = new InstanceList (getFeaturePipe ());    all.add (input, null, null, null);    return (Sequence) all.getInstance (0).getData();  }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?