documentextraction.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 253 行

JAVA
253
字号
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.base.extract;import edu.umass.cs.mallet.base.types.*;import org.jdom.Element;import org.jdom.Document;import org.jdom.Namespace;import org.jdom.Text;import org.jdom.output.XMLOutputter;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import java.util.List;import gnu.trove.THashMap;/** * Created: Oct 12, 2004 * * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A> * @version $Id: DocumentExtraction.java,v 1.9 2005/03/18 00:14:59 casutton Exp $ *///TODO: Add place where user can have general Transducers to change CRF tokenization into LabeledSpans//TODO: Add field for CRF's labeled tokenizationpublic class DocumentExtraction {  private Tokenization input;  private Sequence predictedLabels;  private LabelSequence target;  private LabeledSpans extractedSpans;  private LabeledSpans targetSpans;  private Object document;  private Label backgroundTag;  private String name;  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input, Sequence predicted, String background)  {    this (name, dict, input, predicted, null, background, new BIOTokenizationFilter ());  }  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input, Sequence predicted,                             Sequence target, String background)  {    this (name, dict, input, predicted, target, background, new BIOTokenizationFilter ());  }  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,                             Sequence predicted, Sequence target, String background,                             TokenizationFilter filter)  {    this.document = input.getDocument ();    this.name = name;    assert (input.size() == predicted.size());    this.backgroundTag = dict.lookupLabel (background);    this.input = input;    this.predictedLabels = predicted;    this.extractedSpans = filter.constructLabeledSpans (dict, document, backgroundTag, input, predicted);    if (target != null) {      if (target instanceof LabelSequence) this.target = (LabelSequence) target;      this.targetSpans = filter.constructLabeledSpans (dict, document, backgroundTag, input, target);    }  }  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,                             LabeledSpans predictedSpans, LabeledSpans trueSpans, String background)  {    this.document = input.getDocument ();    this.name = name;    this.backgroundTag = dict.lookupLabel (background);    this.input = input;    this.extractedSpans = predictedSpans;    this.targetSpans = trueSpans;  }  public Object getDocument ()  {    return document;  }  public Tokenization getInput ()  {    return input;  }  public Sequence getPredictedLabels ()  {    return predictedLabels;  }  public LabeledSpans getExtractedSpans ()  {    return extractedSpans;  }  public LabeledSpans getTargetSpans ()  {    return targetSpans;  }  public LabelSequence getTarget ()  {    return target;  }  public String getName ()  {    return name;  }  //xxx nyi  public Span subspan (int start, int end)  {    throw new UnsupportedOperationException ("not yet implemented.");  }  public Document toXmlDocument ()  {    return toXmlDocument ("doc", Namespace.NO_NAMESPACE);  } /*  public Document toXmlDocument (String rootEltName, Namespace ns)  {    Element element = new Element (rootEltName, ns);    for (int i = 0; i < extractedSpans.size(); i++) {       LabeledSpan span = (LabeledSpan) extractedSpans.get(i);       Label tag = span.getLabel();       if (tag == backgroundTag) {         org.jdom.Parent p = element.addContent (span.getText ());       } else {         Element field = new Element (tag.toString(), ns);         field.setText (span.getText ());         element.addContent (field);       }     }    return new Document (element);  }   */  // does not do non-overlap sanity checking  public Document toXmlDocument (String rootEltName, Namespace ns)   {     ArrayList orderedByStart = new ArrayList (extractedSpans);     Collections.sort (orderedByStart, new Comparator () {       public int compare (Object o, Object o1)       {         int start1 = ((Span)o).getStartIdx ();         int start2 = ((Span)o1).getStartIdx ();         return Double.compare (start1, start2);       }     } );     ArrayList roots = new ArrayList (orderedByStart);     THashMap children = new THashMap ();     for (int i = 0; i < orderedByStart.size(); i++) {       LabeledSpan child = (LabeledSpan) orderedByStart.get (i);       for (int j = i-1; j >= 0; j--) {         LabeledSpan parent = (LabeledSpan) orderedByStart.get (j);         if (parent.isSubspan (child)) {           List childList = (List) children.get (parent);           if (childList == null) {             childList = new ArrayList ();             children.put (parent, childList);           }           roots.remove (child);           childList.add (child);           break;         }       }     }     CharSequence doc = (CharSequence) document;     Span wholeDoc = new StringSpan (doc, 0, doc.length ());     return new Document (generateElement (rootEltName, wholeDoc, roots, children));   }  private Element generateElement (String parentName, Span span, List childSpans, THashMap tree)  {    Element parentElt = new Element (parentName);    if (childSpans == null || childSpans.isEmpty ()) {      parentElt.setContent (new Text (span.getText ()));    } else {      List childElts = new ArrayList (childSpans.size());      int start = span.getStartIdx ();      int current = 0;      for (int i = 0; i < childSpans.size(); i++) {        LabeledSpan childSpan = (LabeledSpan) childSpans.get (i);        Label childLabel = childSpan.getLabel();        int childStart = childSpan.getStartIdx () - start;        if (childStart > current) {          childElts.add (new Text (span.getText().substring (current, childStart)));        }        if (childLabel == backgroundTag) {          childElts.add (new Text (childSpan.getText()));        } else {          String name = childLabel.getEntry ().toString();          List grandchildren = (List) tree.get (childSpan);          childElts.add (generateElement (name, childSpan, grandchildren, tree));        }        current = childSpan.getEndIdx () - start;      }      if (current < span.getEndIdx ())        childElts.add (new Text (span.getText().substring (current)));      parentElt.addContent (childElts);    }    return parentElt;  }  public String toXmlString ()  {    Document jdom = toXmlDocument ();    XMLOutputter outputter = new XMLOutputter ();    return outputter.outputString (jdom);  }  public int size ()  {    return extractedSpans.size();  }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?