📄 parsereventstream.java

📁 自然语言处理领域的一个开发包
💻 JAVA
字号:
/////////////////////////////////////////////////////////////////////////////////Copyright (C) 2003 Thomas Morton////This library is free software; you can redistribute it and/or//modify it under the terms of the GNU Lesser General Public//License as published by the Free Software Foundation; either//version 2.1 of the License, or (at your option) any later version.////This library is distributed in the hope that it will be useful,//but WITHOUT ANY WARRANTY; without even the implied warranty of//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the//GNU General Public License for more details.////You should have received a copy of the GNU Lesser General Public//License along with this program; if not, write to the Free Software//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.//////////////////////////////////////////////////////////////////////////////   package opennlp.tools.parser;import java.util.ArrayList;import java.util.List;import java.util.Set;import opennlp.maxent.DataStream;import opennlp.maxent.Event;import opennlp.maxent.EventStream;import opennlp.tools.chunker.ChunkerContextGenerator;import opennlp.tools.ngram.Dictionary;import opennlp.tools.postag.DefaultPOSContextGenerator;import opennlp.tools.postag.POSContextGenerator;/** * Wrapper class for one of four parser event streams.  The particular event stram is specified  * at construction. * @author Tom Morton * */public class ParserEventStream implements EventStream {  private BuildContextGenerator bcg;  private CheckContextGenerator kcg;  private ChunkerContextGenerator ccg;  private POSContextGenerator tcg;  private DataStream data;  private Event[] events;  private int ei;  private HeadRules rules;  private Set punctSet;  private EventTypeEnum etype;  /**   * Create an event stream based on the specified data stream of the specified type using the specified head rules.   * @param d A 1-parse-per-line Penn Treebank Style parse.    * @param rules The head rules.   * @param etype The type of events desired (tag, chunk, build, or check).   * @param dict A tri-gram dictionary to reduce feature generation.   */  public ParserEventStream(DataStream d, HeadRules rules, EventTypeEnum etype, Dictionary dict) {    if (etype == EventTypeEnum.BUILD) {      this.bcg = new BuildContextGenerator(dict);    }    else if (etype == EventTypeEnum.CHECK) {      this.kcg = new CheckContextGenerator();    }    else if (etype == EventTypeEnum.CHUNK) {      this.ccg = new ChunkContextGenerator();    }    else if (etype == EventTypeEnum.TAG) {      this.tcg = new DefaultPOSContextGenerator(dict);    }    this.rules = rules;    punctSet = rules.getPunctuationTags();    this.etype = etype;    data = d;    ei = 0;    if (d.hasNext()) {      addNewEvents();    }    else {      events = new Event[0];    }  }    public ParserEventStream(DataStream d, HeadRules rules, EventTypeEnum etype) {    this (d,rules,etype,null);  }  public boolean hasNext() {    return (ei < events.length || data.hasNext());  }  public Event nextEvent() {    if (ei == events.length) {      addNewEvents();      ei = 0;    }    return ((Event) events[ei++]);  }  private static void getInitialChunks(Parse p, List ichunks) {    if (p.isPosTag()) {      ichunks.add(p);    }    else {      Parse[] kids = p.getChildren();      boolean allKidsAreTags = true;      for (int ci = 0, cl = kids.length; ci < cl; ci++) {        if (!kids[ci].isPosTag()) {          allKidsAreTags = false;          break;        }      }      if (allKidsAreTags) {        ichunks.add(p);      }      else {        for (int ci = 0, cl = kids.length; ci < cl; ci++) {          getInitialChunks(kids[ci], ichunks);        }      }    }  }  public static Parse[] getInitialChunks(Parse p) {    List chunks = new ArrayList();    getInitialChunks(p, chunks);    return (Parse[]) chunks.toArray(new Parse[chunks.size()]);  }  /**   * Returns true if the specified child is the first child of the specified parent.   * @param child The child parse.   * @param parent The parent parse.   * @return true if the specified child is the first child of the specified parent; false otherwise.   */  private boolean firstChild(Parse child, Parse parent) {    return ParserME.collapsePunctuation(parent.getChildren(),punctSet)[0] == child;  }  /**   * Returns true if the specified child is the last child of the specified parent.   * @param child The child parse.   * @param parent The parent parse.   * @return true if the specified child is the last child of the specified parent; false otherwise.   */  private boolean lastChild(Parse child, Parse parent) {    Parse[] kids = ParserME.collapsePunctuation(parent.getChildren(),punctSet);    return (kids[kids.length - 1] == child);  }  private void addNewEvents() {    String parseStr = (String) data.nextToken();    //System.err.println("ParserEventStream.addNewEvents: "+parseStr);    List newEvents = new ArrayList();    Parse p = Parse.parseParse(parseStr);    p.updateHeads(rules);    Parse[] chunks = getInitialChunks(p);    if (etype == EventTypeEnum.TAG) {      addTagEvents(newEvents, chunks);    }    else if (etype == EventTypeEnum.CHUNK) {      addChunkEvents(newEvents, chunks);    }    else {      addParseEvents(newEvents, ParserME.collapsePunctuation(chunks,punctSet));    }    this.events = (Event[]) newEvents.toArray(new Event[newEvents.size()]);  }    public static  Parse[] reduceChunks(Parse[] chunks, int ci, Parse parent) {    String type = parent.getType();    //  perform reduce    int reduceStart = ci;    int reduceEnd = ci;    while (reduceStart >=0 && chunks[reduceStart].getParent() == parent) {      reduceStart--;    }    reduceStart++;    Parse[] reducedChunks;    if (!type.equals(ParserME.TOP_NODE)) {      reducedChunks = new Parse[chunks.length-(reduceEnd-reduceStart+1)+1]; //total - num_removed + 1 (for new node)      //insert nodes before reduction      for (int ri=0,rn=reduceStart;ri<rn;ri++) {        reducedChunks[ri]=chunks[ri];      }      //insert reduced node      reducedChunks[reduceStart]=parent;      //propagate punctuation sets      parent.setPrevPunctuation(chunks[reduceStart].getPreviousPunctuationSet());      parent.setNextPunctuation(chunks[reduceEnd].getNextPunctuationSet());      //insert nodes after reduction      int ri=reduceStart+1;      for (int rci=reduceEnd+1;rci<chunks.length;rci++) {        reducedChunks[ri]=chunks[rci];        ri++;      }      ci=reduceStart-1; //ci will be incremented at end of loop    }    else {      reducedChunks = new Parse[0];    }    return reducedChunks;  }  private void addParseEvents(List parseEvents, Parse[] chunks) {    int ci = 0;    while (ci < chunks.length) {      //System.err.println("parserEventStream.addParseEvents: chunks="+Arrays.asList(chunks));      Parse c = chunks[ci];      Parse parent = c.getParent();      if (parent != null) {        String type = parent.getType();        String outcome;        if (firstChild(c, parent)) {          outcome = ParserME.START + type;        }        else {          outcome = ParserME.CONT + type;        }        //System.err.println("parserEventStream.addParseEvents: chunks["+ci+"]="+c+" label="+outcome);        c.setLabel(outcome);        if (etype == EventTypeEnum.BUILD) {          parseEvents.add(new Event(outcome, bcg.getContext(chunks, ci)));        }        int start = ci - 1;        while (start >= 0 && chunks[start].getParent() == parent) {          start--;        }        if (lastChild(c, parent)) {          if (etype == EventTypeEnum.CHECK) {            parseEvents.add(new Event(ParserME.COMPLETE, kcg.getContext( chunks, type, start + 1, ci)));          }          //perform reduce          int reduceStart = ci;          while (reduceStart >=0 && chunks[reduceStart].getParent() == parent) {            reduceStart--;          }          reduceStart++;          chunks = reduceChunks(chunks,ci,parent);          ci=reduceStart-1; //ci will be incremented at end of loop        }        else {          if (etype == EventTypeEnum.CHECK) {            parseEvents.add(new Event(ParserME.INCOMPLETE, kcg.getContext(chunks, type, start + 1, ci)));          }        }      }      ci++;    }  }  private void addChunkEvents(List chunkEvents, Parse[] chunks) {    List toks = new ArrayList();    List tags = new ArrayList();    List preds = new ArrayList();    for (int ci = 0, cl = chunks.length; ci < cl; ci++) {      Parse c = chunks[ci];      if (c.isPosTag()) {        toks.add(c.toString());        tags.add(c.getType());        preds.add(ParserME.OTHER);      }      else {        boolean start = true;        String ctype = c.getType();        Parse[] kids = c.getChildren();        for (int ti=0,tl=kids.length;ti<tl;ti++) {          Parse tok = kids[ti];          toks.add(tok.toString());          tags.add(tok.getType());          if (start) {            preds.add(ParserME.START + ctype);            start = false;          }          else {            preds.add(ParserME.CONT + ctype);          }        }      }    }    for (int ti = 0, tl = toks.size(); ti < tl; ti++) {      chunkEvents.add(new Event((String) preds.get(ti), ccg.getContext(ti, toks.toArray(), (String[]) tags.toArray(new String[tags.size()]), (String[]) preds.toArray(new String[preds.size()]))));    }  }  private void addTagEvents(List tagEvents, Parse[] chunks) {    List toks = new ArrayList();    List preds = new ArrayList();    for (int ci = 0, cl = chunks.length; ci < cl; ci++) {      Parse c = (Parse) chunks[ci];      if (c.isPosTag()) {        toks.add(c.toString());        preds.add(c.getType());      }      else {        Parse[] kids = c.getChildren();        for (int ti=0,tl=kids.length;ti<tl;ti++) {          Parse tok = kids[ti];          toks.add(tok.toString());          preds.add(tok.getType());        }      }    }    for (int ti = 0, tl = toks.size(); ti < tl; ti++) {      tagEvents.add(new Event((String) preds.get(ti), tcg.getContext(ti, toks.toArray(), (String[]) preds.toArray(new String[preds.size()]), null)));    }  }  public static void main(String[] args) throws java.io.IOException {    if (args.length == 0) {      System.err.println("Usage ParserEventStream -[tag|chunk|build|check|fun] head_rules [dictionary] < parses");      System.exit(1);    }    EventTypeEnum etype = null;    boolean fun = false;    int ai = 0;    while (ai < args.length && args[ai].startsWith("-")) {      if (args[ai].equals("-build")) {        etype = EventTypeEnum.BUILD;      }      else if (args[ai].equals("-check")) {        etype = EventTypeEnum.CHECK;      }      else if (args[ai].equals("-chunk")) {        etype = EventTypeEnum.CHUNK;      }      else if (args[ai].equals("-tag")) {        etype = EventTypeEnum.TAG;      }      else if (args[ai].equals("-fun")) {        fun = true;      }      else {        System.err.println("Invalid option " + args[ai]);        System.exit(1);      }      ai++;    }    HeadRules rules = new opennlp.tools.lang.english.HeadRules(args[ai++]);    Dictionary dict = null;    if (ai < args.length) {      dict = new Dictionary(args[ai++]);    }    if (fun) {      Parse.useFunctionTags(true);    }    opennlp.maxent.EventStream es = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.InputStreamReader(System.in)), rules, etype, dict);    while (es.hasNext()) {      System.out.println(es.nextEvent());    }  }}/** * Enumerated type of event types for the parser.  * */class EventTypeEnum {  private String name;  public static final EventTypeEnum BUILD = new EventTypeEnum("build");  public static final EventTypeEnum CHECK = new EventTypeEnum("check");  public static final EventTypeEnum CHUNK = new EventTypeEnum("chunk");  public static final EventTypeEnum TAG = new EventTypeEnum("tag");  private EventTypeEnum(String name) {    this.name = name;  }  public String toString() {    return name;  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -