📄 treebankchunker.java

📁 自然语言处理领域的一个开发包
💻 JAVA
字号:
/////////////////////////////////////////////////////////////////////////////////Copyright (C) 2003 Thomas Morton// //This library is free software; you can redistribute it and/or//modify it under the terms of the GNU Lesser General Public//License as published by the Free Software Foundation; either//version 2.1 of the License, or (at your option) any later version.// //This library is distributed in the hope that it will be useful,//but WITHOUT ANY WARRANTY; without even the implied warranty of//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the//GNU Lesser General Public License for more details.// //You should have received a copy of the GNU Lesser General Public//License along with this program; if not, write to the Free Software//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.//////////////////////////////////////////////////////////////////////////////package opennlp.tools.lang.english;import java.io.File;import java.io.IOException;import java.util.List;import opennlp.maxent.MaxentModel;import opennlp.maxent.io.SuffixSensitiveGISModelReader;import opennlp.tools.chunker.ChunkerContextGenerator;import opennlp.tools.chunker.ChunkerME;import opennlp.tools.util.Sequence;/** This is a chunker based on the CONLL chunking task which uses Penn Treebank constituents as the basis for the chunks. *   See   http://cnts.uia.ac.be/conll2000/chunking/ for data and task definition. * @author Tom Morton */public class TreebankChunker extends ChunkerME {    /**   * Creates an English Treebank Chunker which uses the specified model file.   * @param modelFile The name of the maxent model to be used.   * @throws IOException When the model file can't be open or read.   */  public TreebankChunker(String modelFile) throws IOException {    this(new SuffixSensitiveGISModelReader(new File(modelFile)).getModel());  }  /**   * Creates an English Treebank Chunker which uses the specified model.   * @param mod The maxent model to be used.   */  public TreebankChunker(MaxentModel mod) {    super(mod);  }  /**   * Creates an English Treebank Chunker which uses the specified model and context generator.   * @param mod The maxent model to be used.   * @param cg The context generator to be used.   */  public TreebankChunker(MaxentModel mod, ChunkerContextGenerator cg) {    super(mod, cg);  }  /**   * Creates an English Treebank Chunker which uses the specified model and context generator    * which will be decoded using the specified beamSize.    * @param mod The maxent model to be used.   * @param cg The context generator to be used.   * @param beamSize The size of the beam used for decoding.   */  public TreebankChunker(MaxentModel mod, ChunkerContextGenerator cg, int beamSize) {    super(mod, cg, beamSize);  }  private boolean validOutcome(String outcome, String prevOutcome) {    if (outcome.startsWith("I-")) {      if (prevOutcome == null) {        return (false);      }      else {        if (prevOutcome.equals("O")) {          return (false);        }        if (!prevOutcome.substring(2).equals(outcome.substring(2))) {          return (false);        }      }    }    return (true);  }    protected boolean validOutcome(String outcome, String[] sequence) {    String prevOutcome = null;    if (sequence.length > 0) {      prevOutcome = sequence[sequence.length-1];    }    return validOutcome(outcome,prevOutcome);  }  /* inherieted java doc */  protected boolean validOutcome(String outcome, Sequence sequence) {    String prevOutcome = null;    List tagList = sequence.getOutcomes();    int lti = tagList.size() - 1;    if (lti >= 0) {      prevOutcome = (String) tagList.get(lti);    }    return validOutcome(outcome,prevOutcome);  }  /**   * Chunks tokenized input from stdin. <br>   * Usage: java opennlp.tools.chunker.EnglishTreebankChunker model < tokenized_sentences <br>   */  public static void main(String[] args) throws IOException {    if (args.length == 0) {      System.err.println("Usage: java opennlp.tools.english.TreebankChunker model < tokenized_sentences");      System.exit(1);    }    TreebankChunker chunker = new TreebankChunker(args[0]);    java.io.BufferedReader inReader = new java.io.BufferedReader(new java.io.InputStreamReader(System.in));    for (String line = inReader.readLine(); line != null; line = inReader.readLine()) {      if (line.equals("")) {        System.out.println();      }      else {        String[] tts = line.split(" ");        String[] tokens = new String[tts.length];        String[] tags = new String[tts.length];        for (int ti=0,tn=tts.length;ti<tn;ti++) {          String[] tt = tts[ti].split("/");          tokens[ti]=tt[0];          tags[ti]=tt[1];         }        String[] chunks = chunker.chunk(tokens,tags);        //System.err.println(java.util.Arrays.asList(chunks));        for (int ci=0,cn=chunks.length;ci<cn;ci++) {          if (ci > 0 && !chunks[ci].startsWith("I-") && !chunks[ci-1].equals("O")) {            System.out.print(" ]");          }                      if (chunks[ci].startsWith("B-")) {            System.out.print(" ["+chunks[ci].substring(2));          }          /*           else {           System.out.print(" ");           }           */          System.out.print(" "+tokens[ci]+"/"+tags[ci]);        }        if (!chunks[chunks.length-1].equals("O")) {          System.out.print(" ]");        }        System.out.println();      }    }  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -