📄 sentencedetectorme.java

📁 自然语言处理领域的一个开发包
💻 JAVA
字号:
///////////////////////////////////////////////////////////////////////////////// Copyright (C) 2004 Jason Baldridge, Gann Bierner and Tom Morton// // This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.// // This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the// GNU Lesser General Public License for more details.// // You should have received a copy of the GNU Lesser General Public// License along with this program; if not, write to the Free Software// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.//////////////////////////////////////////////////////////////////////////////package opennlp.tools.sentdetect;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileReader;import java.io.IOException;import java.io.InputStreamReader;import java.io.Reader;import java.util.ArrayList;import java.util.List;import opennlp.maxent.ContextGenerator;import opennlp.maxent.DataStream;import opennlp.maxent.EventStream;import opennlp.maxent.GIS;import opennlp.maxent.GISModel;import opennlp.maxent.IntegerPool;import opennlp.maxent.MaxentModel;import opennlp.maxent.PlainTextByLineDataStream;import opennlp.maxent.io.SuffixSensitiveGISModelWriter;import opennlp.tools.util.Pair;/** * A sentence detector for splitting up raw text into sentences.  A maximum * entropy model is used to evaluate the characters ".", "!", and "?" in a * string to determine if they signify the end of a sentence. * * @author      Jason Baldridge and Tom Morton * @version     $Revision: 1.12 $, $Date: 2005/08/25 03:18:18 $ */public class SentenceDetectorME implements SentenceDetector {  /** The maximum entropy model to use to evaluate contexts. */  private MaxentModel model;  /** The feature context generator. */  private final ContextGenerator cgen;  /** The EndOfSentenceScanner to use when scanning for end of sentence offsets. */  private final EndOfSentenceScanner scanner;  /** A pool of read-only java.lang.Integer objects in the range 0..100 */  private static final IntegerPool INT_POOL = new IntegerPool(100);    /** The list of probabilities associated with each decision. */  private List sentProbs;  /**   * Constructor which takes a MaxentModel and calls the three-arg   * constructor with that model, an SDContextGenerator, and the   * default end of sentence scanner.   *   * @param m The MaxentModel which this SentenceDetectorME will use to   *          evaluate end-of-sentence decisions.   */  public SentenceDetectorME(MaxentModel m) {    this(m, new SDContextGenerator(DefaultEndOfSentenceScanner.eosCharacters), new DefaultEndOfSentenceScanner());    sentProbs = new ArrayList(50);  }  /**   * Constructor which takes a MaxentModel and a ContextGenerator.   * calls the three-arg constructor with a default ed of sentence scanner.   *   * @param m The MaxentModel which this SentenceDetectorME will use to   *          evaluate end-of-sentence decisions.   * @param cg The ContextGenerator object which this SentenceDetectorME   *           will use to turn Strings into contexts for the model to   *           evaluate.   */  public SentenceDetectorME(MaxentModel m, ContextGenerator cg) {    this(m, cg, new DefaultEndOfSentenceScanner());  }  /**   * Creates a new <code>SentenceDetectorME</code> instance.   *   * @param m The MaxentModel which this SentenceDetectorME will use to   *          evaluate end-of-sentence decisions.   * @param cg The ContextGenerator object which this SentenceDetectorME   *           will use to turn Strings into contexts for the model to   *           evaluate.   * @param s the EndOfSentenceScanner which this SentenceDetectorME   *          will use to locate end of sentence indexes.   */  public SentenceDetectorME(MaxentModel m, ContextGenerator cg, EndOfSentenceScanner s) {    model = m;    cgen = cg;    scanner = s;  }  /**   * Detect sentences in a String.   *   * @param s  The string to be processed.   * @return   A string array containing individual sentences as elements.   *              */  public String[] sentDetect(String s) {    int[] starts = sentPosDetect(s);    if (starts.length == 0) {	return new String[] {s};    }    boolean leftover = starts[starts.length - 1] != s.length();    String[] sents = new String[leftover? starts.length + 1 : starts.length];    sents[0] = s.substring(0,starts[0]);    for (int si = 1; si < starts.length; si++) {      sents[si] = s.substring(starts[si - 1], starts[si]);    }    if (leftover) {        sents[sents.length - 1] = s.substring(starts[starts.length - 1]);    }        return (sents);  }  private int getFirstWS(String s, int pos) {    while (pos < s.length() && !Character.isWhitespace(s.charAt(pos)))      pos++;    return pos;  }  private int getFirstNonWS(String s, int pos) {    while (pos < s.length() && Character.isWhitespace(s.charAt(pos)))      pos++;    return pos;  }  /**   * Detect the position of the first words of sentences in a String.   *   * @param s  The string to be processed.   * @return   A integer array containing the positions of the end index of   *          every sentence   *              */  public int[] sentPosDetect(String s) {    double sentProb = 1;    sentProbs.clear();    StringBuffer sb = new StringBuffer(s);    List enders = scanner.getPositions(s);    List positions = new ArrayList(enders.size());    for (int i = 0, end = enders.size(), index = 0; i < end; i++) {      Integer candidate = (Integer) enders.get(i);      int cint = candidate.intValue();      // skip over the leading parts of non-token final delimiters      int fws = getFirstWS(s,cint + 1);      if (((i + 1) < end) && (((Integer) enders.get(i + 1)).intValue() < fws)) {        continue;      }      Pair pair = new Pair(sb, candidate);      double[] probs = model.eval(cgen.getContext(pair));      String bestOutcome = model.getBestOutcome(probs);      sentProb *= probs[model.getIndex(bestOutcome)];      if (bestOutcome.equals("T") && isAcceptableBreak(s, index, cint)) {        if (index != cint) {          positions.add(INT_POOL.get(getFirstNonWS(s, getFirstWS(s,cint + 1))));          sentProbs.add(new Double(probs[model.getIndex(bestOutcome)]));        }        index = cint + 1;      }    }    int[] sentPositions = new int[positions.size()];    for (int i = 0; i < sentPositions.length; i++) {      sentPositions[i] = ((Integer) positions.get(i)).intValue();    }    return sentPositions;  }  /** Returns the probabilities associated with the most recent   * calls to sentDetect().   * @return probability for each sentence returned for the most recent   * call to sentDetect.  If not applicable an empty array is   * returned.   */  public double[] getSentenceProbabilities() {    double[] sentProbArray = new double[sentProbs.size()];    for (int i = 0; i < sentProbArray.length; i++) {      sentProbArray[i] = ((Double) sentProbs.get(i)).doubleValue();    }    return sentProbArray;  }  /**    * Allows subclasses to check an overzealous (read: poorly   * trained) model from flagging obvious non-breaks as breaks based   * on some boolean determination of a break's acceptability.   *   * <p>The implementation here always returns true, which means   * that the MaxentModel's outcome is taken as is.</p>   *    * @param s the string in which the break occured.    * @param fromIndex the start of the segment currently being evaluated    * @param candidateIndex the index of the candidate sentence ending    * @return true if the break is acceptable    */  protected boolean isAcceptableBreak(String s, int fromIndex, int candidateIndex) {    return true;  }  public static GISModel train(EventStream es, int iterations, int cut) throws IOException {    return GIS.trainModel(es, iterations, cut);  }  /**   * Use this training method if you wish to supply an end of   * sentence scanner which provides a different set of ending chars   * other than the default ones.  They are "\\.|!|\\?|\\\"|\\)".   *   */  public static GISModel train(File inFile, int iterations, int cut, EndOfSentenceScanner scanner) throws IOException {    EventStream es;    DataStream ds;    Reader reader;    reader = new BufferedReader(new FileReader(inFile));    ds = new PlainTextByLineDataStream(reader);    es = new SDEventStream(ds, scanner);    return GIS.trainModel(es, iterations, cut);  }    private static void usage() {    System.err.println("Usage: SentenceDetectorME trainData modelName");    System.exit(1);      }  /**   * <p>Trains a new sentence detection model.</p>   *   * <p>Usage: opennlp.tools.sentdetect.SentenceDetectorME data_file new_model_name (iterations cutoff)?</p>   *   */  public static void main(String[] args) throws IOException {    int ai=0;    String encoding = null;    while (args[ai].startsWith("-")) {      if (args[ai].equals("-encoding")) {        ai++;        if (ai < args.length) {          encoding = args[ai];          ai++;        }        else {          usage();        }      }    }    File inFile = new File(args[ai++]);    File outFile = new File(args[ai++]);    GISModel mod;        try {      EventStream es = new SDEventStream(new PlainTextByLineDataStream(new InputStreamReader(new FileInputStream(inFile),encoding)));      if (args.length > ai)        mod = train(es, Integer.parseInt(args[ai++]), Integer.parseInt(args[ai++]));      else        mod = train(es, 100, 5);      System.out.println("Saving the model as: " + outFile);      new SuffixSensitiveGISModelWriter(mod, outFile).persist();    }    catch (Exception e) {      e.printStackTrace();    }  }}
💿 文件大小 1863 K
👤 上传用户 yangbaochun
📂 所属分类多国语言处理
🏷️ 相关标签

#自然语言处理 #开发包
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -