📄 parserme.java

📁 自然语言处理领域的一个开发包
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/////////////////////////////////////////////////////////////////////////////////Copyright (C) 2003 Thomas Morton////This library is free software; you can redistribute it and/or//modify it under the terms of the GNU Lesser General Public//License as published by the Free Software Foundation; either//version 2.1 of the License, or (at your option) any later version.////This library is distributed in the hope that it will be useful,//but WITHOUT ANY WARRANTY; without even the implied warranty of//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the//GNU General Public License for more details.////You should have received a copy of the GNU Lesser General Public//License along with this program; if not, write to the Free Software//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.//////////////////////////////////////////////////////////////////////////////   package opennlp.tools.parser;import java.util.ArrayList;import java.util.Collection;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Set;import java.util.SortedSet;import java.util.TreeSet;import opennlp.maxent.DataStream;import opennlp.maxent.GISModel;import opennlp.maxent.MaxentModel;import opennlp.maxent.TwoPassDataIndexer;import opennlp.tools.ngram.Dictionary;import opennlp.tools.ngram.MutableDictionary;import opennlp.tools.util.Sequence;import opennlp.tools.util.Span;/** * Class for a shift reduce style parser based on Adwait Ratnaparkhi's 1998 thesis.  *  */public class ParserME {  /** The maximum number of parses advanced from all preceeding parses at each derivation step. */  private int M;  /** The maximum number of parses to advance from a single preceeding parse. */  private int K;  /** The minimum total probability mass of advanced outcomes.*/  private double Q;  /** The default beam size used if no beam size is given. 
*/
  public static final int defaultBeamSize = 20;

  /** The default amount of probability mass required of advanced outcomes. */
  public static final double defaultAdvancePercentage = 0.95;

  /** Completed parses. */
  private SortedSet parses;

  /** Incomplete parses which will be advanced. */
  private SortedSet odh;

  /** Incomplete parses which have been advanced. */
  private SortedSet ndh;

  /** The pos-tagger that the parser uses. */
  protected ParserTagger tagger; //POS tagger

  /** The chunker that the parser uses to chunk non-recursive structures. */
  protected ParserChunker chunker; //Basal Chunker

  /** The model to assign constituent labels (supplied to the constructor). */
  private MaxentModel buildModel;

  /** The model to determine a constituent is complete (supplied to the constructor). */
  private MaxentModel checkModel;

  /** Instantiated in the constructor; presumably generates contexts for buildModel -- confirm. */
  private BuildContextGenerator buildContextGenerator;

  /** Instantiated in the constructor; presumably generates contexts for checkModel -- confirm. */
  private CheckContextGenerator checkContextGenerator;

  /** The head rules supplied to the constructor. */
  private HeadRules headRules;

  /** The punctuation tags, obtained from headRules.getPunctuationTags() in the constructor. */
  private Set punctSet;

  /** Array allocated in the constructor with one slot per outcome of buildModel. */
  private double[] bprobs;

  /** Array allocated in the constructor with one slot per outcome of checkModel. */
  private double[] cprobs;

  /** Node label combined with START to form TOP_START; presumably the root node label -- confirm. */
  public static final String TOP_NODE = "TOP";

  /** Node label; presumably used for token nodes -- confirm against the rest of the file. */
  public static final String TOK_NODE = "TK";

  /** Shared Integer constant holding the value 0. */
  public static final Integer ZERO = new Integer(0);

  /** Prefix for outcomes starting a constituent. */
  public static final String START = "S-";

  /** Prefix for outcomes continuing a constituent. */
  public static final String CONT = "C-";

  /** Outcome for token which is not contained in a basal constituent. */
  public static final String OTHER = "O";

  /** Outcome used when a constituent is complete. */
  public static final String COMPLETE = "c";

  /** Outcome used when a constituent is incomplete. */
  public static final String INCOMPLETE = "i";

  /** The START prefix followed by TOP_NODE. */
  private static final String TOP_START = START + TOP_NODE;

  /** Index of the TOP_START outcome in buildModel, looked up in the constructor. */
  private int topStartIndex;

  /** Maps each buildModel outcome beginning with START to the outcome text after that prefix. */
  private Map startTypeMap;

  /** Maps each buildModel outcome beginning with CONT to the outcome text after that prefix. */
  private Map contTypeMap;

  /** Index of the COMPLETE outcome in checkModel, looked up in the constructor. */
  private int completeIndex;

  /** Index of the INCOMPLETE outcome in checkModel, looked up in the constructor. */
  private int incompleteIndex;

  /** When true, parse() attaches a fresh StringBuffer derivation to the token parse before searching. */
  private boolean createDerivationString = false;

  /** When true, parse() prints the step, rank and probability of each parse it advances to System.out. */
  private boolean debugOn = false;

  /** Whether failures to advance or find a parse are written to System.err; see setErrorReporting. */
  protected boolean reportFailedParse;

  /**
   * Creates a new parser using the specified models and head rules.
   * @param buildModel The model to assign constituent labels.
* @param checkModel The model to determine a constituent is complete.   * @param tagger The model to assign pos-tags.   * @param chunker The model to assign flat constituent labels.   * @param headRules The head rules for head word perculation.   */  public ParserME(MaxentModel buildModel, MaxentModel checkModel, ParserTagger tagger, ParserChunker chunker, HeadRules headRules) {  	this(buildModel,checkModel,tagger,chunker,headRules,defaultBeamSize,defaultAdvancePercentage);  }  /**   * Creates a new parser using the specified models and head rules using the specified beam size and advance percentage.   * @param buildModel The model to assign constituent labels.   * @param checkModel The model to determine a constituent is complete.   * @param tagger The model to assign pos-tags.   * @param chunker The model to assign flat constituent labels.   * @param headRules The head rules for head word perculation.   * @param beamSize The number of different parses kept during parsing.    * @param advancePercentage The minimal amount of probability mass which advanced outcomes must represent.     * Only outcomes which contribute to the top "advancePercentage" will be explored.       
*/  public ParserME(MaxentModel buildModel, MaxentModel checkModel, ParserTagger tagger, ParserChunker chunker, HeadRules headRules, int beamSize, double advancePercentage) {    this.tagger = tagger;     this.chunker = chunker;    this.buildModel = buildModel;    this.checkModel = checkModel;    this.M = beamSize;    this.K = beamSize;    this.Q = advancePercentage;    reportFailedParse = true;    bprobs = new double[buildModel.getNumOutcomes()];    cprobs = new double[checkModel.getNumOutcomes()];    this.buildContextGenerator = new BuildContextGenerator();    this.checkContextGenerator = new CheckContextGenerator();    this.headRules = headRules;    this.punctSet = headRules.getPunctuationTags();    odh = new TreeSet();    ndh = new TreeSet();    parses = new TreeSet();    startTypeMap = new HashMap();    contTypeMap = new HashMap();    for (int boi = 0, bon = buildModel.getNumOutcomes(); boi < bon; boi++) {      String outcome = buildModel.getOutcome(boi);      if (outcome.startsWith(START)) {        //System.err.println("startMap "+outcome+"->"+outcome.substring(START.length()));        startTypeMap.put(outcome, outcome.substring(START.length()));      }      else if (outcome.startsWith(CONT)) {        //System.err.println("contMap "+outcome+"->"+outcome.substring(CONT.length()));        contTypeMap.put(outcome, outcome.substring(CONT.length()));      }    }    topStartIndex = buildModel.getIndex(TOP_START);    completeIndex = checkModel.getIndex(COMPLETE);    incompleteIndex = checkModel.getIndex(INCOMPLETE);  }    /**   * Specifies whether the parser should report when it was unable to find a parse for   * a particular sentence.   * @param errorReporting If true then un-parsed sentences are reported, false otherwise.   */  public void setErrorReporting(boolean errorReporting) {    this.reportFailedParse = errorReporting;  }    /**   * Returns the specified number of parses or fewer for the specified tokens. 
<br>   * <b>Note:</b> The nodes within   * the returned parses are shared with other parses and therefore their parent node references will not be consistent   * with their child node reference.  {@link #setParents setParents} can be used to make the parents consistent   * with a partuicular parse, but subsequent calls to <code>setParents</code> can invalidate the results of earlier   * calls.<br>     * @param tokens A parse containing the tokens with a single parent node.   * @param numParses The number of parses desired.   * @return the specified number of parses for the specified tokens.   */  public Parse[] parse(Parse tokens, int numParses) {  	if (createDerivationString) tokens.setDerivation(new StringBuffer(100));    odh.clear();    ndh.clear();    parses.clear();    int i = 0; //derivation length    int maxDerivationLength = 2 * tokens.getChildCount() + 3;    odh.add(tokens);    Parse guess = null;    double bestComplete = -100000; //approximating -infinity/0 in ln domain    while (parses.size() < M && i < maxDerivationLength) {      ndh = new TreeSet();      if (odh.size() > 0) {        int j = 0;        for (Iterator pi = odh.iterator(); pi.hasNext() && j < K; j++) { // foearch derivation          Parse tp = (Parse) pi.next();          if (tp.getProb() < bestComplete) { //this parse and the ones which follow will never win, stop advancing.            
break;          }          if (guess == null && i == 2) {            guess = tp;          }          if (debugOn) {            System.out.print(i + " " + j + " "+tp.getProb());            tp.show();            System.out.println();          }          Parse[] nd = null;          if (0 == i) {            nd = advanceTags(tp);          }          else if (1 == i) {            if (ndh.size() < K) {              //System.err.println("advancing ts "+j+" "+ndh.size()+" < "+K);              nd = advanceChunks(tp,bestComplete);            }            else {              //System.err.println("advancing ts "+j+" prob="+((Parse) ndh.last()).getProb());              nd = advanceChunks(tp,((Parse) ndh.last()).getProb());            }          }          else { // i > 1            nd = advanceParses(tp, Q);          }          if (nd != null) {            for (int k = 0, kl = nd.length; k < kl; k++) {              if (nd[k].complete()) {                advanceTop(nd[k]);                if (nd[k].getProb() > bestComplete) {                  bestComplete = nd[k].getProb();                }                parses.add(nd[k]);              }              else {                ndh.add(nd[k]);              }            }          }          else {            if (reportFailedParse) {              System.err.println("Couldn't advance parse "+i+" stage "+j+"!\n");            }            advanceTop(tp);            parses.add(tp);          }        }        i++;        odh = ndh;      }      else {        break;      }    }    if (parses.size() == 0) {      if (reportFailedParse) System.err.println("Couldn't find parse for: " + tokens);      //Parse r = (Parse) odh.first();      //r.show();      //System.out.println();      return new Parse[] {guess};    }    else if (numParses == 1){
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -