📄 parserme.java
字号:
/////////////////////////////////////////////////////////////////////////////////Copyright (C) 2003 Thomas Morton////This library is free software; you can redistribute it and/or//modify it under the terms of the GNU Lesser General Public//License as published by the Free Software Foundation; either//version 2.1 of the License, or (at your option) any later version.////This library is distributed in the hope that it will be useful,//but WITHOUT ANY WARRANTY; without even the implied warranty of//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the//GNU General Public License for more details.////You should have received a copy of the GNU Lesser General Public//License along with this program; if not, write to the Free Software//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.////////////////////////////////////////////////////////////////////////////// package opennlp.tools.parser;import java.util.ArrayList;import java.util.Collection;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Set;import java.util.SortedSet;import java.util.TreeSet;import opennlp.maxent.DataStream;import opennlp.maxent.GISModel;import opennlp.maxent.MaxentModel;import opennlp.maxent.TwoPassDataIndexer;import opennlp.tools.ngram.Dictionary;import opennlp.tools.ngram.MutableDictionary;import opennlp.tools.util.Sequence;import opennlp.tools.util.Span;/** * Class for a shift reduce style parser based on Adwait Ratnaparkhi's 1998 thesis. * */public class ParserME { /** The maximum number of parses advanced from all preceeding parses at each derivation step. */ private int M; /** The maximum number of parses to advance from a single preceeding parse. */ private int K; /** The minimum total probability mass of advanced outcomes.*/ private double Q; /** The default beam size used if no beam size is given. */ public static final int defaultBeamSize = 20; /** The default amount of probability mass required of advanced outcomes. */ public static final double defaultAdvancePercentage = 0.95; /** Completed parses. */ private SortedSet parses; /** Incomplete parses which will be advanced. */ private SortedSet odh; /** Incomplete parses which have been advanced. */ private SortedSet ndh; /** The pos-tagger that the parser uses. */ protected ParserTagger tagger; //POS tagger /** The chunker that the parser uses to chunk non-recursive structures. */ protected ParserChunker chunker; //Basal Chunker private MaxentModel buildModel; private MaxentModel checkModel; private BuildContextGenerator buildContextGenerator; private CheckContextGenerator checkContextGenerator; private HeadRules headRules; private Set punctSet; private double[] bprobs; private double[] cprobs; public static final String TOP_NODE = "TOP"; public static final String TOK_NODE = "TK"; public static final Integer ZERO = new Integer(0); /** Prefix for outcomes starting a constituent. */ public static final String START = "S-"; /** Prefix for outcomes continuing a constituent. */ public static final String CONT = "C-"; /** Outcome for token which is not contained in a basal constituent. */ public static final String OTHER = "O"; /** Outcome used when a constituent is complete. */ public static final String COMPLETE = "c"; /** Outcome used when a constituent is incomplete. */ public static final String INCOMPLETE = "i"; private static final String TOP_START = START + TOP_NODE; private int topStartIndex; private Map startTypeMap; private Map contTypeMap; private int completeIndex; private int incompleteIndex; private boolean createDerivationString = false; private boolean debugOn = false; protected boolean reportFailedParse; /** * Creates a new parser using the specified models and head rules. * @param buildModel The model to assign constituent labels. * @param checkModel The model to determine a constituent is complete. * @param tagger The model to assign pos-tags. * @param chunker The model to assign flat constituent labels. * @param headRules The head rules for head word perculation. */ public ParserME(MaxentModel buildModel, MaxentModel checkModel, ParserTagger tagger, ParserChunker chunker, HeadRules headRules) { this(buildModel,checkModel,tagger,chunker,headRules,defaultBeamSize,defaultAdvancePercentage); } /** * Creates a new parser using the specified models and head rules using the specified beam size and advance percentage. * @param buildModel The model to assign constituent labels. * @param checkModel The model to determine a constituent is complete. * @param tagger The model to assign pos-tags. * @param chunker The model to assign flat constituent labels. * @param headRules The head rules for head word perculation. * @param beamSize The number of different parses kept during parsing. * @param advancePercentage The minimal amount of probability mass which advanced outcomes must represent. * Only outcomes which contribute to the top "advancePercentage" will be explored. */ public ParserME(MaxentModel buildModel, MaxentModel checkModel, ParserTagger tagger, ParserChunker chunker, HeadRules headRules, int beamSize, double advancePercentage) { this.tagger = tagger; this.chunker = chunker; this.buildModel = buildModel; this.checkModel = checkModel; this.M = beamSize; this.K = beamSize; this.Q = advancePercentage; reportFailedParse = true; bprobs = new double[buildModel.getNumOutcomes()]; cprobs = new double[checkModel.getNumOutcomes()]; this.buildContextGenerator = new BuildContextGenerator(); this.checkContextGenerator = new CheckContextGenerator(); this.headRules = headRules; this.punctSet = headRules.getPunctuationTags(); odh = new TreeSet(); ndh = new TreeSet(); parses = new TreeSet(); startTypeMap = new HashMap(); contTypeMap = new HashMap(); for (int boi = 0, bon = buildModel.getNumOutcomes(); boi < bon; boi++) { String outcome = buildModel.getOutcome(boi); if (outcome.startsWith(START)) { //System.err.println("startMap "+outcome+"->"+outcome.substring(START.length())); startTypeMap.put(outcome, outcome.substring(START.length())); } else if (outcome.startsWith(CONT)) { //System.err.println("contMap "+outcome+"->"+outcome.substring(CONT.length())); contTypeMap.put(outcome, outcome.substring(CONT.length())); } } topStartIndex = buildModel.getIndex(TOP_START); completeIndex = checkModel.getIndex(COMPLETE); incompleteIndex = checkModel.getIndex(INCOMPLETE); } /** * Specifies whether the parser should report when it was unable to find a parse for * a particular sentence. * @param errorReporting If true then un-parsed sentences are reported, false otherwise. */ public void setErrorReporting(boolean errorReporting) { this.reportFailedParse = errorReporting; } /** * Returns the specified number of parses or fewer for the specified tokens. <br> * <b>Note:</b> The nodes within * the returned parses are shared with other parses and therefore their parent node references will not be consistent * with their child node reference. {@link #setParents setParents} can be used to make the parents consistent * with a partuicular parse, but subsequent calls to <code>setParents</code> can invalidate the results of earlier * calls.<br> * @param tokens A parse containing the tokens with a single parent node. * @param numParses The number of parses desired. * @return the specified number of parses for the specified tokens. */ public Parse[] parse(Parse tokens, int numParses) { if (createDerivationString) tokens.setDerivation(new StringBuffer(100)); odh.clear(); ndh.clear(); parses.clear(); int i = 0; //derivation length int maxDerivationLength = 2 * tokens.getChildCount() + 3; odh.add(tokens); Parse guess = null; double bestComplete = -100000; //approximating -infinity/0 in ln domain while (parses.size() < M && i < maxDerivationLength) { ndh = new TreeSet(); if (odh.size() > 0) { int j = 0; for (Iterator pi = odh.iterator(); pi.hasNext() && j < K; j++) { // foearch derivation Parse tp = (Parse) pi.next(); if (tp.getProb() < bestComplete) { //this parse and the ones which follow will never win, stop advancing. break; } if (guess == null && i == 2) { guess = tp; } if (debugOn) { System.out.print(i + " " + j + " "+tp.getProb()); tp.show(); System.out.println(); } Parse[] nd = null; if (0 == i) { nd = advanceTags(tp); } else if (1 == i) { if (ndh.size() < K) { //System.err.println("advancing ts "+j+" "+ndh.size()+" < "+K); nd = advanceChunks(tp,bestComplete); } else { //System.err.println("advancing ts "+j+" prob="+((Parse) ndh.last()).getProb()); nd = advanceChunks(tp,((Parse) ndh.last()).getProb()); } } else { // i > 1 nd = advanceParses(tp, Q); } if (nd != null) { for (int k = 0, kl = nd.length; k < kl; k++) { if (nd[k].complete()) { advanceTop(nd[k]); if (nd[k].getProb() > bestComplete) { bestComplete = nd[k].getProb(); } parses.add(nd[k]); } else { ndh.add(nd[k]); } } } else { if (reportFailedParse) { System.err.println("Couldn't advance parse "+i+" stage "+j+"!\n"); } advanceTop(tp); parses.add(tp); } } i++; odh = ndh; } else { break; } } if (parses.size() == 0) { if (reportFailedParse) System.err.println("Couldn't find parse for: " + tokens); //Parse r = (Parse) odh.first(); //r.show(); //System.out.println(); return new Parse[] {guess}; } else if (numParses == 1){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -