📄 maximumentropyparser.cs

📁 英语句子自然语言处理统计分析例子 Statistical parsing of English sentences Shows how to generate parse trees for
💻 CS
📖 第 1 页 / 共 2 页
字号:
12 下一页
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the ParserME.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
// 
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
// 
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Lesser General Public License for more details.
// 
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

using System;
using System.Collections;
using System.Text;

namespace OpenNLP.Tools.Parser
{
	/// <summary>
	/// Class for a shift reduce style parser based on Adwait Ratnaparki's 1998 thesis.
	/// </summary>
	public class MaximumEntropyParser
	{
		/// <summary>
		/// The maximum number of parses advanced from all preceding parses at each derivation step.
		/// </summary>
		private int M;

		///<summary>
		///The maximum number of parses to advance from a single preceding parse.
		///</summary>
		private int K;

		///<summary>
		///The minimum total probability mass of advanced outcomes.
		///</summary>
		private double Q;

		///<summary>
		///The default beam size used if no beam size is given.
		///</summary>
		public const int DefaultBeamSize = 20;

		///<summary>
		///The default amount of probability mass required of advanced outcomes.
		///</summary>
		public const double DefaultAdvancePercentage = 0.95;
		
		///<summary>
		///Completed parses.
		///</summary>
		private Util.SortedSet mParses;
		
		///<summary>
		///Incomplete parses which will be advanced.
		///</summary>
		private Util.SortedSet mOldDerivationsHeap;

		///<summary>
		///Incomplete parses which have been advanced.
		///</summary>
		private Util.SortedSet mNewDerivationsHeap;

		private IParserTagger mPosTagger; 
		private IParserChunker mBasalChunker; 
		
		private SharpEntropy.IMaximumEntropyModel mBuildModel;
		private SharpEntropy.IMaximumEntropyModel mCheckModel;
		
		private BuildContextGenerator mBuildContextGenerator;
		private CheckContextGenerator mCheckContextGenerator;
		
		private IHeadRules mHeadRules;
		
		private double[] mBuildProbabilities;
		private double[] mCheckProbabilities;
		
		public const string TopNode = "TOP";
		public const string TokenNode = "TK";
		
		public const int Zero = 0;
		
		/// <summary>
		/// Prefix for outcomes starting a constituent.
		/// </summary>
		public const string StartPrefix = "S-";

		/// <summary>
		/// Prefix for outcomes continuing a constituent.
		/// </summary>
		public const string ContinuePrefix = "C-";

		/// <summary>
		/// Outcome for token which is not contained in a basal constituent.
		/// </summary>
		public const string OtherOutcome = "O";
		
		/// <summary>
		/// Outcome used when a constituent is complete.
		/// </summary>
		public const string CompleteOutcome = "c";

		/// <summary>
		/// Outcome used when a constituent is incomplete.
		/// </summary>
		public const string IncompleteOutcome = "i";
		
		private const string mTopStart = StartPrefix + TopNode;

		private int mTopStartIndex;
		private Hashtable mStartTypeMap;
		private Hashtable mContinueTypeMap;
  
		private int mCompleteIndex;
		private int mIncompleteIndex;
  
		private bool mCreateDerivationString = false;

		///<summary>
		///Creates a new parser using the specified models and head rules.
		///</summary>
		///<param name="buildModel">
		///The model to assign constituent labels.
		///</param>
		///<param name="checkModel">
		///The model to determine a constituent is complete.
		///</param>
		///<param name="tagger">
		///The model to assign pos-tags.
		///</param>
		///<param name="chunker">
		///The model to assign flat constituent labels.
		///</param>
		///<param name="headRules">
		///The head rules for head word perculation.
		///</param>
		public MaximumEntropyParser(SharpEntropy.IMaximumEntropyModel buildModel, SharpEntropy.IMaximumEntropyModel checkModel, IParserTagger tagger, IParserChunker chunker, IHeadRules headRules) : this(buildModel, checkModel, tagger, chunker, headRules, DefaultBeamSize, DefaultAdvancePercentage)
		{}

		///<summary>
		///Creates a new parser using the specified models and head rules using the specified beam size and advance percentage.
		///</summary>
		///<param name="buildModel">
		///The model to assign constituent labels.
		///</param>
		///<param name="checkModel">
		///The model to determine a constituent is complete.
		///</param>
		///<param name="tagger">
		///The model to assign pos-tags.
		///</param>
		///<param name="chunker">
		///The model to assign flat constituent labels.
		///</param>
		///<param name="headRules">
		///The head rules for head word perculation.
		///</param>
		///<param name="beamSize">
		///The number of different parses kept during parsing.
		///</param>
		///<param name="advancePercentage">
		///The minimal amount of probability mass which advanced outcomes must represent.
		///Only outcomes which contribute to the top "advancePercentage" will be explored.
		///</param>    
		public MaximumEntropyParser(SharpEntropy.IMaximumEntropyModel buildModel, SharpEntropy.IMaximumEntropyModel checkModel, IParserTagger tagger, IParserChunker chunker, IHeadRules headRules, int beamSize, double advancePercentage) 
		{
			mPosTagger = tagger;
			mBasalChunker = chunker;
			mBuildModel = buildModel;
			mCheckModel = checkModel;
			M = beamSize;
			K = beamSize;
			Q = advancePercentage;

			mBuildProbabilities = new double[mBuildModel.OutcomeCount];
			mCheckProbabilities = new double[mCheckModel.OutcomeCount];
			mBuildContextGenerator = new BuildContextGenerator();
			mCheckContextGenerator = new CheckContextGenerator();
			mHeadRules = headRules;
			mOldDerivationsHeap = new Util.TreeSet();
			mNewDerivationsHeap = new Util.TreeSet();
			mParses = new Util.TreeSet();

			mStartTypeMap = new Hashtable();
			mContinueTypeMap = new Hashtable();
			for (int buildOutcomeIndex = 0, buildOutcomeCount = buildModel.OutcomeCount; buildOutcomeIndex < buildOutcomeCount; buildOutcomeIndex++) 
			{
				string outcome = buildModel.GetOutcomeName(buildOutcomeIndex);
				if (outcome.StartsWith(StartPrefix)) 
				{
					//System.Console.Error.WriteLine("startMap " + outcome + "->" + outcome.Substring(StartPrefix.Length));
					mStartTypeMap.Add(outcome, outcome.Substring(StartPrefix.Length));
				}
				else if (outcome.StartsWith(ContinuePrefix)) 
				{
					//System.Console.Error.WriteLine("contMap " + outcome + "->" + outcome.Substring(ContinuePrefix.Length));
					mContinueTypeMap.Add(outcome, outcome.Substring(ContinuePrefix.Length));
				}
			}
			mTopStartIndex = buildModel.GetOutcomeIndex(mTopStart);
			mCompleteIndex = checkModel.GetOutcomeIndex(CompleteOutcome);
			mIncompleteIndex = checkModel.GetOutcomeIndex(IncompleteOutcome);
		}
		
		/// <summary>
		/// Returns a parse for the specified parse of tokens.
		/// </summary>
		/// <param name="flatParse">
		/// A flat parse containing only tokens and a root node, p. 
		/// </param>
		/// <param name="parseCount">
		/// the number of parses required
		/// </param>
		/// <returns>
		/// A full parse of the specified tokens or the flat chunks of the tokens if a full parse could not be found.
		/// </returns>
		public virtual Parse[] FullParse(Parse flatParse, int parseCount)
		{
			if (mCreateDerivationString) 
			{
				flatParse.InitializeDerivationBuffer();
			}
			mOldDerivationsHeap.Clear();
			mNewDerivationsHeap.Clear();
			mParses.Clear();
			int derivationLength = 0; 
			int maxDerivationLength = 2 * flatParse.ChildCount + 3;
			mOldDerivationsHeap.Add(flatParse);
			Parse guessParse = null;
			double bestComplete = - 100000; //approximating -infinity/0 in ln domain
			while (mParses.Count < M && derivationLength < maxDerivationLength)
			{
				mNewDerivationsHeap = new Util.TreeSet();
				if (mOldDerivationsHeap.Count > 0)
				{
					int derivationsProcessed = 0;

					foreach (Parse currentParse in mOldDerivationsHeap)
						//for (System.Collections.IEnumerator pi = mOldDerivationsHeap.GetEnumerator(); pi.MoveNext() && derivationsProcessed < K; derivationsProcessed++)
					{
						derivationsProcessed++;
						if (derivationsProcessed >= K) 
						{
							break;
						}

						// for each derivation
						//Parse currentParse = (Parse) pi.Current;
						if (currentParse.Probability < bestComplete)  //this parse and the ones which follow will never win, stop advancing.
						{
							break;
						}
						if (guessParse == null && derivationLength == 2)
						{
							guessParse = currentParse;
						}

						//System.Console.Out.Write(derivationLength + " " + derivationsProcessed + " "+currentParse.Probability);
						//System.Console.Out.Write(currentParse.Show());
						//System.Console.Out.WriteLine();

						Parse[] newDerivations = null;
						if (0 == derivationLength) 
						{
							newDerivations = AdvanceTags(currentParse);
						}
						else if (1 == derivationLength) 
						{
							if (mNewDerivationsHeap.Count < K) 
							{
								//System.Console.Error.WriteLine("advancing ts " + derivationsProcessed + " " + mNewDerivationsHeap.Count + " < " + K);
								newDerivations = AdvanceChunks(currentParse, bestComplete);
							}
							else 
							{
								//System.Console.Error.WriteLine("advancing ts " + derivationsProcessed + " prob=" + ((Parse) mNewDerivationsHeap.Last()).Probability);
								newDerivations = AdvanceChunks(currentParse,((Parse) mNewDerivationsHeap.Last()).Probability);
							}
						}
						else 
						{ // derivationLength > 1
							newDerivations = AdvanceParses(currentParse, Q);
						}

						if (newDerivations != null)
						{
							for (int currentDerivation = 0, derivationCount = newDerivations.Length; currentDerivation < derivationCount; currentDerivation++)
							{
								//System.out.println("currentDerivation="+currentDerivation+" of "+newDerivations.length);
								if (newDerivations[currentDerivation].IsComplete)
								{
									AdvanceTop(newDerivations[currentDerivation]);
									if (newDerivations[currentDerivation].Probability > bestComplete)
									{
										bestComplete = newDerivations[currentDerivation].Probability;
									}
									mParses.Add(newDerivations[currentDerivation]);
									
								}
								else
								{
									mNewDerivationsHeap.Add(newDerivations[currentDerivation]);
								}
							}
							//RN added sort
							mNewDerivationsHeap.Sort();
						}
						else
						{
							System.Console.Error.WriteLine("Couldn't advance parse " + derivationLength + " stage " + derivationsProcessed + "!\n");
						}
					}
					derivationLength++;
					mOldDerivationsHeap = mNewDerivationsHeap;
				}
				else
				{
					break;
				}
			}
		
			//RN added sort
			mParses.Sort();
			
			if (mParses.Count == 0)
			{
				System.Console.Error.WriteLine("Couldn't find parse for: " + flatParse);
				//oFullParse = (Parse) mOldDerivationsHeap.First(); 
				return new Parse[] {guessParse};
			}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -