📄 maximumentropyparser.cs
字号:
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//This file is based on the ParserME.java source file found in the
//original java implementation of OpenNLP. That source file contains the following header:
//Copyright (C) 2003 Thomas Morton
//
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
//
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU Lesser General Public License for more details.
//
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
using System;
using System.Collections;
using System.Text;
namespace OpenNLP.Tools.Parser
{
/// <summary>
/// Class for a shift reduce style parser based on Adwait Ratnaparki's 1998 thesis.
/// </summary>
public class MaximumEntropyParser
{
/// <summary>
/// The maximum number of parses advanced from all preceding parses at each derivation step.
/// </summary>
private int M;
///<summary>
///The maximum number of parses to advance from a single preceding parse.
///</summary>
private int K;
///<summary>
///The minimum total probability mass of advanced outcomes.
///</summary>
private double Q;
///<summary>
///The default beam size used if no beam size is given.
///</summary>
public const int DefaultBeamSize = 20;
///<summary>
///The default amount of probability mass required of advanced outcomes.
///</summary>
public const double DefaultAdvancePercentage = 0.95;
///<summary>
///Completed parses.
///</summary>
private Util.SortedSet mParses;
///<summary>
///Incomplete parses which will be advanced.
///</summary>
private Util.SortedSet mOldDerivationsHeap;
///<summary>
///Incomplete parses which have been advanced.
///</summary>
private Util.SortedSet mNewDerivationsHeap;
private IParserTagger mPosTagger;
private IParserChunker mBasalChunker;
private SharpEntropy.IMaximumEntropyModel mBuildModel;
private SharpEntropy.IMaximumEntropyModel mCheckModel;
private BuildContextGenerator mBuildContextGenerator;
private CheckContextGenerator mCheckContextGenerator;
private IHeadRules mHeadRules;
private double[] mBuildProbabilities;
private double[] mCheckProbabilities;
public const string TopNode = "TOP";
public const string TokenNode = "TK";
public const int Zero = 0;
/// <summary>
/// Prefix for outcomes starting a constituent.
/// </summary>
public const string StartPrefix = "S-";
/// <summary>
/// Prefix for outcomes continuing a constituent.
/// </summary>
public const string ContinuePrefix = "C-";
/// <summary>
/// Outcome for token which is not contained in a basal constituent.
/// </summary>
public const string OtherOutcome = "O";
/// <summary>
/// Outcome used when a constituent is complete.
/// </summary>
public const string CompleteOutcome = "c";
/// <summary>
/// Outcome used when a constituent is incomplete.
/// </summary>
public const string IncompleteOutcome = "i";
private const string mTopStart = StartPrefix + TopNode;
private int mTopStartIndex;
private Hashtable mStartTypeMap;
private Hashtable mContinueTypeMap;
private int mCompleteIndex;
private int mIncompleteIndex;
private bool mCreateDerivationString = false;
///<summary>
///Creates a new parser using the specified models and head rules.
///</summary>
///<param name="buildModel">
///The model to assign constituent labels.
///</param>
///<param name="checkModel">
///The model to determine a constituent is complete.
///</param>
///<param name="tagger">
///The model to assign pos-tags.
///</param>
///<param name="chunker">
///The model to assign flat constituent labels.
///</param>
///<param name="headRules">
///The head rules for head word perculation.
///</param>
public MaximumEntropyParser(SharpEntropy.IMaximumEntropyModel buildModel, SharpEntropy.IMaximumEntropyModel checkModel, IParserTagger tagger, IParserChunker chunker, IHeadRules headRules) : this(buildModel, checkModel, tagger, chunker, headRules, DefaultBeamSize, DefaultAdvancePercentage)
{}
///<summary>
///Creates a new parser using the specified models and head rules using the specified beam size and advance percentage.
///</summary>
///<param name="buildModel">
///The model to assign constituent labels.
///</param>
///<param name="checkModel">
///The model to determine a constituent is complete.
///</param>
///<param name="tagger">
///The model to assign pos-tags.
///</param>
///<param name="chunker">
///The model to assign flat constituent labels.
///</param>
///<param name="headRules">
///The head rules for head word perculation.
///</param>
///<param name="beamSize">
///The number of different parses kept during parsing.
///</param>
///<param name="advancePercentage">
///The minimal amount of probability mass which advanced outcomes must represent.
///Only outcomes which contribute to the top "advancePercentage" will be explored.
///</param>
public MaximumEntropyParser(SharpEntropy.IMaximumEntropyModel buildModel, SharpEntropy.IMaximumEntropyModel checkModel, IParserTagger tagger, IParserChunker chunker, IHeadRules headRules, int beamSize, double advancePercentage)
{
mPosTagger = tagger;
mBasalChunker = chunker;
mBuildModel = buildModel;
mCheckModel = checkModel;
M = beamSize;
K = beamSize;
Q = advancePercentage;
mBuildProbabilities = new double[mBuildModel.OutcomeCount];
mCheckProbabilities = new double[mCheckModel.OutcomeCount];
mBuildContextGenerator = new BuildContextGenerator();
mCheckContextGenerator = new CheckContextGenerator();
mHeadRules = headRules;
mOldDerivationsHeap = new Util.TreeSet();
mNewDerivationsHeap = new Util.TreeSet();
mParses = new Util.TreeSet();
mStartTypeMap = new Hashtable();
mContinueTypeMap = new Hashtable();
for (int buildOutcomeIndex = 0, buildOutcomeCount = buildModel.OutcomeCount; buildOutcomeIndex < buildOutcomeCount; buildOutcomeIndex++)
{
string outcome = buildModel.GetOutcomeName(buildOutcomeIndex);
if (outcome.StartsWith(StartPrefix))
{
//System.Console.Error.WriteLine("startMap " + outcome + "->" + outcome.Substring(StartPrefix.Length));
mStartTypeMap.Add(outcome, outcome.Substring(StartPrefix.Length));
}
else if (outcome.StartsWith(ContinuePrefix))
{
//System.Console.Error.WriteLine("contMap " + outcome + "->" + outcome.Substring(ContinuePrefix.Length));
mContinueTypeMap.Add(outcome, outcome.Substring(ContinuePrefix.Length));
}
}
mTopStartIndex = buildModel.GetOutcomeIndex(mTopStart);
mCompleteIndex = checkModel.GetOutcomeIndex(CompleteOutcome);
mIncompleteIndex = checkModel.GetOutcomeIndex(IncompleteOutcome);
}
/// <summary>
/// Returns a parse for the specified parse of tokens.
/// </summary>
/// <param name="flatParse">
/// A flat parse containing only tokens and a root node, p.
/// </param>
/// <param name="parseCount">
/// the number of parses required
/// </param>
/// <returns>
/// A full parse of the specified tokens or the flat chunks of the tokens if a full parse could not be found.
/// </returns>
public virtual Parse[] FullParse(Parse flatParse, int parseCount)
{
if (mCreateDerivationString)
{
flatParse.InitializeDerivationBuffer();
}
mOldDerivationsHeap.Clear();
mNewDerivationsHeap.Clear();
mParses.Clear();
int derivationLength = 0;
int maxDerivationLength = 2 * flatParse.ChildCount + 3;
mOldDerivationsHeap.Add(flatParse);
Parse guessParse = null;
double bestComplete = - 100000; //approximating -infinity/0 in ln domain
while (mParses.Count < M && derivationLength < maxDerivationLength)
{
mNewDerivationsHeap = new Util.TreeSet();
if (mOldDerivationsHeap.Count > 0)
{
int derivationsProcessed = 0;
foreach (Parse currentParse in mOldDerivationsHeap)
//for (System.Collections.IEnumerator pi = mOldDerivationsHeap.GetEnumerator(); pi.MoveNext() && derivationsProcessed < K; derivationsProcessed++)
{
derivationsProcessed++;
if (derivationsProcessed >= K)
{
break;
}
// for each derivation
//Parse currentParse = (Parse) pi.Current;
if (currentParse.Probability < bestComplete) //this parse and the ones which follow will never win, stop advancing.
{
break;
}
if (guessParse == null && derivationLength == 2)
{
guessParse = currentParse;
}
//System.Console.Out.Write(derivationLength + " " + derivationsProcessed + " "+currentParse.Probability);
//System.Console.Out.Write(currentParse.Show());
//System.Console.Out.WriteLine();
Parse[] newDerivations = null;
if (0 == derivationLength)
{
newDerivations = AdvanceTags(currentParse);
}
else if (1 == derivationLength)
{
if (mNewDerivationsHeap.Count < K)
{
//System.Console.Error.WriteLine("advancing ts " + derivationsProcessed + " " + mNewDerivationsHeap.Count + " < " + K);
newDerivations = AdvanceChunks(currentParse, bestComplete);
}
else
{
//System.Console.Error.WriteLine("advancing ts " + derivationsProcessed + " prob=" + ((Parse) mNewDerivationsHeap.Last()).Probability);
newDerivations = AdvanceChunks(currentParse,((Parse) mNewDerivationsHeap.Last()).Probability);
}
}
else
{ // derivationLength > 1
newDerivations = AdvanceParses(currentParse, Q);
}
if (newDerivations != null)
{
for (int currentDerivation = 0, derivationCount = newDerivations.Length; currentDerivation < derivationCount; currentDerivation++)
{
//System.out.println("currentDerivation="+currentDerivation+" of "+newDerivations.length);
if (newDerivations[currentDerivation].IsComplete)
{
AdvanceTop(newDerivations[currentDerivation]);
if (newDerivations[currentDerivation].Probability > bestComplete)
{
bestComplete = newDerivations[currentDerivation].Probability;
}
mParses.Add(newDerivations[currentDerivation]);
}
else
{
mNewDerivationsHeap.Add(newDerivations[currentDerivation]);
}
}
//RN added sort
mNewDerivationsHeap.Sort();
}
else
{
System.Console.Error.WriteLine("Couldn't advance parse " + derivationLength + " stage " + derivationsProcessed + "!\n");
}
}
derivationLength++;
mOldDerivationsHeap = mNewDerivationsHeap;
}
else
{
break;
}
}
//RN added sort
mParses.Sort();
if (mParses.Count == 0)
{
System.Console.Error.WriteLine("Couldn't find parse for: " + flatParse);
//oFullParse = (Parse) mOldDerivationsHeap.First();
return new Parse[] {guessParse};
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -