// MaximumEntropyParser.cs
else if (parseCount == 1)
{
//RN added parent adjustment
Parse topParse = (Parse) mParses.First();
topParse.UpdateChildParents();
return new Parse[] {topParse};
}
else
{
ArrayList topParses = new ArrayList(parseCount);
while(!mParses.IsEmpty() && topParses.Count < parseCount)
{
Parse topParse = (Parse) mParses.First();
//RN added parent adjustment
topParse.UpdateChildParents();
topParses.Add(topParse);
mParses.Remove(topParse);
}
return (Parse[]) topParses.ToArray(typeof(Parse));
}
}
///<summary>
///Promotes the specified parse to the top node, folding the build-model score for the
///top-start outcome and the check-model score for the complete outcome into the
///parse's probability.
///</summary>
///<param name="inputParse">
///The parse to label as the top node.
///</param>
private void AdvanceTop(Parse inputParse)
{
    Parse[] children = inputParse.GetChildren();
    // Score the top-start decision with the build model.
    mBuildModel.Evaluate(mBuildContextGenerator.GetContext(children, 0), mBuildProbabilities);
    inputParse.AddProbability(System.Math.Log(mBuildProbabilities[mTopStartIndex]));
    // Score the completeness of the top constituent with the check model.
    mCheckModel.Evaluate(mCheckContextGenerator.GetContext(children, TopNode, 0, 0), mCheckProbabilities);
    inputParse.AddProbability(System.Math.Log(mCheckProbabilities[mCompleteIndex]));
    inputParse.Type = TopNode;
}
///<summary>
///Advances the specified parse and returns an array of advanced parses whose probability
///accounts for more than the specified amount of probability mass, Q.
///</summary>
///<param name="inputParse">
///The parse to advance.
///</param>
///<param name="Q">
///The amount of probability mass that should be accounted for by the advanced parses.
///</param>
private Parse[] AdvanceParses(Parse inputParse, double Q)
{
    // A reduce (complete) or shift (incomplete) is only considered "likely" when its
    // check probability exceeds the remaining mass 1 - Q.
    double q = 1 - Q;
    Parse lastStartNode = null;   // The closest previous node which has been labeled as a start node.
    int lastStartIndex = -1;      // The index of the closest previous node which has been labeled as a start node.
    string lastStartType = null;  // The type of the closest previous node which has been labeled as a start node.
    int advanceNodeIndex;         // The index of the node which will be labeled in this iteration of advancing the parse.
    Parse advanceNode = null;     // The node which will be labeled in this iteration of advancing the parse.
    Parse[] children = inputParse.GetChildren();
    int nodeCount = children.Length;
    // Determine which node needs to be labeled and record the most recent start label.
    for (advanceNodeIndex = 0; advanceNodeIndex < nodeCount; advanceNodeIndex++)
    {
        advanceNode = children[advanceNodeIndex];
        if (advanceNode.Label == null)
        {
            break;
        }
        else if (mStartTypeMap.ContainsKey(advanceNode.Label))
        {
            lastStartType = (string) mStartTypeMap[advanceNode.Label];
            lastStartNode = advanceNode;
            lastStartIndex = advanceNodeIndex;
        }
    }
    ArrayList newParsesList = new ArrayList(mBuildModel.OutcomeCount);
    // Score every candidate build label for the node being advanced.
    mBuildModel.Evaluate(mBuildContextGenerator.GetContext(children, advanceNodeIndex), mBuildProbabilities);
    double buildProbabilitiesSum = 0;
    // Consume build outcomes best-first until Q probability mass is covered.
    while (buildProbabilitiesSum < Q)
    {
        // The largest unadvanced labeling.
        int highestBuildProbabilityIndex = 0;
        for (int probabilityIndex = 1; probabilityIndex < mBuildProbabilities.Length; probabilityIndex++)
        { //for each build outcome
            if (mBuildProbabilities[probabilityIndex] > mBuildProbabilities[highestBuildProbabilityIndex])
            {
                highestBuildProbabilityIndex = probabilityIndex;
            }
        }
        if (mBuildProbabilities[highestBuildProbabilityIndex] == 0)
        {
            break; // all outcomes exhausted
        }
        double highestBuildProbability = mBuildProbabilities[highestBuildProbabilityIndex];
        mBuildProbabilities[highestBuildProbabilityIndex] = 0; //zero out so new max can be found
        buildProbabilitiesSum += highestBuildProbability;
        string tag = mBuildModel.GetOutcomeName(highestBuildProbabilityIndex);
        if (highestBuildProbabilityIndex == mTopStartIndex)
        { // can't have top until complete
            continue;
        }
        if (mStartTypeMap.ContainsKey(tag))
        { //update last start
            lastStartIndex = advanceNodeIndex;
            lastStartNode = advanceNode;
            lastStartType = (string) mStartTypeMap[tag];
        }
        else if (mContinueTypeMap.ContainsKey(tag))
        {
            if (lastStartNode == null || lastStartType != (string) mContinueTypeMap[tag])
            {
                continue; //Cont must match previous start or continue
            }
        }
        Parse newParse1 = (Parse) inputParse.Clone(); //clone parse
        if (mCreateDerivationString)
        {
            newParse1.AppendDerivationBuffer(highestBuildProbabilityIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
            newParse1.AppendDerivationBuffer("-");
        }
        newParse1.SetChild(advanceNodeIndex, tag); //replace constituent labeled
        newParse1.AddProbability(System.Math.Log(highestBuildProbability));
        // Check model: is the constituent started at lastStartIndex complete here?
        mCheckModel.Evaluate(mCheckContextGenerator.GetContext(newParse1.GetChildren(), lastStartType, lastStartIndex, advanceNodeIndex), mCheckProbabilities);
        Parse newParse2 = newParse1;
        if (mCheckProbabilities[mCompleteIndex] > q)
        { //make sure a reduce is likely
            newParse2 = (Parse) newParse1.Clone();
            if (mCreateDerivationString)
            {
                newParse2.AppendDerivationBuffer("1");
                newParse2.AppendDerivationBuffer(".");
            }
            // Fix: use the named complete-outcome index instead of the hard-coded index 1;
            // the guard above already uses mCheckProbabilities[mCompleteIndex], and the
            // model's outcome ordering is not guaranteed.
            newParse2.AddProbability(System.Math.Log(mCheckProbabilities[mCompleteIndex]));
            Parse[] constituent = new Parse[advanceNodeIndex - lastStartIndex + 1];
            bool isFlat = true;
            //first
            constituent[0] = lastStartNode;
            if (constituent[0].Type != constituent[0].Head.Type)
            {
                isFlat = false;
            }
            //last
            constituent[advanceNodeIndex - lastStartIndex] = advanceNode;
            if (isFlat && constituent[advanceNodeIndex - lastStartIndex].Type != constituent[advanceNodeIndex - lastStartIndex].Head.Type)
            {
                isFlat = false;
            }
            //middle
            for (int constituentIndex = 1; constituentIndex < advanceNodeIndex - lastStartIndex; constituentIndex++)
            {
                constituent[constituentIndex] = children[constituentIndex + lastStartIndex];
                if (isFlat && constituent[constituentIndex].Type != constituent[constituentIndex].Head.Type)
                {
                    isFlat = false;
                }
            }
            if (!isFlat)
            { //flat chunks are done by chunker
                newParse2.Insert(new Parse(inputParse.Text, new Util.Span(lastStartNode.Span.Start, advanceNode.Span.End), lastStartType, mCheckProbabilities[mCompleteIndex], mHeadRules.GetHead(constituent, lastStartType)));
                newParsesList.Add(newParse2);
            }
        }
        if (mCheckProbabilities[mIncompleteIndex] > q)
        { //make sure a shift is likely
            if (mCreateDerivationString)
            {
                newParse1.AppendDerivationBuffer("0");
                newParse1.AppendDerivationBuffer(".");
            }
            if (advanceNodeIndex != nodeCount - 1)
            { //can't shift last element
                // Fix: use the named incomplete-outcome index instead of the hard-coded index 0.
                newParse1.AddProbability(System.Math.Log(mCheckProbabilities[mIncompleteIndex]));
                newParsesList.Add(newParse1);
            }
        }
    }
    Parse[] newParses = (Parse[]) newParsesList.ToArray(typeof(Parse));
    return newParses;
}
///<summary>
///Returns the top chunk sequences for the specified parse.
///</summary>
///<param name="inputParse">
///A pos-tag assigned parse.
///</param>
/// <param name="minChunkScore">
/// The minimum probability for an allowed chunk sequence.
/// </param>
///<returns>
///The top chunk assignments to the specified parse.
///</returns>
private Parse[] AdvanceChunks(Parse inputParse, double minChunkScore)
{
// chunk
Parse[] children = inputParse.GetChildren();
string[] words = new string[children.Length];
string[] parseTags = new string[words.Length];
double[] probabilities = new double[words.Length];
Parse currentChildParse = null;
// Collect each child's head word and POS type as the chunker's input.
for (int childParseIndex = 0, childParseCount = children.Length; childParseIndex < childParseCount; childParseIndex++)
{
currentChildParse = children[childParseIndex];
words[childParseIndex] = currentChildParse.Head.ToString();
parseTags[childParseIndex] = currentChildParse.Type;
}
// The score cutoff passed to the chunker is relative to the probability this
// parse has already accumulated.
Util.Sequence[] chunkerSequences = mBasalChunker.TopKSequences(words, parseTags, minChunkScore - inputParse.Probability);
Parse[] newParses = new Parse[chunkerSequences.Length];
// Build one cloned parse per returned chunk-tag sequence.
for (int sequenceIndex = 0, sequenceCount = chunkerSequences.Length; sequenceIndex < sequenceCount; sequenceIndex++)
{
newParses[sequenceIndex] = (Parse) inputParse.Clone(); //copies top level
if (mCreateDerivationString)
{
newParses[sequenceIndex].AppendDerivationBuffer(sequenceIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
newParses[sequenceIndex].AppendDerivationBuffer(".");
}
string[] tags = (string[]) chunkerSequences[sequenceIndex].Outcomes.ToArray(typeof(string));
chunkerSequences[sequenceIndex].GetProbabilities(probabilities);
// start/end delimit the child span of the chunk currently being assembled;
// type is its label, or null when no chunk is open.
int start = -1;
int end = 0;
string type = null;
// NOTE: tagIndex deliberately runs one past the last tag (<= tags.Length) so the
// final open constituent is flushed on the extra iteration.
for (int tagIndex = 0; tagIndex <= tags.Length; tagIndex++)
{
if (tagIndex != tags.Length)
{
// Fold this chunk-tag's probability into the parse score.
newParses[sequenceIndex].AddProbability(System.Math.Log(probabilities[tagIndex]));
}
if (tagIndex != tags.Length && tags[tagIndex].StartsWith(ContinuePrefix))
{ // if continue just update end chunking tag don't use mContinueTypeMap
end = tagIndex;
}
else
{ //make previous constituent if it exists
if (type != null)
{
Parse startParse = children[start];
Parse endParse = children[end];
Parse[] consitituents = new Parse[end - start + 1];
consitituents[0] = startParse;
if (end - start != 0)
{
consitituents[end - start] = endParse;
// copy the interior children of the chunk span
for (int constituentIndex = 1; constituentIndex < end - start; constituentIndex++)
{
consitituents[constituentIndex] = children[constituentIndex + start];
}
}
// Insert the finished chunk constituent; probability 1 because the
// individual chunk-tag probabilities were already added above.
newParses[sequenceIndex].Insert(new Parse(startParse.Text, new Util.Span(startParse.Span.Start, endParse.Span.End), type, 1, mHeadRules.GetHead(consitituents, type)));
}
if (tagIndex != tags.Length)
{ //update for new constituent
if (tags[tagIndex].StartsWith(StartPrefix))
{ // don't use mStartTypeMap these are chunk tags
type = tags[tagIndex].Substring(StartPrefix.Length);
start = tagIndex;
end = tagIndex;
}
else
{ // other: token is outside any chunk
type = null;
}
}
}
}
}
return newParses;
}
///<summary>
///Advances the parse by assigning it POS tags and returns multiple tag sequences.
///</summary>
///<param name="inputParse">
///The parse to be tagged.
///</param>
///<returns>
///Parses with different pos-tag sequence assignments.
///</returns>
private Parse[] AdvanceTags(Parse inputParse)
{
    Parse[] tokens = inputParse.GetChildren();
    string[] tokenTexts = new string[tokens.Length];
    for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
    {
        tokenTexts[tokenIndex] = tokens[tokenIndex].ToString();
    }
    double[] tagProbabilities = new double[tokenTexts.Length];
    // Ask the POS tagger for its best K tag sequences over the token strings.
    Util.Sequence[] tagSequences = mPosTagger.TopKSequences(tokenTexts);
    if (tagSequences.Length == 0)
    {
        System.Console.Error.WriteLine("no tag sequence");
    }
    Parse[] taggedParses = new Parse[tagSequences.Length];
    // Produce one cloned parse per tag sequence, with tag nodes inserted over each token.
    for (int sequenceIndex = 0; sequenceIndex < tagSequences.Length; sequenceIndex++)
    {
        string[] tags = (string[]) tagSequences[sequenceIndex].Outcomes.ToArray(typeof(string));
        tagSequences[sequenceIndex].GetProbabilities(tagProbabilities);
        taggedParses[sequenceIndex] = (Parse) inputParse.Clone(); //copies top level
        if (mCreateDerivationString)
        {
            taggedParses[sequenceIndex].AppendDerivationBuffer(sequenceIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
            taggedParses[sequenceIndex].AppendDerivationBuffer(".");
        }
        for (int tokenIndex = 0; tokenIndex < tokenTexts.Length; tokenIndex++)
        {
            Parse tokenParse = tokens[tokenIndex];
            double tagProbability = tagProbabilities[tokenIndex];
            // Insert the tag constituent over the token and fold its score into the parse.
            taggedParses[sequenceIndex].Insert(new Parse(tokenParse.Text, tokenParse.Span, tags[tokenIndex], tagProbability));
            taggedParses[sequenceIndex].AddProbability(System.Math.Log(tagProbability));
        }
    }
    return taggedParses;
}
///<summary>
///Trains a GIS model from the supplied event reader.
///</summary>
///<param name="eventStream">Reader producing the training events.</param>
///<param name="iterations">Number of GIS training iterations.</param>
///<param name="cut">Feature-count cutoff passed to the data indexer.</param>
///<returns>The trained GIS model.</returns>
private static SharpEntropy.GisModel Train(SharpEntropy.ITrainingEventReader eventStream, int iterations, int cut)
{
    SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer();
    SharpEntropy.TwoPassDataIndexer indexer = new SharpEntropy.TwoPassDataIndexer(eventStream, cut);
    trainer.TrainModel(iterations, indexer);
    return new SharpEntropy.GisModel(trainer);
}
///<summary>
///Trains a parser model from a training file using the default settings
///(100 iterations, cutoff of 5).
///</summary>
///<param name="trainingFile">Path to the training data file.</param>
///<param name="modelType">Which parser model to train events for.</param>
///<param name="headRulesFile">Path to the head-rules file.</param>
///<returns>The trained GIS model.</returns>
public static SharpEntropy.GisModel TrainModel(string trainingFile, EventType modelType, string headRulesFile)
{
    // Default GIS training configuration.
    const int defaultIterations = 100;
    const int defaultCutoff = 5;
    return TrainModel(trainingFile, modelType, headRulesFile, defaultIterations, defaultCutoff);
}
///<summary>
///Trains a parser model from a training file with explicit training settings.
///</summary>
///<param name="trainingFile">Path to the training data file.</param>
///<param name="modelType">Which parser model to train events for.</param>
///<param name="headRulesFile">Path to the head-rules file.</param>
///<param name="iterations">Number of GIS training iterations.</param>
///<param name="cutoff">Feature-count cutoff for the data indexer.</param>
///<returns>The trained GIS model.</returns>
public static SharpEntropy.GisModel TrainModel(string trainingFile, EventType modelType, string headRulesFile, int iterations, int cutoff)
{
    EnglishHeadRules headRules = new EnglishHeadRules(headRulesFile);
    System.IO.StreamReader trainingReader = new System.IO.StreamReader(trainingFile);
    SharpEntropy.PlainTextByLineDataReader dataReader = new SharpEntropy.PlainTextByLineDataReader(trainingReader);
    SharpEntropy.ITrainingEventReader eventReader = new ParserEventReader(dataReader, headRules, modelType);
    return Train(eventReader, iterations, cutoff);
}
}
}