// MaximumEntropyParser.cs
else if (parseCount == 1)
{
//RN added parent adjustment
Parse topParse = (Parse) mParses.First();
topParse.UpdateChildParents();
return new Parse[] {topParse};
}
else
{
ArrayList topParses = new ArrayList(parseCount);
while(!mParses.IsEmpty() && topParses.Count < parseCount)
{
Parse topParse = (Parse) mParses.First();
//RN added parent adjustment
topParse.UpdateChildParents();
topParses.Add(topParse);
mParses.Remove(topParse);
}
return (Parse[]) topParses.ToArray(typeof(Parse));
}
}
///<summary>
///Promotes the specified parse to the top node, folding the build-model score for the
///top-start outcome and the check-model score for the complete outcome into the
///parse's probability.
///</summary>
///<param name="inputParse">
///The parse to label as the top node.
///</param>
private void AdvanceTop(Parse inputParse)
{
    Parse[] children = inputParse.GetChildren();
    // Score the top-start decision with the build model.
    mBuildModel.Evaluate(mBuildContextGenerator.GetContext(children, 0), mBuildProbabilities);
    inputParse.AddProbability(System.Math.Log(mBuildProbabilities[mTopStartIndex]));
    // Score the completeness of the top constituent with the check model.
    mCheckModel.Evaluate(mCheckContextGenerator.GetContext(children, TopNode, 0, 0), mCheckProbabilities);
    inputParse.AddProbability(System.Math.Log(mCheckProbabilities[mCompleteIndex]));
    inputParse.Type = TopNode;
}
///<summary>
///Advances the specified parse and returns an array of advanced parses whose probability
///accounts for more than the specified amount of probability mass, Q.
///</summary>
///<param name="inputParse">
///The parse to advance.
///</param>
///<param name="Q">
///The amount of probability mass that should be accounted for by the advanced parses.
///</param>
private Parse[] AdvanceParses(Parse inputParse, double Q)
{
    // A reduce (complete) or shift (incomplete) is only considered "likely" when its
    // check probability exceeds the remaining mass 1 - Q.
    double q = 1 - Q;
    Parse lastStartNode = null;   // The closest previous node which has been labeled as a start node.
    int lastStartIndex = -1;      // The index of the closest previous node which has been labeled as a start node.
    string lastStartType = null;  // The type of the closest previous node which has been labeled as a start node.
    int advanceNodeIndex;         // The index of the node which will be labeled in this iteration of advancing the parse.
    Parse advanceNode = null;     // The node which will be labeled in this iteration of advancing the parse.
    Parse[] children = inputParse.GetChildren();
    int nodeCount = children.Length;
    // Determine which node needs to be labeled and record the most recent start label.
    for (advanceNodeIndex = 0; advanceNodeIndex < nodeCount; advanceNodeIndex++)
    {
        advanceNode = children[advanceNodeIndex];
        if (advanceNode.Label == null)
        {
            break;
        }
        else if (mStartTypeMap.ContainsKey(advanceNode.Label))
        {
            lastStartType = (string) mStartTypeMap[advanceNode.Label];
            lastStartNode = advanceNode;
            lastStartIndex = advanceNodeIndex;
        }
    }
    ArrayList newParsesList = new ArrayList(mBuildModel.OutcomeCount);
    // Score every candidate build label for the node being advanced.
    mBuildModel.Evaluate(mBuildContextGenerator.GetContext(children, advanceNodeIndex), mBuildProbabilities);
    double buildProbabilitiesSum = 0;
    // Consume build outcomes best-first until Q probability mass is covered.
    while (buildProbabilitiesSum < Q)
    {
        // The largest unadvanced labeling.
        int highestBuildProbabilityIndex = 0;
        for (int probabilityIndex = 1; probabilityIndex < mBuildProbabilities.Length; probabilityIndex++)
        { //for each build outcome
            if (mBuildProbabilities[probabilityIndex] > mBuildProbabilities[highestBuildProbabilityIndex])
            {
                highestBuildProbabilityIndex = probabilityIndex;
            }
        }
        if (mBuildProbabilities[highestBuildProbabilityIndex] == 0)
        {
            break; // all outcomes exhausted
        }
        double highestBuildProbability = mBuildProbabilities[highestBuildProbabilityIndex];
        mBuildProbabilities[highestBuildProbabilityIndex] = 0; //zero out so new max can be found
        buildProbabilitiesSum += highestBuildProbability;
        string tag = mBuildModel.GetOutcomeName(highestBuildProbabilityIndex);
        if (highestBuildProbabilityIndex == mTopStartIndex)
        { // can't have top until complete
            continue;
        }
        if (mStartTypeMap.ContainsKey(tag))
        { //update last start
            lastStartIndex = advanceNodeIndex;
            lastStartNode = advanceNode;
            lastStartType = (string) mStartTypeMap[tag];
        }
        else if (mContinueTypeMap.ContainsKey(tag))
        {
            if (lastStartNode == null || lastStartType != (string) mContinueTypeMap[tag])
            {
                continue; //Cont must match previous start or continue
            }
        }
        Parse newParse1 = (Parse) inputParse.Clone(); //clone parse
        if (mCreateDerivationString)
        {
            newParse1.AppendDerivationBuffer(highestBuildProbabilityIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
            newParse1.AppendDerivationBuffer("-");
        }
        newParse1.SetChild(advanceNodeIndex, tag); //replace constituent labeled
        newParse1.AddProbability(System.Math.Log(highestBuildProbability));
        // Check model: is the constituent started at lastStartIndex complete here?
        mCheckModel.Evaluate(mCheckContextGenerator.GetContext(newParse1.GetChildren(), lastStartType, lastStartIndex, advanceNodeIndex), mCheckProbabilities);
        Parse newParse2 = newParse1;
        if (mCheckProbabilities[mCompleteIndex] > q)
        { //make sure a reduce is likely
            newParse2 = (Parse) newParse1.Clone();
            if (mCreateDerivationString)
            {
                newParse2.AppendDerivationBuffer("1");
                newParse2.AppendDerivationBuffer(".");
            }
            // Fix: use the named complete-outcome index instead of the hard-coded index 1;
            // the guard above already uses mCheckProbabilities[mCompleteIndex], and the
            // model's outcome ordering is not guaranteed.
            newParse2.AddProbability(System.Math.Log(mCheckProbabilities[mCompleteIndex]));
            Parse[] constituent = new Parse[advanceNodeIndex - lastStartIndex + 1];
            bool isFlat = true;
            //first
            constituent[0] = lastStartNode;
            if (constituent[0].Type != constituent[0].Head.Type)
            {
                isFlat = false;
            }
            //last
            constituent[advanceNodeIndex - lastStartIndex] = advanceNode;
            if (isFlat && constituent[advanceNodeIndex - lastStartIndex].Type != constituent[advanceNodeIndex - lastStartIndex].Head.Type)
            {
                isFlat = false;
            }
            //middle
            for (int constituentIndex = 1; constituentIndex < advanceNodeIndex - lastStartIndex; constituentIndex++)
            {
                constituent[constituentIndex] = children[constituentIndex + lastStartIndex];
                if (isFlat && constituent[constituentIndex].Type != constituent[constituentIndex].Head.Type)
                {
                    isFlat = false;
                }
            }
            if (!isFlat)
            { //flat chunks are done by chunker
                newParse2.Insert(new Parse(inputParse.Text, new Util.Span(lastStartNode.Span.Start, advanceNode.Span.End), lastStartType, mCheckProbabilities[mCompleteIndex], mHeadRules.GetHead(constituent, lastStartType)));
                newParsesList.Add(newParse2);
            }
        }
        if (mCheckProbabilities[mIncompleteIndex] > q)
        { //make sure a shift is likely
            if (mCreateDerivationString)
            {
                newParse1.AppendDerivationBuffer("0");
                newParse1.AppendDerivationBuffer(".");
            }
            if (advanceNodeIndex != nodeCount - 1)
            { //can't shift last element
                // Fix: use the named incomplete-outcome index instead of the hard-coded index 0.
                newParse1.AddProbability(System.Math.Log(mCheckProbabilities[mIncompleteIndex]));
                newParsesList.Add(newParse1);
            }
        }
    }
    Parse[] newParses = (Parse[]) newParsesList.ToArray(typeof(Parse));
    return newParses;
}
///<summary>
///Returns the top chunk sequences for the specified parse.
///</summary>
///<param name="inputParse">
///A pos-tag assigned parse.
///</param>
/// <param name="minChunkScore">
/// The minimum probability for an allowed chunk sequence.
/// </param>
///<returns>
///The top chunk assignments to the specified parse.
///</returns>
private Parse[] AdvanceChunks(Parse inputParse, double minChunkScore)
{
// chunk
Parse[] children = inputParse.GetChildren();
string[] words = new string[children.Length];
string[] parseTags = new string[words.Length];
double[] probabilities = new double[words.Length];
Parse currentChildParse = null;
// Collect each child's head word and POS type as the chunker's input.
for (int childParseIndex = 0, childParseCount = children.Length; childParseIndex < childParseCount; childParseIndex++)
{
currentChildParse = children[childParseIndex];
words[childParseIndex] = currentChildParse.Head.ToString();
parseTags[childParseIndex] = currentChildParse.Type;
}
// The score cutoff passed to the chunker is relative to the probability this
// parse has already accumulated.
Util.Sequence[] chunkerSequences = mBasalChunker.TopKSequences(words, parseTags, minChunkScore - inputParse.Probability);
Parse[] newParses = new Parse[chunkerSequences.Length];
// Build one cloned parse per returned chunk-tag sequence.
for (int sequenceIndex = 0, sequenceCount = chunkerSequences.Length; sequenceIndex < sequenceCount; sequenceIndex++)
{
newParses[sequenceIndex] = (Parse) inputParse.Clone(); //copies top level
if (mCreateDerivationString)
{
newParses[sequenceIndex].AppendDerivationBuffer(sequenceIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
newParses[sequenceIndex].AppendDerivationBuffer(".");
}
string[] tags = (string[]) chunkerSequences[sequenceIndex].Outcomes.ToArray(typeof(string));
chunkerSequences[sequenceIndex].GetProbabilities(probabilities);
// start/end delimit the child span of the chunk currently being assembled;
// type is its label, or null when no chunk is open.
int start = -1;
int end = 0;
string type = null;
// NOTE: tagIndex deliberately runs one past the last tag (<= tags.Length) so the
// final open constituent is flushed on the extra iteration.
for (int tagIndex = 0; tagIndex <= tags.Length; tagIndex++)
{
if (tagIndex != tags.Length)
{
// Fold this chunk-tag's probability into the parse score.
newParses[sequenceIndex].AddProbability(System.Math.Log(probabilities[tagIndex]));
}
if (tagIndex != tags.Length && tags[tagIndex].StartsWith(ContinuePrefix))
{ // if continue just update end chunking tag don't use mContinueTypeMap
end = tagIndex;
}
else
{ //make previous constituent if it exists
if (type != null)
{
Parse startParse = children[start];
Parse endParse = children[end];
Parse[] consitituents = new Parse[end - start + 1];
consitituents[0] = startParse;
if (end - start != 0)
{
consitituents[end - start] = endParse;
// copy the interior children of the chunk span
for (int constituentIndex = 1; constituentIndex < end - start; constituentIndex++)
{
consitituents[constituentIndex] = children[constituentIndex + start];
}
}
// Insert the finished chunk constituent; probability 1 because the
// individual chunk-tag probabilities were already added above.
newParses[sequenceIndex].Insert(new Parse(startParse.Text, new Util.Span(startParse.Span.Start, endParse.Span.End), type, 1, mHeadRules.GetHead(consitituents, type)));
}
if (tagIndex != tags.Length)
{ //update for new constituent
if (tags[tagIndex].StartsWith(StartPrefix))
{ // don't use mStartTypeMap these are chunk tags
type = tags[tagIndex].Substring(StartPrefix.Length);
start = tagIndex;
end = tagIndex;
}
else
{ // other: token is outside any chunk
type = null;
}
}
}
}
}
return newParses;
}
///<summary>
///Advances the parse by assigning it POS tags and returns multiple tag sequences.
///</summary>
///<param name="inputParse">
///The parse to be tagged.
///</param>
///<returns>
///Parses with different pos-tag sequence assignments.
///</returns>
private Parse[] AdvanceTags(Parse inputParse)
{
    Parse[] tokens = inputParse.GetChildren();
    string[] tokenTexts = new string[tokens.Length];
    for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
    {
        tokenTexts[tokenIndex] = tokens[tokenIndex].ToString();
    }
    double[] tagProbabilities = new double[tokenTexts.Length];
    // Ask the POS tagger for its best K tag sequences over the token strings.
    Util.Sequence[] tagSequences = mPosTagger.TopKSequences(tokenTexts);
    if (tagSequences.Length == 0)
    {
        System.Console.Error.WriteLine("no tag sequence");
    }
    Parse[] taggedParses = new Parse[tagSequences.Length];
    // Produce one cloned parse per tag sequence, with tag nodes inserted over each token.
    for (int sequenceIndex = 0; sequenceIndex < tagSequences.Length; sequenceIndex++)
    {
        string[] tags = (string[]) tagSequences[sequenceIndex].Outcomes.ToArray(typeof(string));
        tagSequences[sequenceIndex].GetProbabilities(tagProbabilities);
        taggedParses[sequenceIndex] = (Parse) inputParse.Clone(); //copies top level
        if (mCreateDerivationString)
        {
            taggedParses[sequenceIndex].AppendDerivationBuffer(sequenceIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
            taggedParses[sequenceIndex].AppendDerivationBuffer(".");
        }
        for (int tokenIndex = 0; tokenIndex < tokenTexts.Length; tokenIndex++)
        {
            Parse tokenParse = tokens[tokenIndex];
            double tagProbability = tagProbabilities[tokenIndex];
            // Insert the tag constituent over the token and fold its score into the parse.
            taggedParses[sequenceIndex].Insert(new Parse(tokenParse.Text, tokenParse.Span, tags[tokenIndex], tagProbability));
            taggedParses[sequenceIndex].AddProbability(System.Math.Log(tagProbability));
        }
    }
    return taggedParses;
}
///<summary>
///Trains a GIS model from the supplied event reader.
///</summary>
///<param name="eventStream">Reader producing the training events.</param>
///<param name="iterations">Number of GIS training iterations.</param>
///<param name="cut">Feature-count cutoff passed to the data indexer.</param>
///<returns>The trained GIS model.</returns>
private static SharpEntropy.GisModel Train(SharpEntropy.ITrainingEventReader eventStream, int iterations, int cut)
{
    SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer();
    SharpEntropy.TwoPassDataIndexer indexer = new SharpEntropy.TwoPassDataIndexer(eventStream, cut);
    trainer.TrainModel(iterations, indexer);
    return new SharpEntropy.GisModel(trainer);
}
///<summary>
///Trains a parser model from a training file using the default settings
///(100 iterations, cutoff of 5).
///</summary>
///<param name="trainingFile">Path to the training data file.</param>
///<param name="modelType">Which parser model to train events for.</param>
///<param name="headRulesFile">Path to the head-rules file.</param>
///<returns>The trained GIS model.</returns>
public static SharpEntropy.GisModel TrainModel(string trainingFile, EventType modelType, string headRulesFile)
{
    // Default GIS training configuration.
    const int defaultIterations = 100;
    const int defaultCutoff = 5;
    return TrainModel(trainingFile, modelType, headRulesFile, defaultIterations, defaultCutoff);
}
///<summary>
///Trains a parser model from a training file with explicit training settings.
///</summary>
///<param name="trainingFile">Path to the training data file.</param>
///<param name="modelType">Which parser model to train events for.</param>
///<param name="headRulesFile">Path to the head-rules file.</param>
///<param name="iterations">Number of GIS training iterations.</param>
///<param name="cutoff">Feature-count cutoff for the data indexer.</param>
///<returns>The trained GIS model.</returns>
public static SharpEntropy.GisModel TrainModel(string trainingFile, EventType modelType, string headRulesFile, int iterations, int cutoff)
{
    EnglishHeadRules headRules = new EnglishHeadRules(headRulesFile);
    System.IO.StreamReader trainingReader = new System.IO.StreamReader(trainingFile);
    SharpEntropy.PlainTextByLineDataReader dataReader = new SharpEntropy.PlainTextByLineDataReader(trainingReader);
    SharpEntropy.ITrainingEventReader eventReader = new ParserEventReader(dataReader, headRules, modelType);
    return Train(eventReader, iterations, cutoff);
}
}
}