📄 chunkcontextgenerator.cs

📁 英语句子自然语言处理统计分析例子 Statistical parsing of English sentences Shows how to generate parse trees for
💻 CS
字号:
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the ChunkContextGenerator.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
// 
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
// 
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Lesser General Public License for more details.
// 
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

using System;
using System.Collections;

namespace OpenNLP.Tools.Parser
{
	/// <summary>
	/// Creates predictive contexts for the pre-chunking phases of parsing.
	/// </summary>
	public class ChunkContextGenerator : Chunker.IChunkerContextGenerator
	{
		// Placeholder used for words, tags and decisions at positions outside the sentence.
		private const string mEndOfSentence = "eos";

		// Placed between the parts of a cache key so that adjacent parts cannot run
		// together (e.g. "NN" + "P" versus "N" + "NP" would otherwise share a key).
		private const string mCacheKeySeparator = "\u001F";

		// Null when caching is disabled (cache size of zero or less).
		private readonly Util.Cache mContextsCache;

		// The words array the cached contexts were computed for (compared by reference).
		private object mWordsKey;

		/// <summary>
		/// Creates a context generator that does not cache its results.
		/// </summary>
		public ChunkContextGenerator() : this(0)
		{
		}

		/// <summary>
		/// Creates a context generator, optionally caching generated contexts.
		/// </summary>
		/// <param name="cacheSize">
		/// The size passed to the contexts cache.  A cache is only created when this value is greater than zero.
		/// </param>
		public ChunkContextGenerator(int cacheSize)
		{
			if (cacheSize > 0)
			{
				mContextsCache = new Util.Cache(cacheSize);
			}
		}

		/// <summary>
		/// Returns the contexts for the input packed into a single object.
		/// </summary>
		/// <param name="input">
		/// An <code>object[]</code> holding, in order: the token index (boxed int), the words
		/// (<code>object[]</code>; a <code>string[]</code> is also accepted), the POS tags
		/// (<code>string[]</code>) and the previous decisions (<code>string[]</code>).
		/// </param>
		/// <returns>
		/// The predictive contexts for the token at the given index.
		/// </returns>
		public virtual string[] GetContext(object input)
		{
			object[] data = (object[]) input;
			int index = (int) data[0];
			// Cast to object[] rather than string[]: words may be arbitrary token objects,
			// and a string[] still converts through array covariance.
			object[] words = (object[]) data[1];
			string[] tags = (string[]) data[2];
			string[] predicates = (string[]) data[3];
			return GetContext(index, words, tags, predicates);
		}

		/// <summary>
		/// Returns the contexts for chunking of the specified index, taking the POS tags
		/// from the first element of the additional context.
		/// </summary>
		/// <param name="index">
		/// The index of the token for which the context should be constructed.
		/// </param>
		/// <param name="words">
		/// The tokens of the sentence.
		/// </param>
		/// <param name="previousDecisions">
		/// The previous decisions made in the tagging of this sequence.
		/// </param>
		/// <param name="additionalContext">
		/// Additional context whose first element is cast to <code>string[]</code> and used as the POS tags.
		/// </param>
		/// <returns>
		/// The predictive contexts for the token at the given index.
		/// </returns>
		public virtual string[] GetContext(int index, object[] words, string[] previousDecisions, object[] additionalContext)
		{
			string[] tags = (string[]) additionalContext[0];
			return GetContext(index, words, tags, previousDecisions);
		}

		/// <summary>
		/// Returns the contexts for chunking of the specified index.
		/// </summary>
		/// <param name="index">
		/// The index of the token in the specified tokens array for which the context should be constructed.
		/// </param>
		/// <param name="words">
		/// The tokens of the sentence.  The <code>ToString()</code> methods of these objects should return the token text.
		/// </param>
		/// <param name="tags">
		/// The POS tags for the specified tokens.
		/// </param>
		/// <param name="predicates">
		/// The previous decisions made in the tagging of this sequence.  Only indices less than <paramref name="index"/> will be examined.
		/// </param>
		/// <returns>
		/// An array of predictive contexts on which a model bases its decisions.
		/// </returns>
		public virtual string[] GetContext(int index, object[] words, string[] tags, string[] predicates)
		{
			int previousPreviousIndex = index - 2;
			int previousIndex = index - 1;
			int nextIndex = index + 1;
			int nextNextIndex = index + 2;

			// Every position outside the sentence starts out as the end-of-sentence marker.
			string previousPreviousWord = mEndOfSentence;
			string previousPreviousTag = mEndOfSentence;
			string previousPreviousDecision = mEndOfSentence;
			string previousWord = mEndOfSentence;
			string previousTag = mEndOfSentence;
			string previousDecision = mEndOfSentence;
			string nextWord = mEndOfSentence;
			string nextTag = mEndOfSentence;
			string nextNextWord = mEndOfSentence;
			string nextNextTag = mEndOfSentence;

			// Position -2
			if (previousPreviousIndex >= 0)
			{
				previousPreviousTag = tags[previousPreviousIndex];
				previousPreviousDecision = predicates[previousPreviousIndex];
				previousPreviousWord = words[previousPreviousIndex].ToString();
			}

			// Position -1
			if (previousIndex >= 0)
			{
				previousTag = tags[previousIndex];
				previousDecision = predicates[previousIndex];
				previousWord = words[previousIndex].ToString();
			}

			// Position 0
			string currentTag = tags[index];
			string currentWord = words[index].ToString();

			// Position +1
			if (nextIndex < tags.Length)
			{
				nextTag = tags[nextIndex];
				nextWord = words[nextIndex].ToString();
			}

			// Position +2
			if (nextNextIndex < tags.Length)
			{
				nextNextTag = tags[nextNextIndex];
				nextNextWord = words[nextNextIndex].ToString();
			}

			// The words are not part of the key: the cache is cleared below whenever a
			// different words array is seen.  Parts are delimited so that distinct
			// tag/decision combinations can never collapse into the same key.
			string cacheKey = string.Join(mCacheKeySeparator, new string[]
			{
				index.ToString(System.Globalization.CultureInfo.InvariantCulture),
				previousPreviousTag,
				previousTag,
				currentTag,
				nextTag,
				nextNextTag,
				previousPreviousDecision,
				previousDecision
			});

			if (mContextsCache != null)
			{
				if (mWordsKey == words)
				{
					string[] cachedContexts = (string[]) mContextsCache[cacheKey];
					if (cachedContexts != null)
					{
						return cachedContexts;
					}
				}
				else
				{
					// A different sentence: previously cached contexts no longer apply.
					mContextsCache.Clear();
					mWordsKey = words;
				}
			}

			string previousPreviousChunkTag = ChunkAndPosTag(-2, previousPreviousWord, previousPreviousTag, previousPreviousDecision);
			string previousPreviousChunkTagBackOff = ChunkAndPosTagBackOff(-2, previousPreviousTag, previousPreviousDecision);
			string previousChunkTag = ChunkAndPosTag(-1, previousWord, previousTag, previousDecision);
			string previousChunkTagBackOff = ChunkAndPosTagBackOff(-1, previousTag, previousDecision);
			string currentChunkTag = ChunkAndPosTag(0, currentWord, currentTag, null);
			string currentChunkTagBackOff = ChunkAndPosTagBackOff(0, currentTag, null);
			string nextChunkTag = ChunkAndPosTag(1, nextWord, nextTag, null);
			string nextChunkTagBackOff = ChunkAndPosTagBackOff(1, nextTag, null);
			string nextNextChunkTag = ChunkAndPosTag(2, nextNextWord, nextNextTag, null);
			string nextNextChunkTagBackOff = ChunkAndPosTagBackOff(2, nextNextTag, null);

			string[] contexts = new string[]
			{
				"default",

				// Single positions -2 .. +2, each followed by its back-off form
				previousPreviousChunkTag,
				previousPreviousChunkTagBackOff,
				previousChunkTag,
				previousChunkTagBackOff,
				currentChunkTag,
				currentChunkTagBackOff,
				nextChunkTag,
				nextChunkTagBackOff,
				nextNextChunkTag,
				nextNextChunkTagBackOff,

				// Pairs (-1, 0)
				previousChunkTag + "," + currentChunkTag,
				previousChunkTagBackOff + "," + currentChunkTag,
				previousChunkTag + "," + currentChunkTagBackOff,
				previousChunkTagBackOff + "," + currentChunkTagBackOff,

				// Pairs (0, 1)
				currentChunkTag + "," + nextChunkTag,
				currentChunkTagBackOff + "," + nextChunkTag,
				currentChunkTag + "," + nextChunkTagBackOff,
				currentChunkTagBackOff + "," + nextChunkTagBackOff
			};

			if (mContextsCache != null)
			{
				mContextsCache[cacheKey] = contexts;
			}
			return contexts;
		}

		/// <summary>
		/// Builds the feature "index=token|tag", with "|chunk" appended for negative indices.
		/// </summary>
		private static string ChunkAndPosTag(int index, string token, string tag, string chunk)
		{
			System.Text.StringBuilder feature = new System.Text.StringBuilder(20);
			// Format the offset with the invariant culture so the minus sign of "-1"/"-2"
			// does not depend on the current culture's number format.
			feature.Append(index.ToString(System.Globalization.CultureInfo.InvariantCulture));
			feature.Append("=").Append(token).Append("|").Append(tag);
			if (index < 0)
			{
				feature.Append("|").Append(chunk);
			}
			return feature.ToString();
		}

		/// <summary>
		/// Builds the back-off feature "index*=tag" (no token text), with "|chunk" appended for negative indices.
		/// </summary>
		private static string ChunkAndPosTagBackOff(int index, string tag, string chunk)
		{
			System.Text.StringBuilder feature = new System.Text.StringBuilder(20);
			// Invariant culture for the same reason as in ChunkAndPosTag.
			feature.Append(index.ToString(System.Globalization.CultureInfo.InvariantCulture));
			feature.Append("*=").Append(tag);
			if (index < 0)
			{
				feature.Append("|").Append(chunk);
			}
			return feature.ToString();
		}
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -