📄 defaultnamecontextgenerator.cs

📁 英语句子自然语言处理统计分析例子 Statistical parsing of English sentences Shows how to generate parse trees for
💻 CS
字号:
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the DefaultNameContextGenerator.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
// 
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
// 
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Lesser General Public License for more details.
// 
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

using System;
using System.Text.RegularExpressions;
using System.Collections;
using OpenNLP.Tools.Util;

namespace OpenNLP.Tools.NameFind
{
	/// <summary>
	/// Class for determining contextual features for a tag/chunk style named-entity recognizer.
	/// </summary>
	/// 
	public class DefaultNameContextGenerator : INameContextGenerator
	{
		
		// patterns
		private Regex mLowercasePattern;
		private Regex mTwoDigitsPattern;
		private Regex mFourDigitsPattern;
		private Regex mContainsNumberPattern;
		private Regex mContainsLetterPattern;
		private Regex mContainsHyphensPattern;
		private Regex mContainsBackslashPattern;
		private Regex mContainsCommaPattern;
		private Regex mContainsPeriodPattern;
		private Regex mAllCapsPattern;
		private Regex mCapPeriodPattern;
		private Regex mInitialCapPattern;
		
		private Util.Cache mContextsCache;
		private object mWordsKey;
		private int mPreviousIndex = -1;
		private ArrayList mPreviousStaticFeatures;

		/// <summary>
		/// Creates a name context generator.
		/// </summary>
		public DefaultNameContextGenerator() : this(0)
		{
		}
		
		/// <summary>
		/// Creates a name context generator with the specified cache size.
		/// </summary>
		public DefaultNameContextGenerator(int cacheSize) : base()
		{
			InitializePatterns();
			if (cacheSize > 0)
			{
				mContextsCache = new Cache(cacheSize);
			}
		}

		private void InitializePatterns()
		{
			mLowercasePattern = new Regex("^[a-z]+$");
			mTwoDigitsPattern = new Regex("^[0-9][0-9]$");
			mFourDigitsPattern = new Regex("^[0-9][0-9][0-9][0-9]$");
			mContainsNumberPattern = new Regex("[0-9]");
			mContainsLetterPattern = new Regex("[a-zA-Z]");
			mContainsHyphensPattern = new Regex("-");
			mContainsBackslashPattern = new Regex("/");
			mContainsCommaPattern = new Regex(",");
			mContainsPeriodPattern = new Regex("\\.");
			mAllCapsPattern = new Regex("^[A-Z]+$");
			mCapPeriodPattern = new Regex("^[A-Z]\\.$");
			mInitialCapPattern = new Regex("^[A-Z]");
		}
		
		public virtual string[] GetContext(object context)
		{
			object[] contextData = (object[]) context;
			return (GetContext(((int) contextData[0]), (ArrayList) contextData[1], (ArrayList) contextData[2], (IDictionary) contextData[3]));
		}
		
		public virtual string[] GetContext(int index, ArrayList sequence, Sequence outcomesSequence, object[] additionalContext)
		{
			return GetContext(index, sequence, outcomesSequence.Outcomes, (IDictionary) additionalContext[0]);
		}
		
		public virtual string[] GetContext(int index, ArrayList tokens, ArrayList predicates, IDictionary previousTags)
		{
			return (GetContext(index, tokens.ToArray(), (string[]) predicates.ToArray(typeof(string)), previousTags));
		}
		
		public virtual string[] GetContext(int index, object[] sequence, string[] priorDecisions, object[] additionalContext) 
		{
			return (GetContext(index, sequence, priorDecisions, (IDictionary) additionalContext[0]));
		}

		/// <summary>
		/// Return the context for finding names at the specified index.
		/// </summary>
		/// <param name="index">
		/// The index of the token in the specified tokens array for which the context should be constructed. 
		/// </param>
		/// <param name="tokens">
		/// tokens of the sentence.  The <code>ToString()</code> methods of these objects should return the token text.
		/// </param>
		/// <param name="predicates">
		/// The previous decisions made in the tagging of this sequence.  Only indices less than {index} will be examined.
		/// </param>
		/// <param name="previousTags">
		/// A mapping between tokens and the previous outcome for these tokens. 
		/// </param>
		/// <returns>
		/// the context for finding names at the specified index.
		/// </returns>
		public virtual string[] GetContext(int index, object[] tokens, string[] predicates, IDictionary previousTags)
		{
			string previous = MaximumEntropyNameFinder.Other;
			string previousPrevious = MaximumEntropyNameFinder.Other;
			if (index > 1)
			{
				previousPrevious = predicates[index - 2];
			}
			if (index > 0)
			{
				previous = predicates[index - 1];
			}

			string cacheKey = index.ToString(System.Globalization.CultureInfo.InvariantCulture) + previous + previousPrevious;
			if (mContextsCache != null)
			{
				if (mWordsKey == tokens)
				{
					string[] cachedContexts = (string[])mContextsCache[cacheKey];
					if (cachedContexts != null)
					{
						return cachedContexts;
					}
				}
				else
				{
					mContextsCache.Clear();
					mWordsKey = tokens;
				}
			}
			ArrayList features;
			if (mWordsKey == tokens && index == mPreviousIndex)
			{
				features = mPreviousStaticFeatures;
			}
			else
			{
				features = GetStaticFeatures(tokens, index, previousTags);
				mPreviousIndex = index;
				mPreviousStaticFeatures = features;
			}
			
			int featureCount = features.Count;
			string[] contexts = new string[featureCount + 4];
			for (int currentFeature = 0; currentFeature < featureCount; currentFeature++)
			{
				contexts[currentFeature] = ((string)features[currentFeature]);
			}
			contexts[featureCount] = "po=" + previous;
			contexts[featureCount + 1] = "pow=" + previous + tokens[index];
			contexts[featureCount + 2] = "powf=" + previous + WordFeature(tokens[index].ToString());
			contexts[featureCount + 3] = "ppo=" + previousPrevious;
			if (mContextsCache != null)
			{
				mContextsCache[cacheKey] = contexts;
			}
			return contexts;
		}
		
		/// <summary>
		/// Returns a list of the features for <code>tokens[index]</code> that can
		/// be safely cached.  In other words, return a list of all
		/// features that do not depend on previous outcome or decision
		/// features.  This method is called by <code>search</code>.
		/// </summary>
		/// <param name="tokens">
		/// The list of tokens being processed.
		/// </param>
		/// <param name="index">
		/// The index of the token whose features should be
		/// returned.
		/// </param>
		/// <param name="previousTags">
		/// The list of previous tags.
		/// </param>
		/// <returns> a list of the features for <code>tokens[index]</code> that can
		/// be safely cached.
		/// </returns>
		private ArrayList GetStaticFeatures(object[] tokens, int index, IDictionary previousTags)
		{
			ArrayList features = new ArrayList();
			features.Add("def");
			
			//current word
			string currentWord = tokens[index].ToString().ToLower(System.Globalization.CultureInfo.InvariantCulture);
			features.Add("w=" + currentWord);
			string wordFeature = WordFeature(tokens[index].ToString());
			features.Add("wf=" + wordFeature);
			features.Add("w&wf=" + currentWord + "," + wordFeature);
			
			string previousTag = (string) previousTags[tokens[index].ToString()];
			features.Add("pd=" + previousTag);
			if (index == 0)
			{
				features.Add("df=it");
			}
			// previous previous word
			if (index - 2 >= 0)
			{
				string previousPreviousWord = tokens[index - 2].ToString().ToLower(System.Globalization.CultureInfo.InvariantCulture);
				features.Add("ppw=" + previousPreviousWord);
				string previousPreviousWordFeature = WordFeature(tokens[index - 2].ToString());
				features.Add("ppwf=" + previousPreviousWordFeature);
				features.Add("ppw&f=" + previousPreviousWord + "," + previousPreviousWordFeature);
			}
			else
			{
				features.Add("ppw=BOS");
			}
			// previous word
			if (index == 0)
			{
				features.Add("pw=BOS");
				features.Add("pw=BOS,w=" + currentWord);
				features.Add("pwf=BOS,wf" + wordFeature);
			}
			else
			{
				string previousWord = tokens[index - 1].ToString().ToLower(System.Globalization.CultureInfo.InvariantCulture);
				features.Add("pw=" + previousWord);
				System.String previousWordFeature = WordFeature(tokens[index - 1].ToString());
				features.Add("pwf=" + previousWordFeature);
				features.Add("pw&f=" + previousWord + "," + previousWordFeature);
				features.Add("pw=" + previousWord + ",w=" + currentWord);
				features.Add("pwf=" + previousWordFeature + ",wf=" + wordFeature);
			}
			//next word
			if (index + 1 >= tokens.Length)
			{
				features.Add("nw=EOS");
				features.Add("w=" + currentWord + ",nw=EOS");
				features.Add("wf=" + wordFeature + ",nw=EOS");
			}
			else
			{
				string nextWord = tokens[index + 1].ToString().ToLower(System.Globalization.CultureInfo.InvariantCulture);
				features.Add("nw=" + nextWord);
				System.String nextWordFeature = WordFeature(tokens[index + 1].ToString());
				features.Add("nwf=" + nextWordFeature);
				features.Add("nw&f=" + nextWord + "," + nextWordFeature);
				features.Add("w=" + currentWord + ",nw=" + nextWord);
				features.Add("wf=" + wordFeature + ",nwf=" + nextWordFeature);
			}
			if (index + 2 >= tokens.Length)
			{
				features.Add("nnw=EOS");
			}
			else
			{
				string nextNextWord = tokens[index + 2].ToString().ToLower(System.Globalization.CultureInfo.InvariantCulture);
				features.Add("nnw=" + nextNextWord);
				string nextNextWordFeature = WordFeature(tokens[index + 2].ToString());
				features.Add("nnwf=" + nextNextWordFeature);
				features.Add("nnw&f=" + nextNextWord + "," + nextNextWordFeature);
			}
			
			return features;
		}
		
		
		/// <summary>
		/// Return the most relevant feature for a given word.  This method
		/// is used to get the features for words
		/// within a window of the word being analyzed.  Typical features
		/// are "2d" (2 digits); "4d" (4 digits); and "ac" (all caps).
		/// Note that only a single feature is returned.  The default
		/// feature is "other".
		/// </summary>
		/// <param name="word">
		/// The word whose features should be returned.
		/// </param>
		/// <returns>
		/// A feature code.
		/// </returns>
		private string WordFeature(string word)
		{
			string feature;
			if (mLowercasePattern.IsMatch(word))
			{
				feature = "lc";
			}
			else if (mTwoDigitsPattern.IsMatch(word))
			{
				feature = "2d";
			}
			else if (mFourDigitsPattern.IsMatch(word))
			{
				feature = "4d";
			}
			else if (mContainsNumberPattern.IsMatch(word))
			{
				if (mContainsLetterPattern.IsMatch(word))
				{
					feature = "an";
				}
				else if (mContainsHyphensPattern.IsMatch(word))
				{
					feature = "dd";
				}
				else if (mContainsBackslashPattern.IsMatch(word))
				{
					feature = "ds";
				}
				else if (mContainsCommaPattern.IsMatch(word))
				{
					feature = "dc";
				}
				else if (mContainsPeriodPattern.IsMatch(word))
				{
					feature = "dp";
				}
				else
				{
					feature = "num";
				}
			}
			else if (mAllCapsPattern.IsMatch(word) && word.Length == 1)
			{
				feature = "sc";
			}
			else if (mAllCapsPattern.IsMatch(word))
			{
				feature = "ac";
			}
			else if (mCapPeriodPattern.IsMatch(word))
			{
				feature = "cp";
			}
			else if (mInitialCapPattern.IsMatch(word))
			{
				feature = "ic";
			}
			else
			{
				feature = "other";
			}
			
			return feature;
		}

	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -