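// File: defaultposcontextgenerator.cs
//
// Project: Example of statistical natural language processing of English sentences. Statistical parsing of English sentences. Shows how to generate parse trees for ... (description truncated in the original listing)
// Language: C#
// (Code-viewer control: font size)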
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the DefaultPOSContextGenerator.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

// Copyright (C) 2002 Jason Baldridge and Gann Bierner
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

using System;
using System.Collections;
using System.Text.RegularExpressions;

namespace OpenNLP.Tools.PosTagger
{
	/// <summary>
	/// A context generator for the POS Tagger.
	/// </summary>
	/// <remarks>
	/// For a given token position the generator emits string features describing the token itself
	/// ("w=", "suf=", "pre=", "h", "c", "d"), the one or two preceding tokens and their tags
	/// ("p=", "t=", "pp=", "tt=") and the one or two following tokens ("n=", "nn=").
	/// When constructed with a positive cache size, generated contexts are cached per token array
	/// instance; the cache is cleared whenever a different token array instance is seen.
	/// </remarks>
	public class DefaultPosContextGenerator : IPosContextGenerator
	{
		/// <summary>Marker used in place of a token that would lie past the end of the sentence.</summary>
		protected internal const string SentenceEnd = "*SE*";

		/// <summary>Marker used in place of a token that would lie before the start of the sentence.</summary>
		protected internal const string SentenceBeginning = "*SB*";

		// Maximum number of leading / trailing characters used for the "pre=" / "suf=" features.
		private const int mPrefixLength = 4;
		private const int mSuffixLength = 4;

		// Delimiter placed between the parts of a cache key so that different tag histories cannot
		// concatenate to the same key (e.g. "NN" + "PRP" versus "NNP" + "RP").
		// NOTE(review): assumes tags never contain a tab character - confirm against the tag set in use.
		private const string CacheKeySeparator = "\t";

		private static readonly Regex mHasCapitalRegex = new Regex("[A-Z]");
		private static readonly Regex mHasNumericRegex = new Regex("[0-9]");

		// Null when caching is disabled (cache size <= 0).
		private readonly Util.Cache mContextsCache;

		// The token array instance the cached contexts belong to (compared by reference).
		private object mWordsKey;

		/// <summary>
		/// Creates a context generator that does not cache generated contexts.
		/// </summary>
		public DefaultPosContextGenerator() : this(0)
		{
		}

		/// <summary>
		/// Creates a context generator.
		/// </summary>
		/// <param name="cacheSize">
		/// The size passed to the contexts cache; a value of zero or less disables caching.
		/// </param>
		public DefaultPosContextGenerator(int cacheSize)
		{
			if (cacheSize > 0)
			{
				mContextsCache = new Util.Cache(cacheSize);
			}
		}

		/// <summary>
		/// Returns the context for the decision described by a packed argument array.
		/// </summary>
		/// <param name="input">
		/// An object array whose element 0 is the token index (int), element 1 is the token
		/// sequence (object[]) and element 2 is the array of prior tag decisions (string[]).
		/// </param>
		/// <returns>
		/// The context features for the token at the given index.
		/// </returns>
		public virtual string[] GetContext(object input)
		{
			object[] data = (object[]) input;
			return GetContext((int) data[0], (object[]) data[1], (string[]) data[2], null);
		}

		/// <summary>
		/// Returns the leading substrings of a word, of lengths 1 up to the prefix length (4).
		/// </summary>
		/// <param name="lex">
		/// The word to take prefixes from.
		/// </param>
		/// <returns>
		/// An array of 4 prefixes; for words shorter than a requested length the whole word is used,
		/// so short words yield repeated entries.
		/// </returns>
		protected internal static string[] GetPrefixes(string lex)
		{
			string[] prefixes = new string[mPrefixLength];
			for (int currentPrefix = 0; currentPrefix < mPrefixLength; currentPrefix++)
			{
				// Clamp the requested length to the word length.
				int length = Math.Min(currentPrefix + 1, lex.Length);
				prefixes[currentPrefix] = lex.Substring(0, length);
			}
			return prefixes;
		}

		/// <summary>
		/// Returns the trailing substrings of a word, of lengths 1 up to the suffix length (4).
		/// </summary>
		/// <param name="lex">
		/// The word to take suffixes from.
		/// </param>
		/// <returns>
		/// An array of 4 suffixes; for words shorter than a requested length the whole word is used,
		/// so short words yield repeated entries.
		/// </returns>
		protected internal static string[] GetSuffixes(string lex)
		{
			string[] suffixes = new string[mSuffixLength];
			for (int currentSuffix = 0; currentSuffix < mSuffixLength; currentSuffix++)
			{
				// Clamp the start position so it never goes before the first character.
				int start = Math.Max(lex.Length - currentSuffix - 1, 0);
				suffixes[currentSuffix] = lex.Substring(start);
			}
			return suffixes;
		}

		/// <summary>
		/// Returns the context for making a pos tag decision at the specified index.
		/// </summary>
		/// <param name="index">
		/// The index of the token for which the context is provided.
		/// </param>
		/// <param name="sequence">
		/// The tokens in the sentence.
		/// </param>
		/// <param name="priorDecisions">
		/// The tags assigned to the previous words in the sentence.
		/// </param>
		/// <param name="additionalContext">
		/// Not used by this implementation.
		/// </param>
		/// <returns>
		/// The context for making a pos tag decision at the specified token index.
		/// </returns>
		public virtual string[] GetContext(int index, object[] sequence, string[] priorDecisions, object[] additionalContext)
		{
			return GetContext(index, sequence, priorDecisions);
		}

		/// <summary>
		/// Returns the context for making a pos tag decision at the specified token index given the specified tokens and previous tags.
		/// </summary>
		/// <param name="index">
		/// The index of the token for which the context is provided.
		/// </param>
		/// <param name="tokens">
		/// The tokens in the sentence.
		/// </param>
		/// <param name="tags">
		/// The tags assigned to the previous words in the sentence.
		/// </param>
		/// <returns>
		/// The context for making a pos tag decision at the specified token index given the specified tokens and previous tags.
		/// When caching is enabled the returned array may be the cached instance itself, not a copy.
		/// </returns>
		public virtual string[] GetContext(int index, object[] tokens, string[] tags)
		{
			string lex = tokens[index].ToString();

			// Following tokens: "next" is always set; "nextNext" stays null when the current
			// token is the last one in the sentence.
			string next;
			string nextNext = null;
			if (tokens.Length > index + 1)
			{
				next = tokens[index + 1].ToString();
				if (tokens.Length > index + 2)
				{
					nextNext = tokens[index + 2].ToString();
				}
				else
				{
					nextNext = SentenceEnd;
				}
			}
			else
			{
				next = SentenceEnd;
			}

			// Preceding tokens and their tags: "previous" is always set; the tags and
			// "previousPrevious" stay null when there is no corresponding token.
			string previous;
			string previousPrevious = null;
			string tagPrevious = null;
			string tagPreviousPrevious = null;
			if (index - 1 >= 0)
			{
				previous = tokens[index - 1].ToString();
				tagPrevious = tags[index - 1];

				if (index - 2 >= 0)
				{
					previousPrevious = tokens[index - 2].ToString();
					tagPreviousPrevious = tags[index - 2];
				}
				else
				{
					previousPrevious = SentenceBeginning;
				}
			}
			else
			{
				previous = SentenceBeginning;
			}

			string cacheKey = null;
			if (mContextsCache != null)
			{
				cacheKey = BuildCacheKey(index, tagPrevious, tagPreviousPrevious);
				if (object.ReferenceEquals(mWordsKey, tokens))
				{
					string[] cachedContexts = (string[]) mContextsCache[cacheKey];
					if (cachedContexts != null)
					{
						return cachedContexts;
					}
				}
				else
				{
					// A different token array: everything cached so far belongs to another sentence.
					mContextsCache.Clear();
					mWordsKey = tokens;
				}
			}

			ArrayList eventList = new ArrayList();

			// add the word itself
			eventList.Add("w=" + lex);

			// do some basic suffix analysis
			string[] suffixes = GetSuffixes(lex);
			for (int currentSuffix = 0; currentSuffix < suffixes.Length; currentSuffix++)
			{
				eventList.Add("suf=" + suffixes[currentSuffix]);
			}

			string[] prefixes = GetPrefixes(lex);
			for (int currentPrefix = 0; currentPrefix < prefixes.Length; currentPrefix++)
			{
				eventList.Add("pre=" + prefixes[currentPrefix]);
			}

			// see if the word has any special characters
			if (lex.IndexOf('-') != -1)
			{
				eventList.Add("h");
			}

			if (mHasCapitalRegex.IsMatch(lex))
			{
				eventList.Add("c");
			}

			if (mHasNumericRegex.IsMatch(lex))
			{
				eventList.Add("d");
			}

			// add the words and positions of the surrounding context
			if (previous != null)
			{
				eventList.Add("p=" + previous);
				if (tagPrevious != null)
				{
					eventList.Add("t=" + tagPrevious);
				}
				if (previousPrevious != null)
				{
					eventList.Add("pp=" + previousPrevious);
					if (tagPreviousPrevious != null)
					{
						eventList.Add("tt=" + tagPreviousPrevious);
					}
				}
			}

			if (next != null)
			{
				eventList.Add("n=" + next);
				if (nextNext != null)
				{
					eventList.Add("nn=" + nextNext);
				}
			}

			string[] contexts = (string[]) eventList.ToArray(typeof(string));
			if (mContextsCache != null)
			{
				mContextsCache[cacheKey] = contexts;
			}
			return contexts;
		}

		/// <summary>
		/// Builds the cache key for a token index and its two preceding tags.
		/// </summary>
		/// <param name="index">
		/// The index of the token.
		/// </param>
		/// <param name="tagPrevious">
		/// The tag of the preceding token, or null if there is none.
		/// </param>
		/// <param name="tagPreviousPrevious">
		/// The tag of the token two positions back, or null if there is none.
		/// </param>
		/// <returns>
		/// The index and the two tags joined with a separator; null tags contribute an empty string.
		/// </returns>
		private static string BuildCacheKey(int index, string tagPrevious, string tagPreviousPrevious)
		{
			// The separator keeps the parts distinguishable; without it two different tag
			// histories could produce the same key and a wrong cached context would be returned.
			return index.ToString(System.Globalization.CultureInfo.InvariantCulture)
				+ CacheKeySeparator + tagPrevious
				+ CacheKeySeparator + tagPreviousPrevious;
		}

	}
}
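// Keyboard shortcuts (from the code-viewer page this listing was taken from; not part of the program):
//
//   Copy code              Ctrl + C
//   Search code            Ctrl + F
//   Full-screen mode       F11
//   Toggle theme           Ctrl + Shift + D
//   Show shortcuts         ?
//   Increase font size     Ctrl + =
//   Decrease font size     Ctrl + -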