📄 englishtreebankchunker.cs

📁 英语句子自然语言处理统计分析例子 Statistical parsing of English sentences Shows how to generate parse trees for
💻 CS
字号:
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the EnglishTreebankChunker.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
// 
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
// 
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Lesser General Public License for more details.
// 
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

using System;
using System.Collections;
using System.Text;

namespace OpenNLP.Tools.Chunker
{
	/// <summary>
	/// This is a chunker based on the CONLL chunking task which uses Penn Treebank constituents as the basis for the chunks.
	/// See   http://cnts.uia.ac.be/conll2000/chunking/ for data and task definition.
	/// </summary>
	/// <author> 
	/// Tom Morton
	/// </author>
	public class EnglishTreebankChunker : MaximumEntropyChunker
	{
		/// <summary>
		/// Creates an English Treebank Chunker which uses the specified model file.
		/// </summary>
		/// <param name="modelFile">
		/// The name of the maxent model to be used.
		/// </param>
		public EnglishTreebankChunker(string modelFile) : base(new SharpEntropy.GisModel(new SharpEntropy.IO.BinaryGisModelReader(modelFile)))
		{
		}
						
		/// <summary>
		/// This method determines whether the outcome is valid for the preceding sequence.  
		/// This can be used to implement constraints on what sequences are valid.  
		/// </summary>
		/// <param name="outcome">
		/// The outcome.
		/// </param>
		/// <param name="sequence">
		/// The preceding sequence of outcome assignments. 
		/// </param>
		/// <returns>
		/// true if the outcome is valid for the sequence, false otherwise.
		/// </returns>
		protected internal override bool ValidOutcome(string outcome, Util.Sequence sequence)
		{
			if (outcome.StartsWith("I-"))
			{
				ArrayList tagList = sequence.Outcomes;
				int lastTagIndex = tagList.Count - 1;
				if (lastTagIndex == - 1)
				{
					return (false);
				}
				else
				{
					string lastTag = (string) tagList[lastTagIndex];
					if (lastTag == "O")
					{
						return false;
					}
					if (lastTag.Substring(2) != outcome.Substring(2))
					{
						return false;
					}
				}
			}
			return true;
		}
		
		/// <summary>
		/// Gets formatted chunk information for a specified sentence.
		/// </summary>
		/// <param name="tokens">
		/// string array of tokens in the sentence
		/// </param>
		/// <param name="tags">
		/// string array of POS tags for the tokens in the sentence
		/// </param>
		/// <returns>
		/// A string containing the formatted chunked sentence
		/// </returns>
		public string GetChunks(string[] tokens, string[] tags)
		{
			StringBuilder output = new StringBuilder();

			string[] chunks = Chunk(tokens, tags);
			for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
			{
				if (currentChunk > 0 && !chunks[currentChunk].StartsWith("I-") && chunks[currentChunk - 1] != "O")
				{
					output.Append(" ]");
				}
				if (chunks[currentChunk].StartsWith("B-"))
				{
					output.Append(" [" + chunks[currentChunk].Substring(2));
				}
				
				output.Append(" " + tokens[currentChunk] + "/" + tags[currentChunk]);
			}
			if (chunks[chunks.Length - 1] != "O")
			{
				output.Append(" ]");
			}
			output.Append("\r\n");

			return output.ToString();

		}

		/// <summary>
		/// Gets formatted chunk information for a specified sentence.
		/// </summary>
		/// <param name="data">
		/// a string containing a list of tokens and tags, separated by / characters. For example:
		/// Battle-tested/JJ Japanese/NNP industrial/JJ managers/NNS 
		/// </param>
		/// <returns>
		/// A string containing the formatted chunked sentence
		/// </returns>
		public string GetChunks(string data)
		{
			StringBuilder output = new StringBuilder();
			string[] tokenAndTags = data.Split(' ');
			string[] tokens = new string[tokenAndTags.Length];
			string[] tags = new string[tokenAndTags.Length];
			for (int currentTokenAndTag = 0, tokenAndTagCount = tokenAndTags.Length; currentTokenAndTag < tokenAndTagCount; currentTokenAndTag++)
			{
				string[] tokenAndTag = tokenAndTags[currentTokenAndTag].Split('/');
				tokens[currentTokenAndTag] = tokenAndTag[0];
				tags[currentTokenAndTag] = tokenAndTag[1];
			}

			return GetChunks(tokens, tags);
		}
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -