⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 maximumentropychunker.cs

📁 英语句子自然语言处理统计分析例子 Statistical parsing of English sentences Shows how to generate parse trees for
💻 CS
字号:
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//This file is based on the ChunkerME.java source file found in the
//original java implementation of OpenNLP.  That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
// 
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
// 
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Lesser General Public License for more details.
// 
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
using System;
using System.Collections;

namespace OpenNLP.Tools.Chunker
{
	/// <summary>
	/// This class represents a maximum-entropy-based chunker.  Such a chunker can be used to
	/// find flat structures based on sequence inputs such as noun phrases or named entities.
	/// </summary>
	public class MaximumEntropyChunker : IChunker
	{		
		private Util.BeamSearch mBeam;
		private Util.Sequence mBestSequence;
		private SharpEntropy.IMaximumEntropyModel mModel; 

		/// <summary>
		/// The beam used to search for sequences of chunk tag assignments.
		/// </summary>
		protected internal Util.BeamSearch Beam
		{
			get
			{
				return mBeam;
			}
		}

		/// <summary>
		/// The model used to assign chunk tags to a sequence of tokens.
		/// </summary>
		protected internal SharpEntropy.IMaximumEntropyModel Model
		{
			get
			{
				return mModel;
			}
		}

		/// <summary>
		/// Creates a chunker using the specified model.
		/// </summary>
		/// <param name="model">
		/// The maximum entropy model for this chunker.
		/// </param>
		public MaximumEntropyChunker(SharpEntropy.IMaximumEntropyModel model):this(model, new DefaultChunkerContextGenerator(), 10)
		{
		}
		
		/// <summary>
		/// Creates a chunker using the specified model and context generator.
		/// </summary>
		/// <param name="model">
		/// The maximum entropy model for this chunker.
		/// </param>
		/// <param name="contextGenerator">
		/// The context generator to be used by the specified model.
		/// </param>
		public MaximumEntropyChunker(SharpEntropy.IMaximumEntropyModel model, IChunkerContextGenerator contextGenerator):this(model, contextGenerator, 10)
		{
		}
		
		/// <summary>
		/// Creates a chunker using the specified model and context generator and decodes the
		/// model using a beam search of the specified size.
		/// </summary>
		/// <param name="model">
		/// The maximum entropy model for this chunker.
		/// </param>
		/// <param name="contextGenerator">
		/// The context generator to be used by the specified model.
		/// </param>
		/// <param name="beamSize">
		/// The size of the beam that should be used when decoding sequences.
		/// </param>
		public MaximumEntropyChunker(SharpEntropy.IMaximumEntropyModel model, IChunkerContextGenerator contextGenerator, int beamSize)
		{
			mBeam = new ChunkBeamSearch(this, beamSize, contextGenerator, model);
			mModel = model;
		}
		
		/// <summary>
		/// Performs a chunking operation.
		/// </summary>
		/// <param name="tokens">
		/// ArrayList of tokens
		/// </param>
		/// <param name="tags">
		/// ArrayList of tags corresponding to the tokens
		/// </param>
		/// <returns>
		/// ArrayList of results, containing a value for each token, indicating the chunk that that token belongs to.
		/// </returns>
		public virtual ArrayList Chunk(ArrayList tokens, ArrayList tags)
		{
			mBestSequence = mBeam.BestSequence(tokens, new object[] { (string[]) tags.ToArray(typeof(string)) });
			return mBestSequence.Outcomes;
		}
		
		/// <summary>
		/// Performs a chunking operation.
		/// </summary>
		/// <param name="tokens">
		/// Object array of tokens
		/// </param>
		/// <param name="tags">
		/// String array of POS tags corresponding to the tokens in the object array
		/// </param>
		/// <returns>
		/// String array containing a value for each token, indicating the chunk that that token belongs to.
		/// </returns>
		public virtual string[] Chunk(object[] tokens, string[] tags)
		{
			mBestSequence = mBeam.BestSequence(new ArrayList(tokens), new object[]{tags});
			ArrayList chunks = mBestSequence.Outcomes;
			return (string[]) chunks.ToArray(typeof(string));
		}
		
		/// <summary>
		/// Gets a list of all the possible chunking tags.
		/// </summary>
		/// <returns>
		/// String array, each entry containing a chunking tag.
		/// </returns>
		public virtual string[] AllTags()
		{
			string[] tags = new string[mModel.OutcomeCount];
			for (int currentTag = 0; currentTag < mModel.OutcomeCount; currentTag++)
			{
				tags[currentTag] = mModel.GetOutcomeName(currentTag);
			}
			return tags;
		}
		/// <summary>
		/// This method determines wheter the outcome is valid for the preceding sequence.  
		/// This can be used to implement constraints on what sequences are valid.  
		/// </summary>
		/// <param name="outcome">
		/// The outcome.
		/// </param>
		/// <param name="sequence">
		/// The preceding sequence of outcomes assignments. 
		/// </param>
		/// <returns>
		/// true if the outcome is valid for the sequence, false otherwise.
		/// </returns>
		protected internal virtual bool ValidOutcome(string outcome, Util.Sequence sequence)
		{
			return true;
		}
		
		/// <summary>
		/// This method determines wheter the outcome is valid for the preceeding sequence.  
		/// This can be used to implement constraints on what sequences are valid.  
		/// </summary>
		/// <param name="outcome">
		/// The outcome.
		/// </param>
		/// <param name="sequence">
		/// The preceding sequence of outcomes assignments. 
		/// </param>
		/// <returns>
		/// true if the outcome is valid for the sequence, false otherwise.
		/// </returns>
		protected internal virtual bool ValidOutcome(string outcome, string[] sequence) 
		{
			return true;
		}

		/// <summary>
		/// This class implements the abstract BeamSearch class to allow for the chunker to use
		/// the common beam search code. 
		/// </summary>
		private class ChunkBeamSearch : Util.BeamSearch
		{
			private MaximumEntropyChunker mMaxentChunker;
			
			public ChunkBeamSearch(MaximumEntropyChunker maxentChunker, int size, IChunkerContextGenerator contextGenerator, SharpEntropy.IMaximumEntropyModel model):base(size, contextGenerator, model)
			{
				mMaxentChunker = maxentChunker;
			}
			
			protected internal override bool ValidSequence(int index, ArrayList inputSequence, Util.Sequence outcomesSequence, string outcome)
			{
				return mMaxentChunker.ValidOutcome(outcome, outcomesSequence);
			}
    
			protected internal override bool ValidSequence(int index, object[] inputSequence, string[] outcomesSequence, string outcome) 
			{
				return mMaxentChunker.ValidOutcome(outcome, outcomesSequence);
			}
		}
		
		/// <summary>
		/// Populates the specified array with the probabilities of the last decoded sequence.  The
		/// sequence was determined based on the previous call to <code>chunk</code>.  The 
		/// specified array should be at least as large as the numbe of tokens in the previous call to <code>chunk</code>.
		/// </summary>
		/// <param name="probabilities">
		/// An array used to hold the probabilities of the last decoded sequence.
		/// </param>
		public virtual void GetProbabilities(double[] probabilities)
		{
			mBestSequence.GetProbabilities(probabilities);
		}
		
		/// <summary>
		/// Returns an array with the probabilities of the last decoded sequence.  The
		/// sequence was determined based on the previous call to <code>chunk</code>.
		/// </summary>
		/// <returns>
		/// An array with the same number of probabilities as tokens were sent to <code>chunk</code>
		/// when it was last called.   
		/// </returns>
		public virtual double[] GetProbabilities()
		{
			return mBestSequence.GetProbabilities();
		}
		
		/// <summary>
		/// Trains the chunker.  Training file should be one word per line where each line consists of a
		/// space-delimited triple of "word pos outcome".  Sentence breaks are indicated by blank lines.
		/// </summary>
		/// <param name="eventReader">
		/// The chunker event reader.
		/// </param>
		/// <returns>
		/// Trained model.
		/// </returns>
		public static SharpEntropy.GisModel Train(SharpEntropy.ITrainingEventReader eventReader)
		{
			return Train(eventReader, 100, 5);
		}

		/// <summary>
		/// Trains the chunker.  Training file should be one word per line where each line consists of a
		/// space-delimited triple of "word pos outcome".  Sentence breaks are indicated by blank lines.
		/// </summary>
		/// <param name="eventReader">
		/// The chunker event reader.
		/// </param>
		/// <param name="iterations">
		/// The number of iterations to perform.
		/// </param>
		/// <param name="cutoff">
		/// The number of times a predicate must be seen in order
		/// to be relevant for training.
		/// </param>
		/// <returns>
		/// Trained model.
		/// </returns>
		public static SharpEntropy.GisModel Train(SharpEntropy.ITrainingEventReader eventReader, int iterations, int cutoff)
		{
			SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer();
			trainer.TrainModel(iterations, new SharpEntropy.TwoPassDataIndexer(eventReader, cutoff));
			return new SharpEntropy.GisModel(trainer);
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -