//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//This file is based on the TokenizerME.java source file found in the
//original java implementation of OpenNLP. That source file contains the following header:
// Copyright (C) 2002 Jason Baldridge, Gann Bierner, and Tom Morton
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
using System;
using System.Collections;
using System.Text.RegularExpressions;
namespace OpenNLP.Tools.Tokenize
{
    /// <summary>
    /// A Tokenizer for converting raw text into separated tokens. It uses
    /// Maximum Entropy to make its decisions. The features are loosely
    /// based on Jeff Reynar's UPenn thesis "Topic Segmentation:
    /// Algorithms and Applications.", which is available from his
    /// homepage: http://www.cis.upenn.edu/~jcreynar.
    /// </summary>
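    /// <example>
    /// A minimal usage sketch. The model file name and the use of
    /// SharpEntropy.IO.BinaryGisModelReader to load it are illustrative
    /// assumptions; any SharpEntropy.IMaximumEntropyModel trained for
    /// tokenization can be passed to the constructor.
    /// <code>
    /// // load a previously trained tokenizer model (hypothetical file name)
    /// SharpEntropy.IMaximumEntropyModel model = new SharpEntropy.GisModel(
    ///     new SharpEntropy.IO.BinaryGisModelReader("EnglishTok.nbin"));
    /// MaximumEntropyTokenizer tokenizer = new MaximumEntropyTokenizer(model);
    /// tokenizer.AlphaNumericOptimization = true;
    /// string[] tokens = tokenizer.Tokenize("Mr. Smith's dog isn't here.");
    /// double[] probabilities = tokenizer.GetTokenProbabilities();
    /// </code>
    /// </example>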
    public class MaximumEntropyTokenizer : ITokenizer
    {
        /// <summary>
        /// The maximum entropy model used to evaluate contexts.
        /// </summary>
        private SharpEntropy.IMaximumEntropyModel mModel;

        /// <summary>
        /// The context generator.
        /// </summary>
        private SharpEntropy.IContextGenerator mContextGenerator;

        private const double mOne = 1.0;

        internal static Regex AlphaNumeric = new Regex("^[A-Za-z0-9]+$");

        /// <summary>
        /// Optimization flag: when set, tokens consisting only of alpha-numeric
        /// characters are not submitted to the model for further tokenization.
        /// </summary>
        private bool mAlphaNumericOptimization;

        /// <summary>
        /// List of probabilities for each token returned from the most recent
        /// call to Tokenize() or TokenizePositions().
        /// </summary>
        private ArrayList mTokenProbabilities;

        private ArrayList mNewTokens;

        /// <summary>
        /// When true, tokens containing only alpha-numeric characters are passed
        /// through unchanged instead of being tokenized further by the model.
        /// </summary>
        public virtual bool AlphaNumericOptimization
        {
            get
            {
                return mAlphaNumericOptimization;
            }
            set
            {
                mAlphaNumericOptimization = value;
            }
        }
        /// <summary>
        /// Class constructor which takes the maximum entropy model to be used
        /// for evaluating contexts.
        /// </summary>
        public MaximumEntropyTokenizer(SharpEntropy.IMaximumEntropyModel model)
        {
            mContextGenerator = new TokenContextGenerator();
            mAlphaNumericOptimization = false;
            mModel = model;
            mNewTokens = new ArrayList();
            mTokenProbabilities = new ArrayList(50);
        }
        /// <summary>
        /// Tokenizes the string.
        /// </summary>
        /// <param name="input">
        /// The string to be tokenized.
        /// </param>
        /// <returns>
        /// A span array containing individual tokens as elements.
        /// </returns>
        public virtual Util.Span[] TokenizePositions(string input)
        {
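            // Tokenization proceeds in two stages: first split the input on
            // whitespace, then let the maximum entropy model decide where to
            // split further inside each whitespace-delimited candidate token.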
            Util.Span[] tokens = Split(input);
            mNewTokens.Clear();
            mTokenProbabilities.Clear();

            for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
            {
                Util.Span tokenSpan = tokens[currentToken];
                string token = input.Substring(tokenSpan.Start, tokenSpan.End - tokenSpan.Start);
                // Can't tokenize single characters
                if (token.Length < 2)
                {
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(mOne);
                }
                else if (AlphaNumericOptimization && AlphaNumeric.IsMatch(token))
                {
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else
                {
                    int startPosition = tokenSpan.Start;
                    int endPosition = tokenSpan.End;
                    int originalStart = tokenSpan.Start;
                    double tokenProbability = 1.0;
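                    // Ask the model at every interior character position whether a
                    // token boundary occurs there; the probability of the winning
                    // outcome at each position is multiplied into a per-token score.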
                    for (int currentPosition = originalStart + 1; currentPosition < endPosition; currentPosition++)
                    {
                        double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(new Util.ObjectIntPair(token, currentPosition - originalStart)));
                        string bestOutcome = mModel.GetBestOutcome(probabilities);
                        tokenProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                        if (bestOutcome == TokenContextGenerator.SplitIndicator)
                        {
                            mNewTokens.Add(new Util.Span(startPosition, currentPosition));
                            mTokenProbabilities.Add(tokenProbability);
                            startPosition = currentPosition;
                            tokenProbability = 1.0;
                        }
                    }
                    mNewTokens.Add(new Util.Span(startPosition, endPosition));
                    mTokenProbabilities.Add(tokenProbability);
                }
            }
            return (Util.Span[]) mNewTokens.ToArray(typeof(Util.Span));
        }
        /// <summary>
        /// Tokenize a string.
        /// </summary>
        /// <param name="input">
        /// The string to be tokenized.
        /// </param>
        /// <returns>
        /// A string array containing individual tokens as elements.
        /// </returns>
        public virtual string[] Tokenize(string input)
        {
            Util.Span[] tokenSpans = TokenizePositions(input);
            string[] tokens = new string[tokenSpans.Length];
            for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
            {
                tokens[currentToken] = input.Substring(tokenSpans[currentToken].Start, tokenSpans[currentToken].End - tokenSpans[currentToken].Start);
            }
            return tokens;
        }
        /// <summary>
        /// Constructs a list of Span objects, one for each whitespace
        /// delimited token. Token strings can be constructed from these
        /// spans as follows: input.Substring(span.Start, span.End - span.Start);
        /// </summary>
        /// <param name="input">
        /// The string to tokenize.
        /// </param>
        /// <returns>
        /// Array of spans, one per whitespace-delimited token.
        /// </returns>
        internal static Util.Span[] Split(string input)
        {
            int tokenStart = -1;
            ArrayList tokens = new ArrayList();
            bool isInToken = false;

            // gather up potential tokens
            int endPosition = input.Length;
            for (int currentChar = 0; currentChar < endPosition; currentChar++)
            {
                if (System.Char.IsWhiteSpace(input[currentChar]))
                {
                    if (isInToken)
                    {
                        tokens.Add(new Util.Span(tokenStart, currentChar));
                        isInToken = false;
                        tokenStart = -1;
                    }
                }
                else
                {
                    if (!isInToken)
                    {
                        tokenStart = currentChar;
                        isInToken = true;
                    }
                }
            }
            if (isInToken)
            {
                tokens.Add(new Util.Span(tokenStart, endPosition));
            }
            return (Util.Span[]) tokens.ToArray(typeof(Util.Span));
        }
        /// <summary>
        /// Returns the probabilities associated with the most recent
        /// call to Tokenize() or TokenizePositions().
        /// </summary>
        /// <returns>
        /// Probability for each token returned by the most recent
        /// call to tokenize. If not applicable, an empty array is
        /// returned.
        /// </returns>
        public virtual double[] GetTokenProbabilities()
        {
            double[] tokenProbabilities = new double[mTokenProbabilities.Count];
            for (int iCurrentTokenProbability = 0; iCurrentTokenProbability < tokenProbabilities.Length; iCurrentTokenProbability++)
            {
                tokenProbabilities[iCurrentTokenProbability] = (double) mTokenProbabilities[iCurrentTokenProbability];
            }
            return tokenProbabilities;
        }
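        /// <summary>
        /// Trains a tokenizer model from the supplied training events with the
        /// GIS trainer (100 iterations, indexing the events with a cutoff of 5)
        /// and writes the resulting model to outputFilename in binary format.
        /// </summary>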
        public static void Train(SharpEntropy.ITrainingEventReader eventReader, string outputFilename)
        {
            SharpEntropy.GisTrainer trainer = new SharpEntropy.GisTrainer(0.1);
            trainer.TrainModel(100, new SharpEntropy.TwoPassDataIndexer(eventReader, 5));
            SharpEntropy.GisModel tokenizeModel = new SharpEntropy.GisModel(trainer);
            new SharpEntropy.IO.BinaryGisModelWriter().Persist(tokenizeModel, outputFilename);
        }
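        /// <summary>
        /// Trains a tokenizer model from a training data file, reading the
        /// events with a TokenEventReader, and writes the resulting model to
        /// the output file.
        /// </summary>
        /// <example>
        /// Illustrative only; the file names are assumptions:
        /// <code>
        /// MaximumEntropyTokenizer.Train("token.train", "MyTok.nbin");
        /// </code>
        /// </example>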
        public static void Train(string input, string output)
        {
            // dispose of the reader once training is complete
            using (System.IO.StreamReader dataReader = new System.IO.StreamReader(new System.IO.FileInfo(input).FullName))
            {
                SharpEntropy.ITrainingEventReader eventReader = new TokenEventReader(dataReader);
                Train(eventReader, output);
            }
        }
    }
}