📄 spamfilter.cs

📁 sample bayesian in c++ design view and free source program
💻 CS
字号:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.IO;
using System.Text.RegularExpressions;

namespace Expat.Bayesian
{
	/// <summary>
	/// Naive Baysiam Spam Filter.  Basically, an implementation of this:
	/// http://www.paulgraham.com/spam.html
	/// </summary>
	public class SpamFilter
	{
		#region knobs for dialing in performance
		/// <summary>
		/// These are constants used in the Bayesian algorithm, presented in a form that lets you monkey with them.
		/// </summary>
		public class KnobList
		{
														// Values in PG's original article:
			public int GoodTokenWeight = 2;				// 2
			public int MinTokenCount = 0;				// 0
			public int MinCountForInclusion = 5;		// 5
			public double MinScore = 0.011;				// 0.01
			public double MaxScore = 0.99;				// 0.99
			public double LikelySpamScore = 0.9998;		// 0.9998
			public double CertainSpamScore = 0.9999;	// 0.9999
			public int CertainSpamCount = 10;			// 10
			public int InterestingWordCount = 15;		// 15 (later changed to 20)
		}

		private KnobList _knobs = new KnobList();

		/// <summary>
		/// These are the knobs you can turn to dial in performance on the algorithm.
		/// Hopefully the names make a little bit of sense and you can find where
		/// they fit into the original algorithm.
		/// </summary>
		public KnobList Knobs
		{
			get { return _knobs; }
			set { _knobs = value; }
		}

		#endregion

		private Corpus _good;
		private Corpus _bad;
		private SortedDictionary<string, double> _prob;
		private int _ngood;
		private int _nbad;

		#region properties
		/// <summary>
		/// A list of words that show tend to show up in Spam text
		/// </summary>
		public Corpus Bad
		{
			get { return _bad; }
			set { _bad = value; }
		}

		/// <summary>
		/// A list of words that tend to show up in non-spam text
		/// </summary>
		public Corpus Good
		{
			get { return _good; }
			set { _good = value; }
		}

		/// <summary>
		/// A list of probabilities that the given word might appear in a Spam text
		/// </summary>
		public SortedDictionary<string, double> Prob
		{
			get { return _prob; }
			set { _prob = value; }
		}
		#endregion

		#region population

		/// <summary>
		/// Initialize the SpamFilter based on the supplied text
		/// </summary>
		/// <param name="goodReader"></param>
		/// <param name="badReader"></param>
		public void Load(TextReader goodReader, TextReader badReader)
		{
			_good = new Corpus(goodReader);
			_bad = new Corpus(badReader);

			CalculateProbabilities();
		}

		/// <summary>
		/// Initialize the SpamFilter based on the contents of the supplied Corpuseses
		/// </summary>
		/// <param name="good"></param>
		/// <param name="bad"></param>
		public void Load(Corpus good, Corpus bad)
		{
			_good = good;
			_bad = bad;

			CalculateProbabilities();
		}

		/// <summary>
		/// Initialize the SpamFilter based on a DataTable containing columns "IsSpam" and "Body".
		/// This is only useful to me the author, but hey, it's my code so I can do what I want!
		/// </summary>
		/// <param name="table"></param>
		public void Load(DataTable table)
		{
			_good = new Corpus();
			_bad = new Corpus();

			foreach (DataRow row in table.Rows)
			{
				bool isSpam = (bool)row["IsSpam"];
				string body = row["Body"].ToString();
				if (isSpam)
				{
					_bad.LoadFromReader(new StringReader(body));
				}
				else
				{
					_good.LoadFromReader(new StringReader(body));
				}
			}

			CalculateProbabilities();
		}

		/// <summary>
		/// Do the math to populate the probabilities collection
		/// </summary>
		private void CalculateProbabilities()
		{
			_prob = new SortedDictionary<string, double>();

			_ngood = _good.Tokens.Count;
			_nbad = _bad.Tokens.Count;
			foreach (string token in _good.Tokens.Keys)
			{
				CalculateTokenProbability(token);
			}
			foreach (string token in _bad.Tokens.Keys)
			{
				if (!_prob.ContainsKey(token))
				{
					CalculateTokenProbability(token);
				}
			}
		}

		/// <summary>
		/// For a given token, calculate the probability that will appear in a spam text
		/// by comparing the number of good and bad texts it appears in already.
		/// </summary>
		/// <param name="token"></param>
		private void CalculateTokenProbability(string token)
		{
			/*
			 * This is a direct implementation of Paul Graham's algorithm from
			 * http://www.paulgraham.com/spam.html
			 * 
			 *	(let ((g (* 2 (or (gethash word good) 0)))
			 *		  (b (or (gethash word bad) 0)))
			 *	   (unless (< (+ g b) 5)
			 *		 (max .01
			 *			  (min .99 (float (/ (min 1 (/ b nbad))
			 *								 (+ (min 1 (/ g ngood))   
			 *									(min 1 (/ b nbad)))))))))
			 */

			int g = _good.Tokens.ContainsKey(token) ? _good.Tokens[token] * Knobs.GoodTokenWeight : 0;
			int b = _bad.Tokens.ContainsKey(token) ? _bad.Tokens[token] : 0;

			if (g + b >= Knobs.MinCountForInclusion)
			{
				double goodfactor = Min(1, (double)g / (double)_ngood);
				double badfactor = Min(1, (double)b / (double)_nbad);

				double prob = Max(Knobs.MinScore,
								Min(Knobs.MaxScore, badfactor / (goodfactor + badfactor))
							);

				// special case for Spam-only tokens.
				// .9998 for tokens only found in spam, or .9999 if found more than 10 times
				if (g == 0)
				{
					prob = (b > Knobs.CertainSpamCount) ? Knobs.CertainSpamScore : Knobs.LikelySpamScore;
				}

				_prob[token] = prob;
			}
		}
#endregion

#region serialization
		/// <summary>
		/// Dumps the probability list to a file, preceded by a line containing good, bad and probability counts.
		/// </summary>
		/// <param name="filePath"></param>
		public void ToFile(string filePath)
		{
			using (FileStream fs = new FileStream(filePath, FileMode.Create, FileAccess.Write))
			{
				StreamWriter writer = new StreamWriter(fs);

				writer.WriteLine(String.Format("{0},{1},{2}", _ngood, _nbad, _prob.Count));
				foreach (string key in _prob.Keys)
				{
					writer.WriteLine(String.Format("{0},{1}", _prob[key].ToString("#.#####"), key));
				}

				writer.Flush();
				fs.Close();
			}
		}

		/// <summary>
		/// Populate from a file created with ToFile().
		/// </summary>
		/// <param name="filePath"></param>
		public void FromFile(string filePath)
		{
			_prob = new SortedDictionary<string, double>();
			using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
			{
				StreamReader reader = new StreamReader(fs);
				
				ParseCounts(reader.ReadLine());

				while (!reader.EndOfStream)
				{
					ParseProb(reader.ReadLine());
				}

				fs.Close();
			}
		}

		private void ParseCounts(string line)
		{
			string[] tokens = line.Split(',');
			if (tokens.Length > 1)
			{
				_ngood = Convert.ToInt32(tokens[0]);
				_nbad = Convert.ToInt32(tokens[1]);
			}
		}

		private void ParseProb(string line)
		{
			string[] tokens = line.Split(',');
			if (tokens.Length > 1)
			{
				_prob.Add(tokens[1], Convert.ToDouble(tokens[0]));
			}
		}

#endregion

		#region spam testing
		/// <summary>
		/// Returns the probability that the supplied body of text is spam
		/// </summary>
		/// <param name="body"></param>
		/// <returns></returns>
		public double Test(string body)
		{
			SortedList probs = new SortedList();

			// Spin through every word in the body and look up its individual spam probability.
			// Keep the list in decending order of "Interestingness"
			Regex re = new Regex(Corpus.TokenPattern, RegexOptions.Compiled);
			Match m = re.Match(body);
			int index=0;
			while (m.Success)
			{
				string token = m.Groups[1].Value;
				if (_prob.ContainsKey(token))
				{
					// "interestingness" == how far our score is from 50%.  
					// The crazy math below is building a string that lets us sort alphabetically by interestingness.
					double prob = _prob[token];
					string key = (0.5 - Math.Abs(0.5 - prob)).ToString(".00000") + token + index++;
					probs.Add(key, prob);

				}

				m = m.NextMatch();
			}

			/* Combine the 15 most interesting probabilities together into one.  
			 * The algorithm to do this is shown below and described here:
			 * http://www.paulgraham.com/naivebayes.html
			 * 
			 *				abc           
			 *	---------------------------
			 *	abc + (1 - a)(1 - b)(1 - c)
			 *
			 */

			double mult = 1;  // for holding abc..n
			double comb = 1;  // for holding (1 - a)(1 - b)(1 - c)..(1-n)
			index = 0;
			foreach (string key in probs.Keys)
			{
				double prob = (double)probs[key];
				mult = mult * prob;
				comb = comb * (1 - prob);

				Debug.WriteLine(index + " " + probs[key] + " " + key );

				if (++index > Knobs.InterestingWordCount)
					break;
			}

			return mult / (mult + comb);

		}
		#endregion

		#region helpers

		private double Max(double one, double two)
		{
			return one > two ? one : two;
		}

		private double Min(double one, double two)
		{
			return one < two ? one : two;
		}
		#endregion
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -