⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bayes.java

📁 本源程序搜集整理贝叶斯算法判断垃圾邮件
💻 JAVA
字号:
/* Description: Bayesian algorithm for detecting spam email (贝叶斯算法判断垃圾邮件)
 * author: Tony
 * date: 2007/10/23
 */

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.*;

import java.util.*;
import java.io.*;

/**
 * Naive-Bayes spam classifier.
 *
 * <p>Training reads one file of spam text and one file of legitimate text,
 * tokenizes both with Lucene's {@code ChineseTokenizer}, and records the
 * relative frequency of every token in each corpus. From these it derives, for
 * each token seen in spam, the probability that a message containing the token
 * is spam. {@link #judgeEmail(String, double)} combines those per-token
 * probabilities for a new message.
 *
 * <p>Instances are not designed for concurrent training and classification.
 */
public class Bayes
{
    /** Spam corpus read by the no-argument {@link #init()}. */
    private static final String DEFAULT_BAD_EMAIL_PATH =
            "D:\\JavaWorkSpace\\Bayes\\src\\data\\badEmail";

    /** Legitimate-mail corpus read by the no-argument {@link #init()}. */
    private static final String DEFAULT_GOOD_EMAIL_PATH =
            "D:\\JavaWorkSpace\\Bayes\\src\\data\\goodEmail";

    /** token -> relative frequency of the token in the spam corpus. */
    private final Hashtable<String, Double> badHash = new Hashtable<String, Double>();

    /** token -> relative frequency of the token in the legitimate corpus. */
    private final Hashtable<String, Double> goodHash = new Hashtable<String, Double>();

    /** token -> probability that a mail containing the token is spam. */
    private final Hashtable<String, Double> probabilityHash = new Hashtable<String, Double>();

    /**
     * Trains the classifier from the default corpus files.
     *
     * @throws Exception if a corpus file cannot be read or tokenized
     */
    public void init() throws Exception
    {
        init(DEFAULT_BAD_EMAIL_PATH, DEFAULT_GOOD_EMAIL_PATH);
    }

    /**
     * Trains the classifier from the given corpus files, discarding any
     * previously learned statistics.
     *
     * @param badEmailPath  path of the file holding spam text
     * @param goodEmailPath path of the file holding legitimate text
     * @throws Exception if a corpus file cannot be read or tokenized
     */
    public void init(String badEmailPath, String goodEmailPath) throws Exception
    {
        badHash.clear();
        goodHash.clear();
        buildEmailHash(badEmailPath, badHash);
        buildEmailHash(goodEmailPath, goodHash);
        buildNewHashTable();
    }

    /**
     * Tokenizes the file at {@code path} and stores, for every distinct token,
     * its relative frequency (occurrences / total token count) in {@code table}.
     * Existing entries for the same token are overwritten; an empty file adds
     * nothing.
     *
     * @param path  file to read
     * @param table destination map, filled with {@code String -> Double} entries
     * @throws Exception if the file cannot be read or tokenized
     */
    public void buildEmailHash(String path, Hashtable table) throws Exception
    {
        // The public signature keeps the raw type so existing callers still
        // compile; the method has always stored String keys and Double values.
        @SuppressWarnings("unchecked")
        Hashtable<String, Double> rates = table;

        String input = readFile(path);
        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(input));

        // Occurrences are counted in a local table, never in the output table,
        // which is still empty (or holds rates, not counts) at this point.
        Hashtable<String, Integer> counts = new Hashtable<String, Integer>();
        int total = 0;
        Token token;
        while ((token = tokenizer.next()) != null)
        {
            total++;
            String term = token.termText();
            Integer seen = counts.get(term);
            counts.put(term, seen == null ? 1 : seen + 1);
        }

        for (Map.Entry<String, Integer> entry : counts.entrySet())
        {
            // Divide as double: int / int would truncate every rate to 0.
            rates.put(entry.getKey(), entry.getValue() / (double) total);
        }
    }

    /**
     * Rebuilds the per-token spam probability table from the two frequency
     * tables. For each token ti seen in the spam corpus:
     *
     * <pre>P(A|ti) = Pbad(ti) / (Pgood(ti) + Pbad(ti))</pre>
     *
     * A token absent from the legitimate corpus contributes {@code Pgood = 0}
     * and therefore gets probability 1. Tokens that occur only in the
     * legitimate corpus get no entry.
     */
    public void buildNewHashTable()
    {
        probabilityHash.clear();
        for (Map.Entry<String, Double> entry : badHash.entrySet())
        {
            double badRate = entry.getValue();
            Double goodRate = goodHash.get(entry.getKey());
            double allRate = goodRate == null ? badRate : badRate + goodRate;
            probabilityHash.put(entry.getKey(), badRate / allRate);
        }
    }

    /**
     * Reads a whole text file using the platform default charset.
     *
     * @param path file to read
     * @return the file content; every line is terminated by {@code '\n'} so
     *         that words on adjacent lines do not run together when tokenized
     * @throws Exception if the file cannot be opened or read
     */
    public String readFile(String path) throws Exception
    {
        BufferedReader br = new BufferedReader(new FileReader(path));
        try
        {
            StringBuilder text = new StringBuilder();
            String line;
            // readLine() is called exactly once per iteration; calling it in
            // both the test and the append would drop every other line.
            while ((line = br.readLine()) != null)
            {
                text.append(line).append('\n');
            }
            return text.toString();
        }
        finally
        {
            br.close();
        }
    }

    /**
     * Classifies a message.
     *
     * <p>The per-token probabilities P1..PN of all known tokens are combined as
     *
     * <pre>P(A|t1..tn) = P1*..*PN / (P1*..*PN + (1-P1)*..*(1-PN))</pre>
     *
     * Tokens without a learned probability are skipped. With no known token at
     * all the combined probability is 0.5.
     *
     * @param email  message text; must not be {@code null}
     * @param weight threshold; the message is spam when the combined
     *               probability is strictly greater than this value
     * @return {@code true} if the message is NOT spam, {@code false} if it is
     * @throws Exception if tokenizing fails
     */
    public boolean judgeEmail(String email, double weight) throws Exception
    {
        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(email));

        // Work with sums of logarithms: the plain products underflow to 0 on
        // long messages, which would make the ratio 0/0 = NaN.
        double logSpam = 0.0;   // log(P1 * ... * PN)
        double logHam = 0.0;    // log((1-P1) * ... * (1-PN))
        Token token;
        while ((token = tokenizer.next()) != null)
        {
            Double probability = probabilityHash.get(token.termText());
            if (probability == null)
            {
                continue;   // unknown token: ignore it, keep scanning the rest
            }
            logSpam += Math.log(probability);
            // log(0) is -Infinity for a token with probability 1, which drives
            // the result below to 1, the same as a zero product would.
            logHam += Math.log(1 - probability);
        }

        // spam / (spam + ham) == 1 / (1 + ham / spam)
        double finalRate = 1.0 / (1.0 + Math.exp(logHam - logSpam));
        return !(finalRate > weight);
    }

    /**
     * Trains from the default corpus files and classifies an empty message
     * with threshold 0.5, printing the verdict.
     *
     * @param args ignored
     * @throws Exception if training or classification fails
     */
    public static void main(String args[]) throws Exception
    {
        Bayes bayes = new Bayes();
        bayes.init();
        String email = "";
        double weight = 0.5;
        if (bayes.judgeEmail(email, weight))
        {
            System.out.println("It is OK!");
        }
        else
        {
            System.out.println("It is wrong!");
        }
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -