📄 spamfilter.java

📁 spam filter (java source code)
💻 JAVA
字号:
/* Daniel Shiffman               *//* Bayesian Spam Filter Example  *//* Programming from A to Z       *//* Spring 2007                   *//* http://www.shiffman.net       *//* daniel.shiffman@nyu.edu       */// A class to describe a filter// Implemented as a HashMap of "Word" objects// The key for each "Word" object is the String// A better spam filter would be more sophisticated// This may have trouble with huge training files// And only looks at "word" tokens (whereas e-mails// may contain html, etc. . .//package bayes;import java.io.IOException;import java.util.*;import java.util.regex.*;//import a2z.A2ZFileReader;public class SpamFilter {	// A HashMap to keep track of all words	HashMap words;	// How to split the String into  tokens	String splitregex;	// Regex to eliminate junk (although we really should welcome the junk)	Pattern wordregex;	public SpamFilter() {		// Initialize fields		words = new HashMap();        splitregex = "\\W";		wordregex = Pattern.compile("\\w+");	}	// Receive a file that is marked as "Spam"	// Perhaps this should just receive a String	public void trainSpam(String file) throws IOException {		A2ZFileReader fr = new A2ZFileReader(file);		// Read the content and break up into words		String content = fr.getContent();		String[] tokens = content.split(splitregex);        int spamTotal = 0;//tokenizer.countTokens(); // How many words total		// For every word token		for (int i = 0; i < tokens.length; i++) {            String word = tokens[i].toLowerCase();			Matcher m = wordregex.matcher(word);			if (m.matches()) {				spamTotal++;				// If it exists in the HashMap already				// Increment the count				if (words.containsKey(word)) {					Word w = (Word) words.get(word);					w.countBad();				// Otherwise it's a new word so add it				} else {					Word w = new Word(word);					w.countBad();					words.put(word,w);				}			}		}		// Go through all the words and divide		// by total words		Iterator iterator = words.values().iterator();		while (iterator.hasNext()) {			Word word = (Word) iterator.next();			word.calcBadProb(spamTotal);		}	}//	 Receive a file that is marked as "Good"	// Perhaps this should just receive a String	public void trainGood(String file) throws IOException {		A2ZFileReader fr = new A2ZFileReader(file);		// Read the content and break up into words		String content = fr.getContent();        String[] tokens = content.split(splitregex);		int goodTotal = 0;//tokenizer.countTokens(); // How many words total		// For every word token		for (int i = 0; i < tokens.length; i++) {            String word = tokens[i].toLowerCase();			Matcher m = wordregex.matcher(word);			if (m.matches()) {				goodTotal++;				// If it exists in the HashMap already				// Increment the count				if (words.containsKey(word)) {					Word w = (Word) words.get(word);					w.countGood();				// Otherwise it's a new word so add it				} else {					Word w = new Word(word);					w.countGood();					words.put(word,w);				}			}		}		// Go through all the words and divide		// by total words		Iterator iterator = words.values().iterator();		while (iterator.hasNext()) {			Word word = (Word) iterator.next();			word.calcGoodProb(goodTotal);		}	}	// This method is derived from Paul Graham: http://www.paulgraham.com/spam.html	public boolean analyze(String stuff) {		// Create an arraylist of 15 most "interesting" words		// Words are most interesting based on how different their Spam probability is from 0.5		ArrayList interesting = new ArrayList();		// For every word in the String to be analyzed        String[] tokens = stuff.split(splitregex);		for (int i = 0; i < tokens.length; i++) {            String s = tokens[i].toLowerCase();			Matcher m = wordregex.matcher(s);			if (m.matches()) {				Word w;				// If the String is in our HashMap get the word out				if (words.containsKey(s)) {					w = (Word) words.get(s);				// Otherwise, make a new word with a Spam probability of 0.4;				} else {					w = new Word(s);					w.setPSpam(0.4f);				}				// We will limit ourselves to the 15 most interesting word				int limit = 15;				// If this list is empty, then add this word in!				if (interesting.isEmpty()) {					interesting.add(w);				// Otherwise, add it in sorted order by interesting level				} else {					for (int j = 0; j < interesting.size(); j++) {						// For every word in the list already						Word nw = (Word) interesting.get(j);						// If it's the same word, don't bother						if (w.getWord().equals(nw.getWord())) {							break;						// If it's more interesting stick it in the list						} else if (w.interesting() > nw.interesting()) {							interesting.add(j,w);							break;						// If we get to the end, just tack it on there						} else if (j == interesting.size()-1) {							interesting.add(w);						}					}				}				// If the list is bigger than the limit, delete entries				// at the end (the more "interesting" ones are at the				// start of the list				while (interesting.size() > limit) interesting.remove(interesting.size()-1);			}		}		// Apply Bayes' rule (via Graham)		float pposproduct = 1.0f;		float pnegproduct = 1.0f;		// For every word, multiply Spam probabilities ("Pspam") together		// (As well as 1 - Pspam)		for (int i = 0; i < interesting.size(); i++) {			Word w = (Word) interesting.get(i);			//System.out.println(w.getWord() + " " + w.getPSpam());			pposproduct *= w.getPSpam();			pnegproduct *= (1.0f - w.getPSpam());		}		// Apply formula		float pspam = pposproduct / (pposproduct + pnegproduct);		System.out.println("\nSpam rating: " + pspam);		// If the computed value is great than 0.9 we have a Spam!!		if (pspam > 0.9f) return true;		else return false;	}	// Display info about the words in the HashMap	public void displayStats() {		Iterator iterator = words.keySet().iterator();		while (iterator.hasNext()) {			String key = (String) iterator.next();			Word word = (Word) words.get(key);			if (word != null) {				//System.out.println(key + " pBad: " + word.getPBad() + " pGood: " + word.getPGood() + " pSpam: " + word.getPSpam());				System.out.println(key + " " + word.getPSpam());			}		}	}	// For every word, calculate the Spam probability	public void finalizeTraining() {		Iterator iterator = words.values().iterator();		while (iterator.hasNext()) {			Word word = (Word) iterator.next();			word.finalizeProb();		}	}}
💿 文件大小 3 K
👤 上传用户 boobyb
📂 所属分类 Java编程
📄 代码行数 221 行
💻 语言类型 Java
🏷️ 相关标签

#filter #source #spam #code
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -