📄 bayesiantrainer.java
字号:
import java.io.*;import java.util.*;/** * Use Paul Graham's formula to calculate the probabilities of words indicating SPAM: */public class BayesianTrainer { Hashtable spamHash = new Hashtable(); Hashtable notSpamHash = new Hashtable(); Hashtable wordProbability = new Hashtable(); int num_spam_messages = 0; int num_non_spam_messages = 0; int i_temp = 0; public static void main(String[] args) { new BayesianTrainer(); } public Hashtable getWordProbabilityHashtable() { return wordProbability; } public BayesianTrainer() { spamHash = makeHashes("./training_spam"); num_spam_messages = i_temp; notSpamHash = makeHashes("./training_not_spam"); num_non_spam_messages = i_temp; // now make the word probability hashtable: HashSet allWords = new HashSet(); Enumeration keys = spamHash.keys(); while (keys.hasMoreElements()) allWords.add(keys.nextElement()); keys = notSpamHash.keys(); while (keys.hasMoreElements()) allWords.add(keys.nextElement()); Iterator iter = allWords.iterator(); // // Paul Graham's formula: // //(let ((g (* 2 (or (gethash word good) 0))) // (b (or (gethash word bad) 0))) // (unless (< (+ g b) 5) // (max .01 // (min .99 (float (/ (min 1 (/ b nbad)) ;; term1 // (+ (min 1 (/ g ngood)) ;; term2 // (min 1 (/ b nbad))))))))) ;; i1 // while (iter.hasNext()) { String word = (String) iter.next(); Integer spam = (Integer) spamHash.get(word); int b = 0; if (spam != null) b = spam.intValue(); Integer notspam = (Integer) notSpamHash.get(word); int g = 0; if (notspam != null) g = 2 * notspam.intValue(); if ((b + g) > 4) { int term1 = b / num_spam_messages; if (term1 == 0) term1 = 1; int term2 = g / num_non_spam_messages; if (term2 == 0) term2 = 1; float f3 = term1 + term2; float f4 = (float) term1 / f3; if (f4 < 0.01f) f4 = 0.01f; if (f4 > 0.98f) f4 = 0.98f; wordProbability.put(word, new Float(f4)); System.out.println("word: " + word + " probability of SPAM word: " + f4); } } } /** * Read files with .txt file extensions to build PSAM and NOT SPAM word hashes */ Hashtable makeHashes(String dirPath) { i_temp = 0; Hashtable ret = new Hashtable(); Vector files = getFiles(dirPath); for (int j = 0, size = files.size(); j < size; j++) { System.out.println(" Processing " + dirPath + " : " + files.elementAt(j)); getHashFromFile(ret, (String) files.elementAt(j)); i_temp++; } return ret; } /** * Utility to get all word tokens in a text file */ Hashtable getHashFromFile(Hashtable h, String file_name) { try { Reader reader = new FileReader(file_name); return readWords(h, reader); } catch (Exception e) { e.printStackTrace(); } return new Hashtable(1); } Hashtable readWords(Hashtable counts, Reader reader) throws Exception { Vector words = Tokenizer.getTokens(reader); if (words.size() < 2) return new Hashtable(1); String[] wrds = new String[words.size()]; for (int i = 0, size = wrds.length; i < size; i++) wrds[i] = (String) words.elementAt(i); for (int i = 0, size = wrds.length; i < size; i++) { if (noiseWords.contains(wrds[i])) continue; Integer cnt = (Integer) counts.get(wrds[i]); if (cnt == null) { cnt = new Integer(0); } int cc = cnt.intValue(); cnt = new Integer(cc + 1); counts.put(wrds[i], cnt); } reader.close(); return counts; } public static final String fileSeparator = System.getProperty("file.separator"); public Vector getFiles(String path) { try { File dir = new File(path); LocalFileFilter filter = new LocalFileFilter(); String[] ss = dir.list(filter); if (ss == null || ss.length == 0) return null; Vector v = new Vector(ss.length); for (int i = 0; i < ss.length; i++) { v.addElement(path + fileSeparator + ss[i]); } return v; } catch (Exception e) { System.out.println("Error in getFiles:" + e); } return null; } private static HashSet noiseWords = new HashSet(); static { noiseWords.add("after"); noiseWords.add("subject"); noiseWords.add("this"); noiseWords.add("received"); noiseWords.add("jan"); noiseWords.add("feb"); noiseWords.add("mar"); noiseWords.add("apr"); noiseWords.add("jun"); noiseWords.add("aug"); noiseWords.add("sep"); noiseWords.add("sep"); noiseWords.add("oct"); noiseWords.add("nov"); noiseWords.add("dec"); noiseWords.add("mon"); noiseWords.add("tue"); noiseWords.add("wed"); noiseWords.add("thu"); noiseWords.add("fri"); noiseWords.add("sat"); noiseWords.add("sun"); noiseWords.add("to"); noiseWords.add("the"); noiseWords.add("of"); noiseWords.add("and"); noiseWords.add("on"); noiseWords.add("as"); noiseWords.add("by"); noiseWords.add("by"); noiseWords.add("only"); noiseWords.add("is"); noiseWords.add("a"); noiseWords.add("from"); noiseWords.add("you"); noiseWords.add("for"); noiseWords.add("in"); noiseWords.add("my"); noiseWords.add("we"); noiseWords.add("be"); noiseWords.add("that"); noiseWords.add("an"); noiseWords.add("are"); noiseWords.add("our"); noiseWords.add("if"); noiseWords.add("one"); noiseWords.add("with"); noiseWords.add("was"); noiseWords.add("up"); noiseWords.add("get"); noiseWords.add("there"); noiseWords.add("at"); noiseWords.add("or"); noiseWords.add("id"); noiseWords.add("can"); noiseWords.add("div"); noiseWords.add("font"); noiseWords.add("http"); noiseWords.add("all"); noiseWords.add("it"); noiseWords.add("cc"); noiseWords.add("bcc"); noiseWords.add("gmt"); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -