📄 bayesiantrainer.java

📁 Practical AI in Java 人工智能编程
💻 JAVA
字号:
import java.io.*;import java.util.*;/** *  Use Paul Graham's formula to calculate the probabilities of words indicating SPAM: */public class BayesianTrainer {    Hashtable spamHash = new Hashtable();    Hashtable notSpamHash = new Hashtable();    Hashtable wordProbability = new Hashtable();    int num_spam_messages = 0;    int num_non_spam_messages = 0;    int i_temp = 0;    public static void main(String[] args) {        new BayesianTrainer();    }    public Hashtable getWordProbabilityHashtable() {        return wordProbability;    }    public BayesianTrainer() {        spamHash = makeHashes("./training_spam");        num_spam_messages = i_temp;        notSpamHash = makeHashes("./training_not_spam");        num_non_spam_messages = i_temp;        // now make the word probability hashtable:        HashSet allWords = new HashSet();        Enumeration keys = spamHash.keys();        while (keys.hasMoreElements()) allWords.add(keys.nextElement());        keys = notSpamHash.keys();        while (keys.hasMoreElements()) allWords.add(keys.nextElement());        Iterator iter = allWords.iterator();        //        //    Paul Graham's formula:        //        //(let ((g (* 2 (or (gethash word good) 0)))        //      (b (or (gethash word bad) 0)))        //  (unless (< (+ g b) 5)        //    (max .01        //      (min .99 (float (/ (min 1 (/ b nbad))      ;; term1        //                         (+ (min 1 (/ g ngood))  ;; term2        //                            (min 1 (/ b nbad)))))))))  ;; i1        //        while (iter.hasNext()) {            String word = (String) iter.next();            Integer spam = (Integer) spamHash.get(word);            int b = 0;            if (spam != null) b = spam.intValue();            Integer notspam = (Integer) notSpamHash.get(word);            int g = 0;            if (notspam != null) g = 2 * notspam.intValue();            if ((b + g) > 4) {                int term1 = b / num_spam_messages;                if (term1 == 0) term1 = 1;                int term2 = g / num_non_spam_messages;                if (term2 == 0) term2 = 1;                float f3 = term1 + term2;                float f4 = (float) term1 / f3;                if (f4 < 0.01f) f4 = 0.01f;                if (f4 > 0.98f) f4 = 0.98f;                wordProbability.put(word, new Float(f4));                System.out.println("word: " + word + " probability of SPAM word: " + f4);            }        }    }    /**     *  Read files with .txt file extensions to build PSAM and NOT SPAM word hashes     */    Hashtable makeHashes(String dirPath) {        i_temp = 0;        Hashtable ret = new Hashtable();        Vector files = getFiles(dirPath);        for (int j = 0, size = files.size(); j < size; j++) {            System.out.println(" Processing " + dirPath + " : " + files.elementAt(j));            getHashFromFile(ret, (String) files.elementAt(j));            i_temp++;        }        return ret;    }    /**     *  Utility to get all word tokens in a text file     */    Hashtable getHashFromFile(Hashtable h, String file_name) {        try {            Reader reader = new FileReader(file_name);            return readWords(h, reader);        } catch (Exception e) {            e.printStackTrace();        }        return new Hashtable(1);    }    Hashtable readWords(Hashtable counts, Reader reader) throws Exception {        Vector words = Tokenizer.getTokens(reader);        if (words.size() < 2) return new Hashtable(1);        String[] wrds = new String[words.size()];        for (int i = 0, size = wrds.length; i < size; i++) wrds[i] = (String) words.elementAt(i);        for (int i = 0, size = wrds.length; i < size; i++) {            if (noiseWords.contains(wrds[i]))  continue;            Integer cnt = (Integer) counts.get(wrds[i]);            if (cnt == null) {                cnt = new Integer(0);            }            int cc = cnt.intValue();            cnt = new Integer(cc + 1);            counts.put(wrds[i], cnt);        }        reader.close();        return counts;    }    public static final String fileSeparator = System.getProperty("file.separator");    public Vector getFiles(String path) {        try {            File dir = new File(path);            LocalFileFilter filter = new LocalFileFilter();            String[] ss = dir.list(filter);            if (ss == null || ss.length == 0) return null;            Vector v = new Vector(ss.length);            for (int i = 0; i < ss.length; i++) {                v.addElement(path + fileSeparator + ss[i]);            }            return v;        } catch (Exception e) {            System.out.println("Error in getFiles:" + e);        }        return null;    }    private static HashSet noiseWords = new HashSet();    static {        noiseWords.add("after");        noiseWords.add("subject");        noiseWords.add("this");        noiseWords.add("received");        noiseWords.add("jan");        noiseWords.add("feb");        noiseWords.add("mar");        noiseWords.add("apr");        noiseWords.add("jun");        noiseWords.add("aug");        noiseWords.add("sep");        noiseWords.add("sep");        noiseWords.add("oct");        noiseWords.add("nov");        noiseWords.add("dec");        noiseWords.add("mon");        noiseWords.add("tue");        noiseWords.add("wed");        noiseWords.add("thu");        noiseWords.add("fri");        noiseWords.add("sat");        noiseWords.add("sun");        noiseWords.add("to");        noiseWords.add("the");        noiseWords.add("of");        noiseWords.add("and");        noiseWords.add("on");        noiseWords.add("as");        noiseWords.add("by");        noiseWords.add("by");        noiseWords.add("only");        noiseWords.add("is");        noiseWords.add("a");        noiseWords.add("from");        noiseWords.add("you");        noiseWords.add("for");        noiseWords.add("in");        noiseWords.add("my");        noiseWords.add("we");        noiseWords.add("be");        noiseWords.add("that");        noiseWords.add("an");        noiseWords.add("are");        noiseWords.add("our");        noiseWords.add("if");        noiseWords.add("one");        noiseWords.add("with");        noiseWords.add("was");        noiseWords.add("up");        noiseWords.add("get");        noiseWords.add("there");        noiseWords.add("at");        noiseWords.add("or");        noiseWords.add("id");        noiseWords.add("can");        noiseWords.add("div");        noiseWords.add("font");        noiseWords.add("http");        noiseWords.add("all");        noiseWords.add("it");        noiseWords.add("cc");        noiseWords.add("bcc");        noiseWords.add("gmt");    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -