📄 bayesianfilter.java
字号:
/** * <p>Title: StandBayeMail </p> * <p>Description: A bayesian spam filter</p> * <p>Copyright: Copyright (c) 2004 by Luca M. Viola</p> * <p>Company: 3AM.it</p> * @author Luca M. Viola <luca@3am.it> * @version 1.0 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/package StandBayeMail;public class BayesianFilter{ private boolean NON_EQUIPROBABLE=false; // When the calculation is not equiprobable (true) it means that in // lack of evidence the probability of a new word being spam // equals the ratio between the hystorical "bad" words' number // and the good ones' private static final double UNKNOWN_WORD=0.4; // When the calculation is equiprobable, instead we should give // a 50% probability to unknown words to be in either domain, // but to avoid "false positives" we go slightly unbalanced // on the statical "goodness" of the new word (40% // probability that it is in the spam words domain) public static final int KEEPERS=15; // Number of extremes (discriminants) to calculate private static final int MINIMUM_FREQ=5; // The minimum occurrence of a word in order to be // esamined by the filter private static final double SPAM_CUTOFF=0.9; // The threshold over which the message tested is to be // considered SPAM. public static double getSpamicityThreshold() { return SPAM_CUTOFF; } private double deviation( double n ) { return Math.abs((n)-0.5); } // If v<a returns a, if v>b returns b, if a<=v<=b returns v private double minmax( double v,double a,double b) { return ( (v)<(a)?(a) : (v)>(b)?(b) : (v) ); } public Statistics bmf( WordCounterFile pblist, WordCounterFile pglist, WordCounterFile pmlist, Statistics pstats ) { String pword; double msg_prob =0.0; double prob, product, invproduct, dev; double goodness, spamness, slotdev, hitdev; if( NON_EQUIPROBABLE ) msg_prob =((double)pblist.getKeyNumber() / (double)pglist.getKeyNumber()); Discriminants [] pp; Discriminants hit=new Discriminants(); // This initializes the discriminants' arrays assuming a neutral // deviation (deviation=shift from the 50% of probability). for (int i = 0; i < KEEPERS; i++) { pp= pstats.extrema; pp[i].key=""; pp[i].prob = 0.5f; } // This takes a list of words from the mail and it calculates their // probability to be either good or bad, and out of all of them // it saves a "KEEPERS" number with their respective statistics // of probability computed according to their historical ratios and // deviations from 50%. // Such a list of words is called "extremes" or "discriminants". pmlist.initIterator(); while( (pword = pmlist.nextKey()) !=null ) { goodness = pglist.getCount( pword ); spamness = pblist.getCount( pword ); if( goodness + spamness < MINIMUM_FREQ ) { if( NON_EQUIPROBABLE ) prob = msg_prob; else prob = UNKNOWN_WORD; } else { double pb = Math.min( 1.0, (spamness / pblist.getMboxesCount()) ); double pg = Math.min( 1.0, (goodness / pglist.getMboxesCount()) ); if( NON_EQUIPROBABLE ) prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob)); else prob = pb / (pg + pb); prob = minmax( prob, 0.01, 0.99 ); } // Updates the list of words with the maximum deviation dev = deviation(prob); hit = null; hitdev = 0; for (int i=0; i<KEEPERS; i++) { pp= pstats.extrema; // Removes duplicated words from the list of extremes (discriminants) if( pp[i].key.length() > 0 && pword.compareTo( pp[i].key ) == 0 ) { hit = null; break; } slotdev = deviation(pp[i].prob); if (dev>slotdev && dev>hitdev) { hit = pp[i]; hitdev = slotdev; } } if (hit!=null) { hit.prob = prob; hit.key = pword; } } // Here instead we apply the Bayes' theorem, such as described // in http://www.mathpages.com/home/kmath267.htm // The theorem is applied on a number of contrasting evidences // equal to KEEPERS product = invproduct = 1.0f; for( int i=0; i <KEEPERS; i++) { pp = pstats.extrema; if( pp[i].prob == 0 ) { break; } else { product *= pp[i].prob; invproduct *= (1 - pp[i].prob); } } pstats.spamicity = product / (product + invproduct); return pstats; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -