📄 bayesianfilter.java

📁 StandBayeMail
💻 JAVA
字号:
/** * <p>Title: StandBayeMail </p> * <p>Description: A bayesian spam filter</p> * <p>Copyright: Copyright (c) 2004 by Luca M. Viola</p> * <p>Company: 3AM.it</p> * @author Luca M. Viola <luca@3am.it> * @version 1.0  This program is free software; you can redistribute it and/or  modify it under the terms of the GNU General Public License  as published by the Free Software Foundation; either version 2  of the License, or (at your option) any later version.  This program is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  GNU General Public License for more details.  You should have received a copy of the GNU General Public License  along with this program; if not, write to the Free Software  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.*/package StandBayeMail;public class BayesianFilter{  private boolean NON_EQUIPROBABLE=false;       // When the calculation is not equiprobable (true) it means that in                                                // lack of evidence the probability of a new word being spam                                                // equals the ratio between the hystorical "bad" words' number                                                // and the good ones'  private static final double UNKNOWN_WORD=0.4; // When the calculation is equiprobable, instead we should give                                                // a 50% probability to unknown words to be in either domain,                                                // but to avoid "false positives" we go slightly unbalanced                                                // on the statical "goodness" of the new word (40%                                                // probability that it is in the spam words domain)  public static final int KEEPERS=15;           // Number of extremes (discriminants) to calculate  private static final int MINIMUM_FREQ=5;      // The minimum occurrence of a word in order to be                                                // esamined by the filter  private static final double SPAM_CUTOFF=0.9;  // The threshold over which the message tested is to be                                                // considered SPAM.  public static double getSpamicityThreshold()  {    return SPAM_CUTOFF;  }  private double deviation( double n )  {    return Math.abs((n)-0.5);  }  // If v<a returns a, if v>b returns b, if a<=v<=b returns v  private double minmax( double v,double a,double b)  {    return ( (v)<(a)?(a) : (v)>(b)?(b) : (v) );  }  public Statistics bmf( WordCounterFile pblist, WordCounterFile pglist, WordCounterFile pmlist, Statistics pstats )  {    String      pword;    double msg_prob =0.0;    double prob, product, invproduct, dev;    double goodness, spamness, slotdev, hitdev;    if( NON_EQUIPROBABLE )       msg_prob =((double)pblist.getKeyNumber() / (double)pglist.getKeyNumber());    Discriminants [] pp;    Discriminants hit=new Discriminants();    // This initializes the  discriminants' arrays assuming a neutral    // deviation (deviation=shift from the 50% of probability).    for (int i = 0; i < KEEPERS; i++)    {      pp= pstats.extrema;      pp[i].key="";      pp[i].prob = 0.5f;    }    // This takes a list of words from the mail and it calculates their    // probability to be either good or bad, and out of all of them    // it saves a "KEEPERS" number with their respective statistics    // of probability computed according to their historical ratios and    // deviations from 50%.    // Such a list of words is called "extremes" or "discriminants".    pmlist.initIterator();    while( (pword = pmlist.nextKey()) !=null )    {        goodness = pglist.getCount( pword );        spamness = pblist.getCount( pword );        if( goodness + spamness < MINIMUM_FREQ )        {          if( NON_EQUIPROBABLE )            prob = msg_prob;          else            prob = UNKNOWN_WORD;        }        else        {            double pb = Math.min( 1.0, (spamness / pblist.getMboxesCount()) );            double pg = Math.min( 1.0, (goodness / pglist.getMboxesCount()) );            if( NON_EQUIPROBABLE )              prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob));            else              prob = pb / (pg + pb);            prob = minmax( prob, 0.01, 0.99 );        }        // Updates the list of words with the maximum deviation        dev = deviation(prob);        hit = null;        hitdev = 0;        for (int i=0; i<KEEPERS; i++)        {            pp= pstats.extrema;            // Removes duplicated words from the list of extremes (discriminants)            if( pp[i].key.length() > 0 && pword.compareTo( pp[i].key ) == 0 )            {                hit = null;                break;            }            slotdev = deviation(pp[i].prob);            if (dev>slotdev && dev>hitdev)            {                hit = pp[i];                hitdev = slotdev;            }        }        if (hit!=null)        {            hit.prob = prob;            hit.key = pword;        }    }    // Here instead we apply the Bayes' theorem, such as described    // in http://www.mathpages.com/home/kmath267.htm    // The theorem is applied on a number of contrasting evidences    // equal to KEEPERS    product = invproduct = 1.0f;    for( int i=0; i <KEEPERS; i++)    {        pp = pstats.extrema;        if( pp[i].prob == 0 )        {            break;        }        else        {            product *= pp[i].prob;            invproduct *= (1 - pp[i].prob);        }    }    pstats.spamicity = product / (product + invproduct);    return pstats;  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -