📄 chisquaredcalculator.java
字号:
/*
* @(#)ChiSquaredCalculator.java 3/11/2004
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.calculators;
import org.jasen.core.engine.JasenEngineConfiguration;
import org.jasen.core.engine.JasenMap;
import org.jasen.core.engine.JasenToken;
/**
* <P>
* Performs all the chi probability calculations required by jASEN.
* </P>
* <p>
* This is the core calculation class which ultimately determines the spam score for a message.
* </p>
* <p>
* Most of the methods herein are a direct port from the Python implementation published by Gary Robinson.
* </p>
* @author Jason Polites
*/
public class ChiSquaredCalculator
{
/**
* Creates a new calculator instance.
*/
public ChiSquaredCalculator() {
super ();
}
/**
* Confirms or rejects the null hypothesis that the message words indicate spam.
* <p>
* Specifically, this is defined as
* <pre style="font-family:times new roman; font-size:14pt">
* <em>I = H / (H + S)</em>
* </pre>
* Where:
* <ul>
* <li>H is the probability the tokens indicate HAM</li>
* <li>S is the probability the tokens indicate SPAM</li>
* </ul>
* </p>
* @param words The word tokens extracted from the message
* @param map The token map
* @return The overall "spamminess" of the words
*/
public double confirmHypothesis(String[] words, JasenMap map) {
    // f(w) for every supplied word, in the same order as the words array.
    final double[] fws = calculateWordProbabilities(words, map);

    // The two indicators are derived from the same probability array.
    final double S = calculateS(fws);
    final double H = calculateH(fws);
    final double total = H + S;

    // If both indicators are exactly zero the quotient would be 0/0, i.e. NaN,
    // and NaN compares false against every threshold. Report a neutral
    // result (neither indicator dominates) instead of leaking NaN to callers.
    if (total == 0.0d) {
        return 0.5d;
    }

    // I = H / (H + S)
    return H / total;
}
/**
* Calculates the probability, as a value between 0.0 and 1.0, that the tokens provided indicate a HAM message
* @param probs The word probabilities computed by calculateWordProbabilities
* @return A value between 0.0 and 1.0 where 1.0 indicates high probability of HAM (non spam)
* @see ChiSquaredCalculator#calculateWordProbabilities(String[], JasenMap)
*/
public double calculateH(double[] probs) {
    // Number of probabilities supplied; read first so a null array fails here.
    final int tokenCount = probs.length;

    // Chi value of the probabilities, fed through the inverse chi-square function.
    return calculateInverseChiSquare(calculateChi(probs), tokenCount);
}
/**
* Calculates the probability, as a value between 0.0 and 1.0, that the tokens provided indicate a SPAM message
* @param probs The word probabilities computed by calculateWordProbabilities
* @return A value between 0.0 and 1.0 where 1.0 indicates high probability of SPAM
* @see ChiSquaredCalculator#calculateWordProbabilities(String[], JasenMap)
*/
public double calculateS(double[] probs) {
    // Number of probabilities supplied; read first so a null array fails here.
    final int tokenCount = probs.length;

    // Reverse chi value of the probabilities, fed through the inverse chi-square function.
    return calculateInverseChiSquare(calculateReverseChi(probs), tokenCount);
}
/**
* Calculates the probability of each word indicating spam.
* <P>
* Specifically, this method uses the following approach from Gary Robinson:
* </P>
* <p>
* b(w) = (the number of spam e-mails containing the word w) / (the total number of spam e-mails) <br/>
* g(w) = (the number of ham e-mails containing the word w) / (the total number of ham e-mails) <br/>
* p(w) = b(w) / (b(w) + g(w)) <br/>
* </p>
* <p>
* Then we calculate:
* </p>
* f(w) = ((s * x) + (m * p(w))) / (s + m)
* <p>
* Where:
* <ul>
* <li> s is the strength we want to give to our background information (confidence)</li>
* <li> x is our assumed probability, based on our general background information, that a word we don't have any other experience of will first appear in a spam (guess)</li>
* <li> m is the number of e-mails we have received that contain word w</li>
* <li> f(w) is the final probability returned</li>
* </ul>
* </p>
* @param words The set of words for which the probabilities will be calculated
* @param map The map of word probabilities
* @return An array of double values, between 0.0 and 1.0, indicating the probability that the word indicates spam
*/
public double[] calculateWordProbabilities(String[] words, JasenMap map) {
    // s: strength given to the background assumption (configured "confidence").
    final double s = JasenEngineConfiguration.getInstance().getConfidence();
    // x: probability assumed for a word we have no usable data for (configured "guess").
    final double x = JasenEngineConfiguration.getInstance().getGuess();

    // One result slot per input word, in input order.
    final double[] fws = new double[words.length];

    for (int i = 0; i < words.length; i++) {
        final JasenToken token = map.getToken(words[i]);

        if (token == null) {
            // The map has no entry for this word: fall back to the guess.
            fws[i] = x;
            continue;
        }

        // b(w) and g(w): the token's spam/ham counts relative to the map's totals.
        final double bw = (double) token.getSpamCount() / (double) map.getSpamObservations();
        final double gw = (double) token.getHamCount() / (double) map.getHamObservations();

        // p(w) = b(w) / (b(w) + g(w))
        final double pw = bw / (bw + gw);

        if (Double.isNaN(pw)) {
            // Degenerate counts (e.g. a token with zero spam and zero ham hits, or a
            // zero observation total) yield 0/0. A NaN here would flow into the final
            // score, so treat the word like one we have no data for.
            fws[i] = x;
            continue;
        }

        // NOTE(review): the Javadoc defines m as the number of e-mails containing
        // this word, but the value used here is the total number of observations
        // (spam + ham) held by the map. Left unchanged; confirm which is intended.
        final double m = map.getSpamObservations() + map.getHamObservations();

        // f(w) = ((s * x) + (m * p(w))) / (s + m)
        fws[i] = ((s * x) + (m * pw)) / (s + m);
    }

    return fws;
}
/**
* Calculates the chi distribution of the word probabilities.
* <p>
* This is defined as:
* <pre>
* -2 ln <span style="font-size:16pt">∏</span>f(w)
* </pre>
* </p>
* <p>
* Where
* <ul>
* <li>ln is the Natural Logarithm</li>
* <li><span style="font-size:12pt">∏</span> f(w) is the product of all word probabilities</li>
* </ul>
* </p>
* @param fws The word probabilities, as computed by calculateWordProbabilities
* @return The chi distribution value calculated
*/
public double calculateChi(double[] fws) {
    // Delegate to the two-argument overload with the reverse flag switched off.
    final boolean reverse = false;
    return calculateChi(fws, reverse);
}
private double calculateChi(double[] fws, boolean reverse) {
double product = 0.0f;
double chi;
for (int i = 0; i < fws.length; i++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -