⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 chisquaredcalculator.java

📁 spam source codejasen-0.9jASEN - java Anti Spam ENgine.zip 如标题所示
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 * @(#)ChiSquaredCalculator.java	3/11/2004
 *
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.calculators;

import org.jasen.core.engine.JasenEngineConfiguration;
import org.jasen.core.engine.JasenMap;
import org.jasen.core.engine.JasenToken;

/**
 * <P>
 * 	Performs all the chi probability calculations required by jASEN.
 * </P>
 * <p>
 * 	This is the core calculation class which ultimately determines the spam score for a message.
 * </p>
 * <p>
 * 	Most of the methods herein are a direct port from the Python implementation published by Gary Robinson.
 * </p>
 * @author Jason Polites
 */
public class ChiSquaredCalculator
{

    /**
     *
     */
    public ChiSquaredCalculator() {
        super ();
    }

    /**
     * Combines the per-word probabilities of a message into a single indicator value.
     * <p>
     * The indicator is defined as
     * <pre>
     *   I = H / (H + S)
     * </pre>
     * where:
     * <ul>
     *  <li>H is the value returned by {@link #calculateH(double[])}</li>
     *  <li>S is the value returned by {@link #calculateS(double[])}</li>
     * </ul>
     * Both are evaluated over the array produced by
     * {@link #calculateWordProbabilities(String[], JasenMap)}.
     * </p>
     * <p>
     * If <code>H + S</code> is zero (for example when both values underflow to 0.0) or is
     * not a number, the quotient is undefined. In that case the neutral value 0.5 is
     * returned instead of NaN; 0.5 is also what the ratio yields whenever H equals S.
     * </p>
     * @param words The word tokens extracted from the message
     * @param map The token map
     * @return The overall "spamminess" of the words, or 0.5 when H and S carry no usable evidence
     */
    public double confirmHypothesis(String[] words, JasenMap map) {
        // Per-word probabilities feed both combined values.
        final double[] fws = calculateWordProbabilities(words, map);

        final double S = calculateS(fws);
        final double H = calculateH(fws);
        final double total = H + S;

        // Guard the division: 0.0 / 0.0 would otherwise leak NaN to the caller.
        if (total == 0.0d || Double.isNaN(total)) {
            return 0.5d;
        }

        return H / total;
    }

    /**
     * Computes the H component used by the overall indicator.
     * <p>
     * The chi value of the supplied probabilities is obtained from
     * {@link #calculateChi(double[])} and is then passed, together with the number of
     * probabilities, to <code>calculateInverseChiSquare</code>.
     * </p>
     * @param probs The word probabilities computed by calculateWordProbabilities
     * @return The inverse chi-square result; by contract a value between 0.0 and 1.0 where
     *         1.0 indicates high probability of HAM (non spam)
     * @see ChiSquaredCalculator#calculateWordProbabilities(String[], JasenMap)
     */
    public double calculateH(double[] probs) {
        // Read the count first so a null array fails here, before any calculation starts.
        final int wordCount = probs.length;
        final double chiValue = calculateChi(probs);

        return calculateInverseChiSquare(chiValue, wordCount);
    }

    /**
     * Computes the S component used by the overall indicator.
     * <p>
     * The reverse chi value of the supplied probabilities is obtained from
     * <code>calculateReverseChi</code> and is then passed, together with the number of
     * probabilities, to <code>calculateInverseChiSquare</code>.
     * </p>
     * @param probs The word probabilities computed by calculateWordProbabilities
     * @return The inverse chi-square result; by contract a value between 0.0 and 1.0 where
     *         1.0 indicates high probability of SPAM
     * @see ChiSquaredCalculator#calculateWordProbabilities(String[], JasenMap)
     */
    public double calculateS(double[] probs) {
        // Read the count first so a null array fails here, before any calculation starts.
        final int wordCount = probs.length;
        final double reverseChiValue = calculateReverseChi(probs);

        return calculateInverseChiSquare(reverseChiValue, wordCount);
    }



    /**
     * Calculates the probability of each word indicating spam.
     * <p>
     * Specifically, this method uses the following approach from Gary Robinson:
     * </p>
     * <p>
     *  b(w) = (the number of spam e-mails containing the word w) / (the total number of spam e-mails) <br/>
     *  g(w) = (the number of ham e-mails containing the word w) / (the total number of ham e-mails) <br/>
     *  p(w) = b(w) / (b(w) + g(w)) <br/>
     * </p>
     * <p>
     *  Then we calculate:
     * </p>
     * <pre>
     *  f(w) = ((s * x) + (m * p(w))) / (s + m)
     * </pre>
     * <p>
     *  Where:
     * </p>
     * <ul>
     *  <li>s is the strength we want to give to our background information
     *      (the configured confidence)</li>
     *  <li>x is our assumed probability that a word we have no other experience of
     *      will first appear in a spam (the configured guess)</li>
     *  <li>m is the number of e-mails we have received that contain word w, i.e. the
     *      token's spam count plus its ham count</li>
     *  <li>f(w) is the final probability returned</li>
     * </ul>
     * <p>
     *  A word receives the guess <code>x</code> unchanged when it has no token in the map,
     *  or when its token carries no usable evidence (an observation total of zero, or
     *  zero spam and ham counts), so that no NaN or infinite value is ever produced by a
     *  division by zero.
     * </p>
     * @param words The set of words for which the probabilities will be calculated
     * @param map The map of word probabilities
     * @return An array with one entry per word, in the same order as <code>words</code>,
     *         giving the probability that the word indicates spam
     */
    public double[] calculateWordProbabilities(String[] words, JasenMap map) {

        final double s = JasenEngineConfiguration.getInstance().getConfidence();
        final double x = JasenEngineConfiguration.getInstance().getGuess();

        // The corpus totals are the same for every word, so read them once.
        final double spamObservations = map.getSpamObservations();
        final double hamObservations = map.getHamObservations();

        final double[] fws = new double[words.length];

        for (int i = 0; i < words.length; i++)
        {
            final JasenToken token = map.getToken(words[i]);

            if (token == null) {
                // Never seen this word: fall back to the background guess.
                fws[i] = x;
                continue;
            }

            final double spamCount = token.getSpamCount();
            final double hamCount = token.getHamCount();

            // A zero total means there is no data on that side; treat its ratio as 0
            // rather than dividing by zero.
            final double bw = spamObservations > 0.0d ? spamCount / spamObservations : 0.0d;
            final double gw = hamObservations > 0.0d ? hamCount / hamObservations : 0.0d;

            // m is the number of messages containing THIS word, not the size of the
            // whole corpus; using the corpus size would drown out the (s * x) prior.
            final double m = spamCount + hamCount;

            final double ratioSum = bw + gw;
            final double weight = s + m;

            if (!(ratioSum > 0.0d) || weight == 0.0d) {
                // No usable evidence for this token; p(w) or f(w) would be undefined.
                fws[i] = x;
                continue;
            }

            final double pw = bw / ratioSum;

            fws[i] = ((s * x) + (m * pw)) / weight;
        }

        return fws;
    }

    /**
     * Calculates the chi distribution value of the word probabilities.
     * <p>
     *  By contract this is defined as:
     * </p>
     * <pre>
     *   -2 ln ( product of all f(w) )
     * </pre>
     * <p>
     *  where ln is the natural logarithm and f(w) are the individual word probabilities.
     *  The computation itself is delegated to the private two-argument overload with
     *  its <code>reverse</code> flag switched off.
     * </p>
     * @param fws The word probabilities, as produced by calculateWordProbabilities
     * @return The chi distribution value calculated
     */
    public double calculateChi(double[] fws) {
        // The public entry point always requests the non-reversed form.
        final boolean reverse = false;

        return calculateChi(fws, reverse);
    }

    private double calculateChi(double[] fws, boolean reverse) {

        double product = 0.0f;
        double chi;

        for (int i = 0; i < fws.length; i++)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -