📄 score.c

📁 一个C语言写的快速贝叶斯垃圾邮件过滤工具
💻 C
字号:
/* $Id: score.c,v 1.58 2006/06/10 02:03:47 relson Exp $ */

/*****************************************************************************

NAME:
   score.c -- implements Fisher variant on Robinson algorithm.

******************************************************************************/

#include "common.h"

#include <math.h>
#include <string.h>
#include <stdlib.h>

#include "bogoconfig.h"
#include "bogofilter.h"
#include "collect.h"
#include "datastore.h"
#include "msgcounts.h"
#include "prob.h"
#include "rand_sleep.h"
#include "rstats.h"
#include "score.h"
#include "wordhash.h"
#include "wordlists.h"

#if defined(HAVE_GSL_10) && !defined(HAVE_GSL_14)
/* HAVE_GSL_14 implies HAVE_GSL_10
 * if we have neither, we'll use our included GSL 1.4, which knows CDFs
 * if we have both, we have GSL 1.4, which knows CDFs
 *
 * in other cases, we need to integrate the PDF to get the CDF
 */
#define GSL_INTEGRATE_PDF
#include "gsl/gsl_randist.h"
#include "gsl/gsl_integration.h"
#include "gsl/gsl_errno.h"
#else
#include "gsl/gsl_cdf.h"
#endif

/* Function Prototypes */

static double get_spamicity(size_t robn, FLOAT P, FLOAT Q);

/* Static Variables */

/* Results of the most recent get_spamicity() call. */
static score_t score;

/* Function Definitions */

/** \return the spamicity stored by the most recent scoring pass. */
double msg_spamicity(void)
{
    return score.spamicity;
}

/** Classify the stored spamicity against the configured cutoffs.
 * \return RC_SPAM when spamicity >= spam_cutoff; RC_HAM when ham_cutoff
 * is effectively zero (< EPS) or spamicity <= ham_cutoff; RC_UNSURE
 * otherwise.
 */
rc_t msg_status(void)
{
    if (score.spamicity >= spam_cutoff)
        return RC_SPAM;

    if ((ham_cutoff < EPS) || (score.spamicity <= ham_cutoff))
        return RC_HAM;

    return RC_UNSURE;
}

/** Print the rstats table when requested (Rtable), when the message is
 * "unsure" and unsure_stats/verbose are set, or when verbose >= 2.
 * Does nothing when quiet is set.
 * \param fp unused; kept for interface compatibility.
 */
void msg_print_stats(FILE *fp)
{
    bool unsure = unsure_stats && (msg_status() == RC_UNSURE) && verbose;

    (void)fp;

    if (quiet)
        return;

    if (Rtable || unsure || verbose >= 2)
        rstats_print(unsure);
}

/** search token in all lists according to precedence, summing up the
 * counts (all lists at same precedence are used); if found, set cnts
 * accordingly.
 *
 * In bogotune mode the counts are copied from the in-memory wordhash
 * instead and no wordlist is consulted.
 *
 * \return 0 on success; otherwise the ds_read() status that stopped the
 * search.  On DS_ABORT_RETRY the function sleeps, calls begin_wordlist()
 * on the failing list and returns DS_ABORT_RETRY so the caller can start
 * over.
 */
static int lookup(const word_t *token, wordcnts_t *cnts)
{
    int override = 0;
    wordlist_t *list;

    if (fBogotune) {
        wordprop_t *wp = wordhash_search_memory(token);
        if (wp) {
            cnts->good = wp->cnts.good;
            cnts->bad  = wp->cnts.bad;
        }
        return 0;
    }

    /* The loop below accumulates with "+=", so every total must start
     * from zero.  Without resetting good/bad, a DS_ABORT_RETRY restart in
     * lookup_words() would add the counts of already-processed tokens a
     * second time. */
    cnts->good = cnts->bad = 0;
    cnts->msgs_bad = cnts->msgs_good = 0;

    for (list = word_lists; list != NULL; list = list->next)
    {
        dsv_t val;
        int ret;

        if (override > list->override)  /* if already found */
            break;

        ret = ds_read(list->dsh, token, &val);

        /* check if we have the token */
        switch (ret) {
            case 0:
                /* token found, pass on */
                break;
            case 1:
                /* token not found, clear counts */
                val.count[IX_GOOD] = 0;
                val.count[IX_SPAM] = 0;
                break;
            case DS_ABORT_RETRY:
                /* sleep, reinitialize and start over */
                rand_sleep(1000, 1000000);
                begin_wordlist(list);
                /* FALLTHROUGH */
            default:
                return ret;
        }

        if (ret == 0 && list->type == WL_IGNORE) {  /* if found on ignore list */
            cnts->good = cnts->bad = 0;
            break;
        }

        override = list->override;

        if (DEBUG_ALGORITHM(2)) {
            fprintf(dbgout, "%6d %5u %5u %5u %5u list=%s,%c,%d ",
                    ret, (uint)val.count[IX_GOOD], (uint)val.count[IX_SPAM],
                    (uint)list->msgcount[IX_GOOD], (uint)list->msgcount[IX_SPAM],
                    list->listname, list->type, list->override);
            word_puts(token, 0, dbgout);
            fputc('\n', dbgout);
        }

        cnts->good += val.count[IX_GOOD];
        cnts->bad += val.count[IX_SPAM];
        cnts->msgs_good += list->msgcount[IX_GOOD];
        cnts->msgs_bad += list->msgcount[IX_SPAM];
    }

    if (DEBUG_ALGORITHM(1)) {
        fprintf(dbgout, "%5u %5u ", (uint)cnts->bad, (uint)cnts->good);
        word_puts(token, 0, dbgout);
        fputc('\n', dbgout);
    }

    return 0;
}

/** do wordlist lookups for the words in the wordhash.
 * Returns immediately when msg_count_file is set (counts already done).
 * Restarts the whole traversal whenever lookup() reports DS_ABORT_RETRY.
 */
void lookup_words(wordhash_t *wh)
{
    int ret;
    hashnode_t *node;

    if (msg_count_file)  /* if mc file, already done */
        return;

retry:
    for (node = wordhash_first(wh); node != NULL; node = wordhash_next(wh))
    {
        word_t *token     = node->key;
        wordprop_t *props = (wordprop_t *) node->buf;
        wordcnts_t *cnts  = &props->cnts;

        ret = lookup(token, cnts);
        if (ret == DS_ABORT_RETRY)
            /* start all over, the message counts may have changed
             * lookup handles reinitializing the wordlist */
            goto retry;
    }

    return;
}

/** selects the best spam/non-spam indicators and calculates Robinson's S.
 * Walks every node of \a wh, computes its probability with calc_prob()
 * and folds tokens whose distance from EVEN_ODDS exceeds min_dev into the
 * running products P and Q.
 * \param wh wordhash holding the message tokens and their counts.
 * \param fp unused; kept for interface compatibility.
 * \return S as computed by get_spamicity().  (No error path exists in
 * this implementation, so -1.0 is never returned.)
 */
double msg_compute_spamicity(wordhash_t *wh, FILE *fp) /*@globals errno@*/
{
    hashnode_t *node;

    FLOAT P = {1.0, 0};  /* Robinson's P */
    FLOAT Q = {1.0, 0};  /* Robinson's Q */

    double spamicity;
    size_t robn = 0;     /* number of tokens that contributed to P and Q */
    size_t count = 0;    /* number of tokens examined */
    bool need_stats = (Rtable || passthrough || (verbose > 0)) && !fBogotune;

    (void) fp;  /* quench compiler warning */

    if (DEBUG_ALGORITHM(2))
        fprintf(dbgout, "### msg_compute_spamicity() begins\n");

    if (DEBUG_ALGORITHM(2))
        fprintf(dbgout, "min_dev: %f, robs: %f, robx: %f\n",
                min_dev, robs, robx);

    for (node = wordhash_first(wh); node != NULL; node = wordhash_next(wh))
    {
        double prob;
        word_t *token;
        wordcnts_t *cnts;
        wordprop_t *props;

        if (!fBogotune) {
            props = (wordprop_t *) node->buf;
            cnts  = &props->cnts;
            token = node->key;
        } else {
            /* bogotune mode: the node itself is treated as the counts
             * record and there is no token text */
            cnts = (wordcnts_t *) node;
            token = NULL;
        }

        count += 1;

        prob = calc_prob(cnts->good, cnts->bad,
                         cnts->msgs_good, cnts->msgs_bad);

        if (need_stats)
            rstats_add(token, prob, cnts);

        /* Robinson's P and Q; accumulation step */
        /*
         * P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)     [spamminess]
         * Q = 1 - (p1*p2*...*pn)^(1/n)                 [non-spamminess]
         */
        if (fabs(EVEN_ODDS - prob) - min_dev >= EPS) {
            int e;

            /* Keep each product as mantissa * 2^exp so that long
             * messages do not underflow a double. */
            P.mant *= 1 - prob;
            if (P.mant < 1.0e-200) {
                P.mant = frexp(P.mant, &e);
                P.exp += e;
            }

            Q.mant *= prob;
            if (Q.mant < 1.0e-200) {
                Q.mant = frexp(Q.mant, &e);
                Q.exp += e;
            }
            robn++;
        }

        if (DEBUG_ALGORITHM(3)) {
            (void)fprintf(dbgout, "%3lu %3lu %f ",
                          (unsigned long)robn, (unsigned long)count, prob);
            /* token is NULL in bogotune mode; do not hand NULL to
             * word_puts() */
            if (token != NULL)
                (void)word_puts(token, 0, dbgout);
            (void)fputc('\n', dbgout);
        }
    }

    /* Robinson's P, Q and S
    ** S = (P - Q) / (P + Q)                        [combined indicator]
    */

    spamicity = get_spamicity(robn, P, Q);

    if (need_stats && robn != 0)
        rstats_fini(robn, P, Q, spamicity);

    if (DEBUG_ALGORITHM(2))
        fprintf(dbgout, "### msg_compute_spamicity() ends\n");

    return spamicity;
}

/** Initialise scoring parameters.
 * Calls rstats_init(), replaces effectively-zero min_dev, spam_cutoff and
 * robs with their compiled-in defaults, and, when robx is effectively
 * zero, takes it from the ROBX_W entry of the default wordlist (falling
 * back to ROBX).  Exits with EX_ERROR if the resulting robx lies outside
 * [0.0, 1.0].
 */
void score_initialize(void)
{
    word_t *word_robx = word_news(ROBX_W);

    wordlist_t *list = get_default_wordlist(word_lists);

    rstats_init();

    if (fabs(min_dev) < EPS)
        min_dev = MIN_DEV;
    if (spam_cutoff < EPS)
        spam_cutoff = SPAM_CUTOFF;

    /*
    ** If we're classifying messages, we need to compute the scalefactor
    ** (from the .MSG_COUNT values)
    ** If we're registering tokens, we needn't get .MSG_COUNT
    */

    if (fabs(robs) < EPS)
        robs = ROBS;

    if (fabs(robx) < EPS)
    {
        /* Assign default value in case there's no wordlist
         * or no wordlist entry */
        robx = ROBX;

        /* Guard against a missing list as well as an unopened one. */
        if (list != NULL && list->dsh != NULL)
        {
            int ret;
            dsv_t val;

            /* Note: .ROBX is scaled by 1000000 in the wordlist */
            ret = ds_read(list->dsh, word_robx, &val);
            if (ret != 0)
                robx = ROBX;
            else {
                /* If found, unscale; else use predefined value */
                uint l_robx = val.count[IX_SPAM];
                robx = l_robx ? (double)l_robx / 1000000 : ROBX;
            }
        }
    }

    if (robx < 0.0 || 1.0 < robx) {
        fprintf(stderr, "Invalid robx value (%f).  Must be between 0.0 and 1.0\n", robx);
        exit(EX_ERROR);
    }

    word_free(word_robx);

    return;
}

/** Release scoring resources (delegates to rstats_cleanup()). */
void score_cleanup(void)
{
    rstats_cleanup();
}

#ifdef GSL_INTEGRATE_PDF

/** Integrand for gsl_integration_qag(): the chi-square PDF at x, with the
 * degrees of freedom passed through \a p as a pointer to double. */
static double chisq(double x, void *p)
{
    return (gsl_ran_chisq_pdf(x, *(double *)p));
}

/** Upper-tail chi-square probability for \a x with \a df degrees of
 * freedom, obtained by integrating the PDF over [0, x] and taking the
 * complement, clamped to [0.0, 1.0].  Exits with EX_ERROR on allocation
 * failure or on any integration error other than GSL_EMAXITER.
 */
static inline double prbf(double x, double df)
{
    gsl_function chi;
    int status;
    double p, abserr;
    const int intervals = 15;
    const double eps = 1000 * DBL_EPSILON;

    gsl_integration_workspace *w;
    chi.function = chisq;
    chi.params = &df;
    gsl_set_error_handler_off();
    w = gsl_integration_workspace_alloc(intervals);
    if (!w) {
        fprintf(stderr, "Out of memory! %s:%d\n", __FILE__, __LINE__);
        exit(EX_ERROR);
    }
    status = gsl_integration_qag(&chi, 0, x, eps, eps,
                                 intervals, GSL_INTEG_GAUSS41, w, &p, &abserr);
    if (status && status != GSL_EMAXITER) {
        fprintf(stderr, "Integration error: %s\n", gsl_strerror(status));
        exit(EX_ERROR);
    }
    gsl_integration_workspace_free(w);
    p = max(0.0, 1.0 - p);
    return (min(1.0, p));
}

#else

/** Upper-tail chi-square probability for \a x with \a df degrees of
 * freedom via gsl_cdf_chisq_Q(); results below DBL_EPSILON are flushed
 * to 0.0. */
static inline double prbf(double x, double df)
{
    double r = gsl_cdf_chisq_Q(x, df);
    return (r < DBL_EPSILON) ? 0.0 : r;
}

#endif

/** Combine the accumulated products into the final spamicity and store
 * the intermediate results in the file-level score record.
 * \param robn number of tokens folded into P and Q.
 * \param P    product of (1 - p_i), as mantissa and binary exponent.
 * \param Q    product of p_i, as mantissa and binary exponent.
 * \return the spamicity; robx when robn is 0.
 */
static double get_spamicity(size_t robn, FLOAT P, FLOAT Q)
{
    /* Always record the token count so that msg_print_summary() never
     * reports a stale value left over from a previous message.  When
     * robn is 0, p_ln/q_ln/p_pr/q_pr are left untouched. */
    score.robn = robn;

    if (robn == 0)
    {
        score.spamicity = robx;
    }
    else
    {
        double sp_df = 2.0 * robn * sp_esf;
        double ns_df = 2.0 * robn * ns_esf;
        double ln2 = log(2.0);                                  /* ln(2) */

        /* convert to natural logs */
        score.p_ln = (log(P.mant) + P.exp * ln2) * sp_esf;      /* invlogsum */
        score.q_ln = (log(Q.mant) + Q.exp * ln2) * ns_esf;      /* logsum */

        score.p_pr = prbf(-2.0 * score.p_ln, sp_df);            /* compute P */
        score.q_pr = prbf(-2.0 * score.q_ln, ns_df);            /* compute Q */

        if (!fBogotune && sp_esf >= 1.0 && ns_esf >= 1.0) {
            score.spamicity = (1.0 + score.q_pr - score.p_pr) / 2.0;
        } else if (score.q_pr < DBL_EPSILON && score.p_pr < DBL_EPSILON) {
            /* both tails vanished: avoid 0/0 below */
            score.spamicity = 0.5;
        } else {
            score.spamicity = score.q_pr / (score.q_pr + score.p_pr);
        }
    }

    return score.spamicity;
}

/** Write the summary line(s) (N, P, Q, S and robs, robx, min_dev) to fpo,
 * each prefixed with \a pfx.  The layout depends on Rtable.
 */
void msg_print_summary(const char *pfx)
{
    if (!Rtable) {
        (void)fprintf(fpo, "%s%-*s %6lu %9.6f %9.6f %9.6f\n",
                      pfx, max_token_len+2, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
                      score.p_pr, score.q_pr, score.spamicity);
        (void)fprintf(fpo, "%s%-*s  %9.6f %9.6f %9.6f\n",
                      pfx, max_token_len+2+6, " ", robs, robx, min_dev);
    }
    else {
        /* Trim token to 22 characters to accommodate R's default line length of 80 */
        (void)fprintf(fpo, "%s%-24s %6lu %9.2e %9.2e %9.2e %9.2e %9.2e %5.3f\n",
                      pfx, "N_P_Q_S_s_x_md", (unsigned long)score.robn,
                      score.p_pr, score.q_pr, score.spamicity, robs, robx, min_dev);
    }
}

/* Done */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -