📄 bogoutil.c

📁 一个C语言写的快速贝叶斯垃圾邮件过滤工具
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* $Id: bogoutil.c,v 1.241 2007/01/01 20:17:43 relson Exp $ */

/*****************************************************************************

NAME:
  bogoutil.c -- dumps & loads bogofilter text files from/to Berkeley DB format.

AUTHORS:
  Gyepi Sam    <gyepi@praxis-sw.com>
  David Relson <relson@osagesoftware.com>

******************************************************************************/

#include "common.h"

#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#include "getopt.h"

#include "bogofilter.h"
#include "bogohist.h"
#include "bool.h"
#include "buff.h"
#include "configfile.h"
#include "datastore.h"
#include "datastore_db.h"
#include "error.h"
#include "longoptions.h"
#include "maint.h"
#include "msgcounts.h"
#include "paths.h"
#include "prob.h"
#include "rand_sleep.h"
#include "robx.h"
#include "sighandler.h"
#include "swap.h"
#include "wordlists.h"
#include "xmalloc.h"
#include "xstrdup.h"

/* prototypes for dummies below: */
#include "score.h"

const char *progname = "bogoutil";

/* Number of tokens visited by the most recent dump (see ds_dump_hook). */
static int token_count = 0;

bool  maintain = false;
bool  onlyprint = false;

/* Function Prototypes */

static int process_arg(int option, const char *name, const char *arg);

/* Function Definitions */

/* dummies to avoid score.o */
double msg_spamicity(void) { return .0; }
rc_t msg_status(void) { return RC_OK; }

/**
 * Report a failure to open the data store and terminate the program.
 *
 * Prints the offending path (and errno text when errno is set) to stderr,
 * releases the environment handle if one was created, then exits with
 * EX_ERROR.  This function does not return.
 */
static void ds_open_failure(bfpath *bfp, void *dbe)
{
    fprintf(stderr, "Error accessing file or directory '%s'.\n", bfp->filepath);
    if (errno != 0)
        fprintf(stderr, "error #%d - %s.\n", errno, strerror(errno));
    if (dbe != NULL)
        ds_cleanup(dbe);
    exit(EX_ERROR);
}

/**
 * Per-token callback handed to ds_oper() by dump_wordlist().
 *
 * Writes one line "token spamcount goodcount [date]" to fpo; the date is
 * omitted when it is zero.  Tokens rejected by discard_token() are counted
 * but not printed when maintenance mode is on.
 *
 * returns 0 if ok, 1 if not ok
 */
static int ds_dump_hook(word_t *key, dsv_t *data,
                        /*@unused@*/ void *userdata)
{
    (void)userdata;

    if (fDie)
        exit(EX_ERROR);

    token_count += 1;

    if (maintain && discard_token(key, data))
        return 0;

    if (replace_nonascii_characters)
        do_replace_nonascii_characters(key->text, key->leng);

    fprintf(fpo, "%.*s %lu %lu",
            CLAMP_INT_MAX(key->leng), key->text,
            (unsigned long)data->spamcount,
            (unsigned long)data->goodcount);
    if (data->date)
        fprintf(fpo, " %lu", (unsigned long)data->date);
    fprintf(fpo, "\n");

    /* Flush and check the stream we actually wrote to, so that a write
     * error is noticed even when the output is shorter than the buffer. */
    fflush(fpo);
    return ferror(fpo) ? 1 : 0;
}

/**
 * Dump every token of the word list at bfp through ds_dump_hook().
 *
 * Returns the status reported by ds_oper(); on success and when verbose
 * is set, the number of tokens visited is written to dbgout.
 */
static ex_t dump_wordlist(bfpath *bfp)
{
    ex_t rc;
    void *dbe;

    token_count = 0;

    dbe = ds_init(bfp);
    rc = ds_oper(dbe, bfp, DS_READ, ds_dump_hook, NULL);
    ds_cleanup(dbe);

    if (rc != EX_OK)
        fprintf(stderr, "error dumping tokens!\n");
    else if (verbose)
        fprintf(dbgout, "%d tokens dumped\n", token_count);

    return rc;
}

#define BUFSIZE 512

const char POSIX_space[] = " \f\n\r\t\v";

/**
 * Skip leading whitespace and the word that follows it.
 *
 * The first whitespace byte after the word (if any) is overwritten with
 * NUL, so the word becomes a C string in place.  Returns a pointer just
 * past that terminator, or to the existing NUL at end of input.
 */
static byte *spanword(byte *t)
{
    /* skip leading whitespace */
    t += strspn((const char *)t, POSIX_space);
    /* span current word */
    t += strcspn((const char *)t, POSIX_space);
    if (*t)
        *t++ = '\0';
    return t;
}

/** determines if the token is a regular token or a special non-count
 * token (.ROBX, .WORDLIST_VERSION), returns true if the token is a
 * count token */
static bool is_count(const char *in)
{
    static const char *const msgc = MSG_COUNT;
    static const char *const enco = WORDLIST_ENCODING;

    /* anything that doesn't start with a . is a count */
    if (in[0] != '.')
        return true;
    /* .MSG_COUNT is also a count */
    if (strcmp(in, msgc) == 0)
        return true;
    /* .ENCODING is also a count */
    if (strcmp(in, enco) == 0)
        return true;
    return false;
}

/**
 * Load "token spamcount goodcount [date]" lines from fpin into the word
 * list at bfp, inside a single transaction.
 *
 * Counts for tokens already present are added to the stored values, which
 * lets several dumps be concatenated.  Lines shorter than 4 bytes and
 * tokens longer than max_token_len (when non-zero) are skipped.
 *
 * Returns 0 on success, 1 on a parse/read/write failure, 2 when reading
 * fpin itself failed; the transaction is aborted for any non-zero value.
 */
static ex_t load_wordlist(bfpath *bfp)
{
    void *dsh;
    byte buf[BUFSIZE];
    byte *p;
    int rv = 0;
    size_t len;
    int load_count = 0;
    unsigned long line = 0;
    /* NOTE(review): the bare identifiers spamcount/goodcount used below are
     * not declared in this function; presumably project macros map them
     * onto this count[] array -- confirm before renaming or removing it. */
    unsigned long count[IX_SIZE], date;
    YYYYMMDD today_save = today;

    void *dbe = ds_init(bfp);

    dsh = ds_open(dbe, bfp, DS_WRITE | DS_LOAD);
    if (dsh == NULL)
        /* print error, cleanup, and exit */
        ds_open_failure(bfp, dbe);

    memset(buf, '\0', BUFSIZE);

    if (DST_OK != ds_txn_begin(dsh))
        exit(EX_ERROR);

    for (;;) {
        dsv_t data;
        word_t *token;

        if (fgets((char *)buf, BUFSIZE, fpin) == NULL) {
            if (ferror(fpin)) {
                perror(progname);
                rv = 2;
            }
            break;
        }

        line++;

        len = strlen((char *)buf);

        /* too short. */
        if (len < 4)
            continue;

        /* NUL-terminate the token in place; p now points at the counts */
        p = spanword(buf);
        len = strlen((const char *)buf);

        if (max_token_len != 0 &&
            len > max_token_len)
            continue;           /* too long - discard */

        spamcount = (uint) atoi((const char *)p);
        if ((int) spamcount < 0)
            spamcount = 0;
        p = spanword(p);

        goodcount = (uint) atoi((const char *)p);
        if ((int) goodcount < 0)
            goodcount = 0;
        p = spanword(p);

        date = (uint) atoi((const char *)p);
        p = spanword(p);

        /* anything left after the (optional) date is an error */
        if (*p != '\0') {
            fprintf(stderr,
                    "%s: Unexpected input [%s] on line %lu. "
                    "Expecting whitespace before count.\n",
                    progname, buf, line);
            rv = 1;
            break;
        }

        if (date == 0)                          /* date as YYYYMMDD */
            date = today_save;

        if (replace_nonascii_characters)
            do_replace_nonascii_characters(buf, len);

        token = word_new(buf, len);
        data.goodcount = goodcount;
        data.spamcount = spamcount;
        data.date = date;

        if (is_count((const char *)buf)
                && !(maintain && discard_token(token, &data))) {
            load_count += 1;
            /* Slower, but allows multiple lists to be concatenated */
            set_date(date);
            switch (ds_read(dsh, token, &data)) {
                case 0:
                case 1:
                    break;
                default:
                    rv = 1;
            }
            data.spamcount += spamcount;
            data.goodcount += goodcount;
            if (ds_write(dsh, token, &data))
                rv = 1;
        }
        word_free(token);
    }

    if (rv) {
        fprintf(stderr, "read or write error, aborting.\n");
        ds_txn_abort(dsh);
    } else {
        switch (ds_txn_commit(dsh)) {
            case DST_FAILURE:
            case DST_TEMPFAIL:
                fprintf(stderr, "commit failed\n");
                exit(EX_ERROR);
            case DST_OK:
                break;
        }
    }

    ds_close(dsh);

    ds_cleanup(dbe);

    if (verbose)
        fprintf(dbgout, "%d tokens loaded\n", load_count);

    return rv;
}

/**
 * Read one newline-terminated token from fp into buff->t.
 *
 * The trailing newline is stripped and buff->t.leng is set to the token
 * length.  Returns 0 on success, 1 on end of input or when the line does
 * not end with a newline (a diagnostic is printed in the latter case),
 * and 2 on a read error.
 */
static int get_token(buff_t *buff, FILE *fp)
{
    int rv = 0;

    if (fgets((char *)buff->t.text, buff->size, fp) == NULL) {
        if (ferror(fp)) {
            perror(progname);
            rv = 2;
        } else {
            rv = 1;
        }
    } else {
        buff->t.leng = (uint) strlen((const char *)buff->t.text);
        /* leng can be 0 when the line starts with a NUL byte; guard the
         * index so that leng - 1 cannot wrap around. */
        if (buff->t.leng > 0 && buff->t.text[buff->t.leng - 1] == '\n') {
            buff->t.leng -= 1;
            buff->t.text[buff->t.leng] = (byte) '\0';
        }
        else
        {
            fprintf(stderr,
                    "%s: Unexpected input [%s]. Does not end with newline "
                    "or line too long.\n",
                    progname, buff->t.text);
            rv = 1;
        }
    }
    return rv;
}

/**
 * Print the spam/good counts (and optionally a probability computed with
 * calc_prob()) for a set of words looked up in the word list at bfp.
 *
 * Words are taken from argv when argc > 0; when argc == 0 they are read
 * one per line from stdin until get_token() reports a non-zero status.
 * Words not found in the data store are silently skipped.
 *
 * Returns 0 on success or EX_ERROR on failure.
 */
static ex_t display_words(bfpath *bfp, int argc, char **argv, bool show_probability)
{
    byte buf[BUFSIZE];
    buff_t *buff = buff_new(buf, 0, BUFSIZE);
    const byte *word = buf;

    const char *path = bfp->filepath;

    const char *head_format = !show_probability ? "%-30s %6s %6s\n"   : "%-30s %6s  %6s  %6s\n";
    const char *data_format = !show_probability ? "%-30s %6lu %6lu\n" : "%-30s %6lu  %6lu  %f\n";

    void *dsh = NULL; /* initialize to silence bogus gcc warning */
    void *dbe;

    int rv = 0;

    dsv_t msgcnts;

    /* protect against broken stat(2) that succeeds for empty names */
    if (path == NULL || *path == '\0') {
        fprintf(stderr, "Expecting non-empty directory or file name.\n");
        buff_free(buff);
        return EX_ERROR;
    }

    dbe = ds_init(bfp);
    dsh = ds_open(dbe, bfp, DS_READ);
    if (dsh == NULL)
        /* print error, cleanup, and exit */
        ds_open_failure(bfp, dbe);

    if (DST_OK != ds_txn_begin(dsh)) {
        ds_close(dsh);
        ds_cleanup(dbe);
        buff_free(buff);
        fprintf(stderr, "Cannot begin transaction.\n");
        return EX_ERROR;
    }

    if (show_probability)
    {
        ds_get_msgcounts(dsh, &msgcnts);
        robs = ROBS;
        robx = ROBX;
    }

    /* the fourth header argument is ignored by the three-column format */
    fprintf(fpo, head_format, "", "spam", "good", "  Fisher");

    /* argc > 0: words from argv; argc == 0: words from stdin;
     * argc is forced to -1 after the last argv word to end the loop */
    while (argc >= 0)
    {
        dsv_t val;
        word_t *token;
        int rc;

        unsigned long spam_count;
        unsigned long good_count;
        double rob_prob = 0.0;

        if (argc == 0)
        {
            if (get_token(buff, stdin) != 0)
                break;
            token = &buff->t;
        } else {
            word = (const byte *) *argv++;
            if (--argc == 0)
                argc = -1;
            token = word_news((const char *)word);
        }

        rc = ds_read(dsh, token, &val);
        switch (rc) {
            case 0:
                spam_count = val.spamcount;
                good_count = val.goodcount;

                if (!show_probability)
                    fprintf(fpo, data_format, token->text, spam_count, good_count);
                else
                {
                    rob_prob = calc_prob(good_count, spam_count, msgcnts.goodcount, msgcnts.spamcount);
                    fprintf(fpo, data_format, token->text, spam_count, good_count, rob_prob);
                }
                break;
            case 1:
                break;
            default:
                fprintf(stderr, "Cannot read from database.\n");
                rv = EX_ERROR;
                /* release the argv-derived token before leaving the loop */
                if (token != &buff->t)
                    word_free(token);
                goto finish;
        }

        if (token != &buff->t)
            word_free(token);
    }

finish:
    /* abort after an error, commit otherwise; a non-zero result from
     * either call is itself reported as an error */
    if ((DST_OK != rv) ? ds_txn_abort(dsh) : ds_txn_commit(dsh)) {
        fprintf(stderr, "Cannot %s transaction.\n", rv ? "abort" : "commit");
        rv = EX_ERROR;
    }
    ds_close(dsh);
    ds_cleanup(dbe);

    buff_free(buff);

    return rv;
}

/**
 * Compute the Robinson x value for the word list at bfp.
 *
 * With onlyprint set the value is printed to stdout; otherwise it is
 * stored under the ROBX_W token as (rx * 1000000) in the spam count,
 * retrying the write for as long as ds_write() reports DS_ABORT_RETRY.
 *
 * Returns EX_ERROR when the computed value is negative or the write
 * fails, EX_OK otherwise.
 */
static ex_t get_robx(bfpath *bfp)
{
    double rx;
    int ret = 0;

    init_wordlist("word", bfp->filepath, 0, WL_REGULAR);
    rx = compute_robinson_x();
    if (rx < 0)
        return EX_ERROR;

    if (onlyprint)
        printf("%f\n", rx);
    else {
        dsv_t val;
        word_t *word_robx = word_news(ROBX_W);

        /* since compute_robinson_x() closes the wordlists,
           init_wordlist() must be called again */
        init_wordlist("word", bfp->filepath, 0, WL_REGULAR);

        open_wordlists(DS_WRITE);

        val.goodcount = 0;
        val.spamcount = (uint32_t) (rx * 1000000);
        do {
            ret = ds_write(word_lists->dsh, word_robx, &val);
            if (ret == DS_ABORT_RETRY) {
                rand_sleep(1000, 1000000);
                begin_wordlist(word_lists);
            }
        } while (ret == DS_ABORT_RETRY);
        close_wordlists(true);
        free_wordlists();

        word_free(word_robx);
    }

    return ret ? EX_ERROR : EX_OK;
}

/** Print program name, version, data store version and licence notice to stdout. */
static void print_version(void)
{
    (void)fprintf(stdout,
                  "%s version %s\n"
                  "    Database: %s\n"
                  "Copyright (C) 2002-2007 David Relson, Matthias Andree\n"
                  "Copyright (C) 2002-2003 Gyepi Sam.\n\n"
                  "%s comes with ABSOLUTELY NO WARRANTY.  "
                  "This is free software, and\nyou are welcome to "
                  "redistribute it under the General Public License.  "
                  "See\nthe COPYING file with the source distribution for "
                  "details.\n"
                  "\n",
                  progname, version, ds_version_str(), PACKAGE);
}

/**
 * Print the command-line synopsis to fp.
 *
 * Which synopsis lines appear depends on the data store back end selected
 * at compile time.
 */
static void usage(FILE *fp)
{
    fprintf(fp, "Usage: %s {-h|-V}\n", progname);
    fprintf(fp, "   or: %s [OPTIONS] {-d|-l|-u|-m|-w|-p|--db-verify} file%s\n",
            progname, DB_EXT);
    fprintf(fp, "   or: %s [OPTIONS] {-H|-r|-R} file\n", progname);
#if defined (ENABLE_DB_DATASTORE) || defined (ENABLE_SQLITE_DATASTORE)
    fprintf(fp, "   or: %s [OPTIONS] {--db-print-leafpage-count} file%s\n",
            progname, DB_EXT);
    fprintf(fp, "   or: %s [OPTIONS] {--db-print-pagesize} file%s\n",
            progname, DB_EXT);
#endif
#if	defined(ENABLE_DB_DATASTORE) && !defined(DISABLE_TRANSACTIONS)
    fprintf(fp, "   or: %s [OPTIONS] {--db-checkpoint} directory\n",
            progname);
    fprintf(fp, "   or: %s [OPTIONS] {--db-list-logfiles} directory [list options]\n",
            progname);
    fprintf(fp, "   or: %s [OPTIONS] {--db-prune|--db-remove-environment} directory\n",
            progname);
    fprintf(fp, "   or: %s [OPTIONS] {--db-recover|--db-recover-harder} directory\n",
            progname);
#endif
}

/* Option help strings; each entry is one newline-terminated line. */
static const char *help_text[] = {
    "\n",
    "OPTIONS are:\n",
    "  -C, --no-config-file        - don't read standard config files.\n",
    "  -D, --debug-to-stdout       - direct debug output to stdout.\n",
#ifdef	ENABLE_DB_DATASTORE
    "  -k, --db-cachesize=size     - set Berkeley DB cache size (MB).\n",
#endif
    "  -v, --verbosity             - set debug verbosity level.\n",
    "  -x, --debug-flags=list      - set flags to display debug information.\n",
    "  -y, --timestamp-date=date   - set default date (format YYYYMMDD).\n",
    "\n",
    "Modes of operation are:\n",
    "  -h, --help                  - print this help message and exit.\n",
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -