📄 bogoutil.c
字号:
/* $Id: bogoutil.c,v 1.241 2007/01/01 20:17:43 relson Exp $ */

/*****************************************************************************

NAME:
   bogoutil.c -- dumps & loads bogofilter text files from/to Berkeley DB format.

AUTHORS:
   Gyepi Sam <gyepi@praxis-sw.com>
   David Relson <relson@osagesoftware.com>

******************************************************************************/

#include "common.h"

#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#include "getopt.h"

#include "bogofilter.h"
#include "bogohist.h"
#include "bool.h"
#include "buff.h"
#include "configfile.h"
#include "datastore.h"
#include "datastore_db.h"
#include "error.h"
#include "longoptions.h"
#include "maint.h"
#include "msgcounts.h"
#include "paths.h"
#include "prob.h"
#include "rand_sleep.h"
#include "robx.h"
#include "sighandler.h"
#include "swap.h"
#include "wordlists.h"
#include "xmalloc.h"
#include "xstrdup.h"

/* prototypes for dummies below: */
#include "score.h"

const char *progname = "bogoutil";

/* number of tokens seen by ds_dump_hook() since the last dump_wordlist() */
static int token_count = 0;

bool maintain = false;
bool onlyprint = false;

/* Function Prototypes */

static int process_arg(int option, const char *name, const char *arg);

/* Function Definitions */

/* dummies to avoid score.o */
double msg_spamicity(void) { return .0; }
rc_t msg_status(void) { return RC_OK; }

/** Reports a failure to open the data store at \a bfp, releases \a dbe
 * (when non-NULL) with ds_cleanup(), and terminates the process with
 * EX_ERROR.  Never returns. */
static void ds_open_failure(bfpath *bfp, void *dbe)
{
    /* fprintf() is allowed to modify errno, so capture the value that
     * describes the failed open before printing anything */
    const int saved_errno = errno;

    fprintf(stderr, "Error accessing file or directory '%s'.\n", bfp->filepath);
    if (saved_errno != 0)
        fprintf(stderr, "error #%d - %s.\n", saved_errno, strerror(saved_errno));
    if (dbe != NULL)
        ds_cleanup(dbe);
    exit(EX_ERROR);
}

/** Callback handed to ds_oper() by dump_wordlist(): prints one token as
 * "text spamcount goodcount [date]" on fpo.  The date column is omitted
 * when it is zero.  Tokens rejected by discard_token() are skipped (but
 * still counted in token_count) when \a maintain is set.
 * \return 0 if ok, 1 if the output stream is in an error state */
static int ds_dump_hook(word_t *key, dsv_t *data,
                        /*@unused@*/ void *userdata)
{
    (void)userdata;

    if (fDie)
        exit(EX_ERROR);

    token_count += 1;

    if (maintain && discard_token(key, data))
        return 0;

    if (replace_nonascii_characters)
        do_replace_nonascii_characters(key->text, key->leng);

    fprintf(fpo, "%.*s %lu %lu",
            CLAMP_INT_MAX(key->leng), key->text,
            (unsigned long)data->spamcount,
            (unsigned long)data->goodcount);
    if (data->date)
        fprintf(fpo, " %lu", (unsigned long)data->date);
    fprintf(fpo, "\n");

    /* solicit ferror flag if output is shorter than buffer; flush and
     * check the stream that was actually written to */
    fflush(fpo);
    return ferror(fpo) ? 1 : 0;
}

/** Dumps the wordlist at \a bfp by running ds_oper() in DS_READ mode with
 * ds_dump_hook() as its callback.
 * \return the status returned by ds_oper() */
static ex_t dump_wordlist(bfpath *bfp)
{
    ex_t rc;
    void *dbe;

    token_count = 0;

    dbe = ds_init(bfp);
    rc = ds_oper(dbe, bfp, DS_READ, ds_dump_hook, NULL);
    ds_cleanup(dbe);

    if (rc != EX_OK)
        fprintf(stderr, "error dumping tokens!\n");
    else if (verbose)
        fprintf(dbgout, "%d tokens dumped\n", token_count);

    return rc;
}

#define BUFSIZE 512

const char POSIX_space[] = " \f\n\r\t\v";

/** Skips leading whitespace and the word that follows it in \a t,
 * overwrites the first whitespace byte after the word with '\0' and
 * returns a pointer just past it.  When the word runs up to the end of
 * the string, the returned pointer addresses the terminating '\0'. */
static byte *spanword(byte *t)
{
    /* skip leading whitespace */
    t += strspn((const char *)t, POSIX_space);
    /* span current word */
    t += strcspn((const char *)t, POSIX_space);
    if (*t)
        *t++ = '\0';
    return t;
}

/** determines if the token is a regular token or a special non-count
 * token (.ROBX, .WORDLIST_VERSION), returns true if the token is a
 * count token */
static bool is_count(const char *in)
{
    static const char *const msgc = MSG_COUNT;
    static const char *const enco = WORDLIST_ENCODING;

    /* anything that doesn't start with a . is a count */
    if (in[0] != '.')
        return true;
    /* .MSG_COUNT is also a count */
    if (strcmp(in, msgc) == 0)
        return true;
    /* .ENCODING is also a count */
    if (strcmp(in, enco) == 0)
        return true;
    return false;
}

/** Reads "token spamcount goodcount [date]" lines from fpin and adds the
 * counts to the wordlist at \a bfp inside a single transaction.  Lines
 * shorter than four bytes and tokens longer than max_token_len (when
 * that is non-zero) are silently skipped.  The transaction is aborted if
 * any read, parse or write error occurred, committed otherwise.
 * \return 0 on success, 1 on a parse or data store error, 2 on an input
 * stream error */
static ex_t load_wordlist(bfpath *bfp)
{
    void *dsh;
    byte buf[BUFSIZE];
    byte *p;
    int rv = 0;
    size_t len;
    int load_count = 0;
    unsigned long line = 0;
    /* NOTE(review): spamcount/goodcount below are not declared in this
     * function; presumably they are project macros that index count[] --
     * confirm against the data store header */
    unsigned long count[IX_SIZE], date;
    YYYYMMDD today_save = today;

    void *dbe = ds_init(bfp);

    dsh = ds_open(dbe, bfp, DS_WRITE | DS_LOAD);
    if (dsh == NULL)
        /* print error, cleanup, and exit */
        ds_open_failure(bfp, dbe);

    memset(buf, '\0', BUFSIZE);

    if (DST_OK != ds_txn_begin(dsh))
        exit(EX_ERROR);

    for (;;) {
        dsv_t data;
        word_t *token;

        /* NOTE(review): a line longer than BUFSIZE-1 bytes is returned by
         * fgets() in pieces, each of which is then parsed as its own line */
        if (fgets((char *)buf, BUFSIZE, fpin) == NULL) {
            if (ferror(fpin)) {
                perror(progname);
                rv = 2;
            }
            break;
        }

        line++;

        len = strlen((char *)buf);

        /* too short. */
        if (len < 4)
            continue;

        /* terminate the token text; p now addresses the spam count */
        p = spanword(buf);
        len = strlen((const char *)buf);

        if (max_token_len != 0 &&
            len > max_token_len)
            continue;           /* too long - discard */

        spamcount = (uint) atoi((const char *)p);
        if ((int) spamcount < 0)
            spamcount = 0;
        p = spanword(p);

        goodcount = (uint) atoi((const char *)p);
        if ((int) goodcount < 0)
            goodcount = 0;
        p = spanword(p);

        date = (uint) atoi((const char *)p);
        p = spanword(p);

        /* anything left after the (optional) date field is an error */
        if (*p != '\0') {
            fprintf(stderr,
                    "%s: Unexpected input [%s] on line %lu. "
                    "Expecting whitespace before count.\n",
                    progname, buf, line);
            rv = 1;
            break;
        }

        if (date == 0)          /* date as YYYYMMDD */
            date = today_save;

        if (replace_nonascii_characters)
            do_replace_nonascii_characters(buf, len);

        token = word_new(buf, len);
        data.goodcount = goodcount;
        data.spamcount = spamcount;
        data.date = date;

        if (is_count((const char *)buf)
            && !(maintain && discard_token(token, &data))) {
            load_count += 1;
            /* Slower, but allows multiple lists to be concatenated */
            set_date(date);
            switch (ds_read(dsh, token, &data)) {
            case 0:
            case 1:
                break;
            default:
                rv = 1;
            }
            data.spamcount += spamcount;
            data.goodcount += goodcount;
            if (ds_write(dsh, token, &data))
                rv = 1;
        }
        word_free(token);
    }

    if (rv) {
        fprintf(stderr, "read or write error, aborting.\n");
        ds_txn_abort(dsh);
    } else {
        switch (ds_txn_commit(dsh)) {
        case DST_FAILURE:
        case DST_TEMPFAIL:
            fprintf(stderr, "commit failed\n");
            exit(EX_ERROR);
        case DST_OK:
            break;
        }
    }

    ds_close(dsh);

    ds_cleanup(dbe);

    if (verbose)
        fprintf(dbgout, "%d tokens loaded\n", load_count);

    return rv;
}

/** Reads one line from \a fp into \a buff and strips its trailing
 * newline.
 * \return 0 on success, 1 on end of input or when the line does not end
 * with a newline (including an empty read), 2 on a stream error */
static int get_token(buff_t *buff, FILE *fp)
{
    int rv = 0;

    if (fgets((char *)buff->t.text, buff->size, fp) == NULL) {
        if (ferror(fp)) {
            perror(progname);
            rv = 2;
        } else {
            rv = 1;
        }
    } else {
        buff->t.leng = (uint) strlen((const char *)buff->t.text);
        /* fgets() can yield an empty string (e.g. a line that starts with
         * a NUL byte); guard so text[leng - 1] never indexes out of bounds */
        if (buff->t.leng > 0 && buff->t.text[buff->t.leng - 1] == '\n') {
            buff->t.leng -= 1;
            buff->t.text[buff->t.leng] = (byte) '\0';
        }
        else
        {
            fprintf(stderr,
                    "%s: Unexpected input [%s]. Does not end with newline "
                    "or line too long.\n",
                    progname, buff->t.text);
            rv = 1;
        }
    }
    return rv;
}

/** Looks up words in the wordlist at \a bfp and prints their spam and
 * good counts on fpo, plus the value returned by calc_prob() when
 * \a show_probability is set.  Words are taken from \a argv when
 * \a argc is positive, otherwise one per line from stdin.  Words that
 * ds_read() reports with status 1 are skipped without output.
 * \return 0 on success, EX_ERROR on failure */
static ex_t display_words(bfpath *bfp, int argc, char **argv, bool show_probability)
{
    byte buf[BUFSIZE];
    buff_t *buff = buff_new(buf, 0, BUFSIZE);
    const byte *word = buf;

    const char *path = bfp->filepath;

    const char *head_format = !show_probability ? "%-30s %6s %6s\n" : "%-30s %6s %6s %6s\n";
    const char *data_format = !show_probability ? "%-30s %6lu %6lu\n" : "%-30s %6lu %6lu %f\n";

    void *dsh = NULL; /* initialize to silence bogus gcc warning */
    void *dbe;

    int rv = 0;

    dsv_t msgcnts;

    /* protect against broken stat(2) that succeeds for empty names */
    if (path == NULL || *path == '\0') {
        fprintf(stderr, "Expecting non-empty directory or file name.\n");
        buff_free(buff);        /* do not leak the wrapper on early exit */
        return EX_ERROR;
    }

    dbe = ds_init(bfp);
    dsh = ds_open(dbe, bfp, DS_READ);
    if (dsh == NULL)
        /* print error, cleanup, and exit */
        ds_open_failure(bfp, dbe);

    if (DST_OK != ds_txn_begin(dsh)) {
        ds_close(dsh);
        ds_cleanup(dbe);
        buff_free(buff);        /* do not leak the wrapper on early exit */
        fprintf(stderr, "Cannot begin transaction.\n");
        return EX_ERROR;
    }

    if (show_probability)
    {
        ds_get_msgcounts(dsh, &msgcnts);
        robs = ROBS;
        robx = ROBX;
    }

    /* the surplus argument is ignored by the three-column header format */
    fprintf(fpo, head_format, "", "spam", "good", " Fisher");

    /* argc > 0: words left in argv; argc == 0: read words from stdin;
     * argc < 0: argv has been exhausted, stop */
    while (argc >= 0)
    {
        dsv_t val;
        word_t *token;
        int rc;

        unsigned long spam_count;
        unsigned long good_count;
        double rob_prob = 0.0;

        if (argc == 0)
        {
            if (get_token(buff, stdin) != 0)
                break;
            token = &buff->t;
        } else {
            word = (const byte *) *argv++;
            if (--argc == 0)
                argc = -1;
            token = word_news((const char *)word);
        }

        rc = ds_read(dsh, token, &val);
        switch (rc) {
        case 0:
            spam_count = val.spamcount;
            good_count = val.goodcount;

            if (!show_probability)
                fprintf(fpo, data_format, token->text, spam_count, good_count);
            else
            {
                rob_prob = calc_prob(good_count, spam_count, msgcnts.goodcount, msgcnts.spamcount);
                fprintf(fpo, data_format, token->text, spam_count, good_count, rob_prob);
            }
            break;
        case 1:
            break;
        default:
            fprintf(stderr, "Cannot read from database.\n");
            rv = EX_ERROR;
            /* a token built by word_news() must be released before
             * leaving the loop; the stdin token lives inside buff */
            if (token != &buff->t)
                word_free(token);
            goto finish;
        }

        if (token != &buff->t)
            word_free(token);
    }

finish:
    /* abort after a failure, commit otherwise; either call reports its
     * own failure through a non-zero result */
    if ((DST_OK != rv) ? ds_txn_abort(dsh) : ds_txn_commit(dsh)) {
        fprintf(stderr, "Cannot %s transaction.\n", rv ? "abort" : "commit");
        rv = EX_ERROR;
    }
    ds_close(dsh);
    ds_cleanup(dbe);

    buff_free(buff);

    return rv;
}

/** Computes the Robinson x value for the wordlist at \a bfp with
 * compute_robinson_x().  With \a onlyprint set the value is printed on
 * stdout; otherwise it is stored, scaled by 1000000, as the spam count
 * of the ROBX_W token, retrying while ds_write() returns DS_ABORT_RETRY.
 * \return EX_OK on success, EX_ERROR when the computed value is negative
 * or the write fails */
static ex_t get_robx(bfpath *bfp)
{
    double rx;
    int ret = 0;

    init_wordlist("word", bfp->filepath, 0, WL_REGULAR);
    rx = compute_robinson_x();
    if (rx < 0)
        return EX_ERROR;

    if (onlyprint)
        printf("%f\n", rx);
    else {
        dsv_t val;
        word_t *word_robx = word_news(ROBX_W);

        /* since compute_robinson_x() closes the wordlists,
           init_wordlist() must be called again */
        init_wordlist("word", bfp->filepath, 0, WL_REGULAR);

        open_wordlists(DS_WRITE);

        val.goodcount = 0;
        val.spamcount = (uint32_t) (rx * 1000000);
        /* hand ds_write() a fully initialized value; zero is the "no
         * date" value used by the dump and load code in this file */
        val.date = 0;
        do {
            ret = ds_write(word_lists->dsh, word_robx, &val);
            if (ret == DS_ABORT_RETRY) {
                rand_sleep(1000, 1000000);
                begin_wordlist(word_lists);
            }
        } while (ret == DS_ABORT_RETRY);
        close_wordlists(true);
        free_wordlists();

        word_free(word_robx);
    }

    return ret ? EX_ERROR : EX_OK;
}

/** Prints the program name, version, data store version string and the
 * copyright/warranty notice on stdout. */
static void print_version(void)
{
    (void)fprintf(stdout,
                  "%s version %s\n"
                  " Database: %s\n"
                  "Copyright (C) 2002-2007 David Relson, Matthias Andree\n"
                  "Copyright (C) 2002-2003 Gyepi Sam.\n\n"
                  "%s comes with ABSOLUTELY NO WARRANTY. "
                  "This is free software, and\nyou are welcome to "
                  "redistribute it under the General Public License. "
                  "See\nthe COPYING file with the source distribution for "
                  "details.\n"
                  "\n",
                  progname, version, ds_version_str(), PACKAGE);
}

/** Prints the command-line synopsis on \a fp.  Which data store specific
 * lines appear depends on the compile-time configuration. */
static void usage(FILE *fp)
{
    fprintf(fp, "Usage: %s {-h|-V}\n", progname);
    fprintf(fp, " or: %s [OPTIONS] {-d|-l|-u|-m|-w|-p|--db-verify} file%s\n",
            progname, DB_EXT);
    fprintf(fp, " or: %s [OPTIONS] {-H|-r|-R} file\n",
            progname);
#if defined (ENABLE_DB_DATASTORE) || defined (ENABLE_SQLITE_DATASTORE)
    fprintf(fp, " or: %s [OPTIONS] {--db-print-leafpage-count} file%s\n",
            progname, DB_EXT);
    fprintf(fp, " or: %s [OPTIONS] {--db-print-pagesize} file%s\n",
            progname, DB_EXT);
#endif
#if defined(ENABLE_DB_DATASTORE) && !defined(DISABLE_TRANSACTIONS)
    fprintf(fp, " or: %s [OPTIONS] {--db-checkpoint} directory\n",
            progname);
    fprintf(fp, " or: %s [OPTIONS] {--db-list-logfiles} directory [list options]\n",
            progname);
    fprintf(fp, " or: %s [OPTIONS] {--db-prune|--db-remove-environment} directory\n",
            progname);
    fprintf(fp, " or: %s [OPTIONS] {--db-recover|--db-recover-harder} directory\n",
            progname);
#endif
}

/* Option and mode descriptions, one output line per array entry. */
static const char *help_text[] = {
    "\n",
    "OPTIONS are:\n",
    " -C, --no-config-file - don't read standard config files.\n",
    " -D, --debug-to-stdout - direct debug output to stdout.\n",
#ifdef ENABLE_DB_DATASTORE
    " -k, --db-cachesize=size - set Berkeley DB cache size (MB).\n",
#endif
    " -v, --verbosity - set debug verbosity level.\n",
    " -x, --debug-flags=list - set flags to display debug information.\n",
    " -y, --timestamp-date=date - set default date (format YYYYMMDD).\n",
    "\n",
    "Modes of operation are:\n",
    " -h, --help - print this help message and exit.\n",
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -