nway-train.py

来自「用python实现的邮件过滤器」· Python 代码 · 共 181 行

181 行

#!/usr/bin/env python

"""Train databases for the nway filter

usage %(prog)s [ -h ] [ -f ] [ -d dir ] [ -i float ] db1:mbox1 db2:mbox2 ...

    -h - print this documentation and exit.
    -f - force full retrain.
    -d dir - directory containing mbox files - write db files relative
             to dir/nway-db
    -i float - set maximum imbalance of ham/spam ratio to float (default:
               %(MAX_RATIO)s)
    db:mbox - use mbox as source of messages for db
"""

from __future__ import division, print_function

import contextlib
import getopt
import os
import shelve
import sys

prog = sys.argv[0]

# Default upper bound on the apparent ham:spam (or spam:ham) message ratio.
MAX_RATIO = 2.0

# Shelf key under which the (version, spam count, ham count) tuple is stored.
STATE_KEY = 'saved state'


def usage(msg=None):
    """Print an optional message followed by the module usage text to stderr."""
    if msg is not None:
        print(msg, file=sys.stderr)
    print(__doc__.strip() % globals(), file=sys.stderr)


def _open_ham_db(base):
    """Open the read-only ``<base>-ham.db`` shelf, closed on ``with`` exit."""
    return contextlib.closing(shelve.open("%s-ham.db" % base, 'r'))


def _balance_factors(hamcount, spamcount, max_ratio):
    """Return ``(hadjust, sadjust)`` scale factors bounding the imbalance.

    At most one factor differs from 1.0.  When either count is zero no ratio
    can be formed, so both factors stay at 1.0 instead of dividing by zero.
    """
    hadjust = sadjust = 1.0
    if hamcount and spamcount:
        if hamcount / spamcount > max_ratio:
            hadjust = spamcount * max_ratio / hamcount
        elif spamcount / hamcount > max_ratio:
            sadjust = hamcount * max_ratio / spamcount
    return hadjust, sadjust


def merge(sbase, hbases, max_ratio):
    """Build ``<sbase>.db`` from one "spam" ham-database and several ham ones.

    sbase     -- path prefix of the database whose ``<sbase>-ham.db`` counts
                 are treated as spam; the result is saved to ``<sbase>.db``.
    hbases    -- path prefixes of the databases whose ``<hbase>-ham.db``
                 counts are merged in as ham.
    max_ratio -- maximum allowed ham:spam (or spam:ham) message ratio; the
                 larger side is scaled down to meet it.

    Progress is reported on stderr.  Returns None.
    """
    print("building %s database" % sbase, file=sys.stderr)

    # The individual databases are always trained as ham, so we need to flip
    # the counts when using them as if they were spam.  The ham count will
    # always be zero
    with _open_ham_db(sbase) as sdb:
        # remember, we want to reverse the usual meaning of the scores in
        # this database
        (ver, hamcount, spamcount) = sdb[STATE_KEY]
        for hbase in hbases:
            with _open_ham_db(hbase) as hdb:
                hamcount += hdb[STATE_KEY][2]

        # XXX SpamBayes works best when there are roughly the same number of
        # spam as ham messages.  Many people with extreme ham:spam ratios
        # have reported problems.  The mechanism I'm using here - one spam
        # database and several ham database will stress the system in that
        # way.  One thing to try is to adjust the ham counts to minimize the
        # apparent ratios.  For example, if the spam database has a count of
        # 100 and the sum of the ham counts from the ham databases is 2000,
        # perhaps the ham counts can all be scaled by a constant factor (by
        # 20? by 5? by enough to bring the ham:spam ratio down to no more
        # than 5:1? 3:1?) to make the apparent ham and spam counts more equal
        hadjust, sadjust = _balance_factors(hamcount, spamcount, max_ratio)

        print("hamcount:   ", hamcount, file=sys.stderr)
        print("spamcount:  ", spamcount, file=sys.stderr)
        print("ham adjust: ", hadjust, file=sys.stderr)
        print("spam adjust:", sadjust, file=sys.stderr)

        d = {STATE_KEY: (ver, spamcount, hamcount * hadjust)}
        for k in sdb:
            if k != STATE_KEY:
                d[k] = (sadjust * sdb[k][1], 0)

    # Now update using the remaining ham databases.  Spam counts are always
    # zero, so all we adjust are the ham counts
    print("merging", end=" ", file=sys.stderr)
    for hbase in hbases:
        with _open_ham_db(hbase) as hdb:
            print("%s" % hbase, end=" ", file=sys.stderr)
            sys.stderr.flush()
            for key in hdb:
                if key != STATE_KEY:
                    val = hdb[key]
                    s, h = d.get(key) or (0, 0)
                    # Scale each incoming ham count exactly once so every
                    # database gets the same factor as the saved ham total.
                    d[key] = (s, h + hadjust * val[1])
    print("", file=sys.stderr)

    # we don't truncate to ints until the very end
    v, s, h = d[STATE_KEY]
    d[STATE_KEY] = (v, int(s), int(h))
    print("new saved state:", d[STATE_KEY], file=sys.stderr)

    # Iterate over a snapshot of the keys because entries are deleted below.
    for k in list(d):
        if k != STATE_KEY:
            s, h = int(d[k][0]), int(d[k][1])
            # remove terms which dwindled away to (nearly) nothing
            if (s, h) in ((1, 0), (0, 0), (0, 1), (1, 1)):
                del d[k]
            else:
                d[k] = (s, h)

    with contextlib.closing(shelve.open("%s.db" % sbase)) as db:
        print("saving to %s.db" % sbase, file=sys.stderr)
        db.clear()
        db.update(d)


def mtime(f):
    """Return the modification time of path *f*, or 0 if it cannot be read."""
    try:
        return os.path.getmtime(f)
    except OSError:
        return 0


def main(args):
    """Train each per-mailbox ham database, then merge them n ways.

    args -- command-line arguments without the program name.

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hfd:i:",
                                   ["help", "force", "directory=",
                                    "imbalance="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    force_arg = ""
    directory = "."
    max_ratio = MAX_RATIO
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-f", "--force"):
            force_arg = "-f"
        elif opt in ("-d", "--directory"):
            directory = arg
        elif opt in ("-i", "--imbalance"):
            try:
                max_ratio = float(arg)
            except ValueError:
                usage("invalid imbalance ratio: %r" % (arg,))
                return 1

    mapping = {}
    for db_mbox in args:
        # Split on the first colon only so the mbox part may contain colons.
        db, sep, mbox = db_mbox.partition(":")
        if not (db and sep and mbox):
            usage("expected db:mbox, got %r" % (db_mbox,))
            return 1
        mapping[db] = mbox

    db_dir = os.path.join(directory, "nway-db")

    # train on each mbox once as ham
    keys = sorted(mapping, key=lambda name: name.lower())
    for db in keys:
        mboxf = os.path.join(directory, mapping[db])
        dbf = os.path.join(db_dir, "%s-ham.db" % db)
        if mtime(dbf) < mtime(mboxf):
            # force a full retrain if the db doesn't exist
            force = "-f" if mtime(dbf) == 0 else force_arg
            # NOTE(review): the paths are interpolated into a shell command
            # unquoted, so names with spaces or shell metacharacters will
            # not work here.
            os.system("sb_mboxtrain.py %(force)s -d %(dbf)s -g %(mboxf)s" %
                      locals())
        else:
            print("skipping", dbf, "(up-to-date)", file=sys.stderr)

    # for each key, merge its ham database (reversed) with all the other ham
    # databases.  The bases point into db_dir, where training wrote the
    # per-mailbox databases.
    for db in keys:
        hdbs = [os.path.join(db_dir, other) for other in keys if other != db]
        merge(os.path.join(db_dir, db), hdbs, max_ratio)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

nway-train.py - 源码说明

本页面展示了「用python实现的邮件过滤器」中的 nway-train.py 源码文件，采用 Python 编程语言编写，共 181 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与python相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?