⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 nway-train-fast.py

📁 用python实现的邮件过滤器
💻 PY
字号:
#!/usr/bin/env python
"""Train databases for the nway filter

usage %(prog)s [ -h ] [ -f ] [ -d dir ] [ -i float ] db1:mbox1 db2:mbox2 ...

    -h - print this documentation and exit.
    -f - force full retrain.
    -d dir - directory containing mbox files - write db files relative
             to dir/nway-db
    -i float - set maximum imbalance of ham/spam ratio to float (default:
               %(MAX_RATIO)s)
    db:mbox - use mbox as source of messages for db
"""

from __future__ import division
from __future__ import print_function

import getopt
import os
import shelve
import sys
from contextlib import closing

prog = sys.argv[0]

MAX_RATIO = 2.0
STATE_KEY = 'saved state'


def usage(msg=None):
    """Write *msg* (when given) and then the module usage text to stderr.

    The usage text is the module docstring with ``%(prog)s`` and
    ``%(MAX_RATIO)s`` filled in from the module globals.
    """
    if msg is not None:
        print(msg, file=sys.stderr)
    print(__doc__.strip() % globals(), file=sys.stderr)


def _open_ham(hbase):
    """Open ``<hbase>-ham.db`` read-only, closed when the ``with`` exits."""
    return closing(shelve.open("%s-ham.db" % hbase, 'r'))


def _ham_union(hbases):
    """Return a dict summing the ham counts of every ``<hbase>-ham.db``.

    Token values are ``(0, ham)`` pairs; the STATE_KEY entry is
    ``(5, 0, total_ham)``.
    """
    # The first state field is called `v` in the original and seeded with 5;
    # presumably a database format version -- confirm against the trainer.
    union = {STATE_KEY: (5, 0, 0)}
    print("building ham union:", end=" ", file=sys.stderr)
    for hbase in hbases:
        print(hbase, end=" ", file=sys.stderr)
        sys.stderr.flush()
        with _open_ham(hbase) as hdb:
            version, _, total = union[STATE_KEY]
            union[STATE_KEY] = (version, 0, total + hdb[STATE_KEY][2])
            for token in hdb:
                if token != STATE_KEY:
                    seen = union.get(token) or (0, 0)
                    union[token] = (0, seen[1] + hdb[token][1])
    print("", file=sys.stderr)
    return union


def _combine(hbase, union):
    """Return counts for *hbase*: its own ham as spam, all other ham as ham."""
    combined = {}
    with _open_ham(hbase) as own:
        own_total = own[STATE_KEY][2]
        for token in own:
            if token != STATE_KEY:
                combined[token] = (own[token][1], 0)

    print("merging hamcounts", file=sys.stderr)
    version, _, union_total = union[STATE_KEY]
    combined[STATE_KEY] = (version, own_total, union_total - own_total)
    for token in union:
        if token != STATE_KEY:
            # This database's own contribution is already in memory as the
            # spam count, so subtract it without going back to disk.
            own_count = (combined.get(token) or (0, 0))[0]
            combined[token] = (own_count, union[token][1] - own_count)
    return combined


def _rebalance(counts, max_ratio):
    """Scale *counts* in place so ham:spam stays within *max_ratio*.

    Tokens whose scaled counts both truncate to zero are dropped.
    """
    hadjust = sadjust = 1.0
    version, spamcount, hamcount = counts[STATE_KEY]
    # With no ham or no spam there is no ratio to correct (and the divisions
    # below would raise ZeroDivisionError), so leave the counts unscaled.
    if spamcount and hamcount:
        if hamcount / spamcount > max_ratio:
            hadjust = spamcount * max_ratio / hamcount
        elif spamcount / hamcount > max_ratio:
            sadjust = hamcount * max_ratio / spamcount
    spamcount *= sadjust
    hamcount *= hadjust
    print("hamcount:   ", hamcount, file=sys.stderr)
    print("spamcount:  ", spamcount, file=sys.stderr)
    print("ham adjust: ", hadjust, file=sys.stderr)
    print("spam adjust:", sadjust, file=sys.stderr)
    counts[STATE_KEY] = (version, int(spamcount), int(hamcount))

    # Iterate over a snapshot of the keys because entries are deleted.
    for token in list(counts):
        if token == STATE_KEY:
            continue
        spam, ham = counts[token]
        scaled = (int(spam * sadjust), int(ham * hadjust))
        if scaled == (0, 0):
            del counts[token]
        else:
            counts[token] = scaled


def merge(hbases, max_ratio):
    """Build one ``<hbase>.db`` per entry of *hbases* from the ham databases.

    For every name in *hbases* the file ``<hbase>-ham.db`` is read.  The
    messages of a database count as spam for that database and the messages
    of all the other databases count as ham.  Counts are then scaled so the
    ham:spam ratio lies between 1/max_ratio and max_ratio, and the result is
    written to ``<hbase>.db``.  Progress is reported on stderr.

    hbases    -- database base names (paths without the ``-ham.db`` suffix);
                 processed in case-insensitive order, the argument itself is
                 not modified
    max_ratio -- maximum allowed ham:spam (or spam:ham) ratio
    """
    hbases = sorted(hbases, key=lambda name: name.lower())
    union = _ham_union(hbases)

    for hbase in hbases:
        print("\nbuilding", hbase, "database", file=sys.stderr)
        combined = _combine(hbase, union)
        _rebalance(combined, max_ratio)
        print("final size:", len(combined) - 1, "tokens", file=sys.stderr)

        print("dumping", hbase, "database", file=sys.stderr)
        dbf = "%s.db" % hbase
        # NOTE(review): an existing output database is updated in place, so
        # tokens left over from an earlier run are not removed -- confirm
        # whether that is intended.
        with closing(shelve.open(dbf)) as out:
            out.update(combined)
        try:
            size = os.path.getsize(dbf)
        except OSError:
            # shelve backends may add a suffix to the file name, in which
            # case there is no file called exactly `dbf` to measure.
            pass
        else:
            print("final disk size:", size, "bytes", file=sys.stderr)


def mtime(f):
    """Return the modification time of *f*, or 0 if it cannot be read."""
    try:
        return os.path.getmtime(f)
    except OSError:
        return 0


def main(args):
    """Train each db:mbox pair as ham, then merge the resulting databases.

    args -- command line arguments without the program name.

    Returns 0 on success (or after -h) and 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hfd:i:",
                                   ["help", "force", "directory=",
                                    "imbalance="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    force_arg = ""
    directory = "."
    max_ratio = MAX_RATIO
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-f", "--force"):
            force_arg = "-f"
        elif opt in ("-d", "--directory"):
            directory = arg
        elif opt in ("-i", "--imbalance"):
            try:
                max_ratio = float(arg)
            except ValueError:
                usage("invalid imbalance ratio: %r" % (arg,))
                return 1
            if not max_ratio > 0:
                usage("imbalance ratio must be positive: %r" % (arg,))
                return 1

    mapping = {}
    for db_mbox in args:
        # Split on the first colon only so the mbox part may itself contain
        # colons (e.g. a drive letter).
        db, sep, mbox = db_mbox.partition(":")
        if not (sep and db and mbox):
            usage("expected db:mbox, got %r" % (db_mbox,))
            return 1
        mapping[db] = mbox

    # train on each mbox once as ham
    dbdir = os.path.join(directory, "nway-db")
    for db in sorted(mapping, key=lambda name: name.lower()):
        mboxf = os.path.join(directory, mapping[db])
        dbf = os.path.join(dbdir, "%s-ham.db" % db)
        if mtime(dbf) < mtime(mboxf):
            # force a full retrain if the db doesn't exist
            force = "-f" if mtime(dbf) == 0 else force_arg
            # NOTE(review): the paths are interpolated into a shell command
            # unquoted, so names containing spaces or shell metacharacters
            # will not survive.
            os.system("sb_mboxtrain.py %s -d %s -g %s" % (force, dbf, mboxf))
        else:
            print("skipping", dbf, "(up-to-date)", file=sys.stderr)

    # merge all the databases together in strange and marvelous ways;
    # hand merge() the same nway-db paths the training step wrote to
    merge([os.path.join(dbdir, db) for db in mapping], max_ratio)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -