📄 onepass.py

📁 用python实现的邮件过滤器
💻 PY
字号:
#!/usr/bin/env python

"""
Make one pass through a collection of ham and spam discarding anything that
scores as ham or spam given the currently scored messages.  (Sort of like
one pass of the train-to-exhaustion script.)

usage %(prog)s [ -h ] [ -R ] -g file -s file [ -d file | -p file ] \\
               [ -c ext ] [ -o sect:opt:val ] [ -v ]

-h      - Print this usage message and exit.

-g file - Take ham from file.

-s file - Take spam from file.

-d file - Use a database-based classifier named file.

-p file - Use a pickle-based classifier named file.

-c ext  - Cull all messages which aren't used as training input during any
          run and write to new ham and spam files with ext as an extra file
          extension.  All messages which are never considered (because one
          training set is longer than the other or the -m flag was used to
          reduce the amount of input) are retained.

-o sect:opt:val -
          Set [sect, opt] in the options database to val.

-v        Be very verbose, spewing all sorts of stuff out to stderr.

-R        Walk backwards through the mailbox.

Note: Adding messages which train correctly won't affect anything other than
adding more ham or spam to the respective training pile.  To force such
messages to have an effect you should set your ham_cutoff and spam_cutoff
values closer to 0.0 and 1.0 than your normal settings during scoring.  For
example, if your normal ham_cutoff and spam_cutoff values are 0.2 and 0.8,
you might run %(prog)s like

    %(prog)s -o Categorization:ham_cutoff:0.05 \\
        -o Categorization:spam_cutoff:0.95 \\
        [ other args ]
"""

from __future__ import division

import sys
import getopt
import os
import datetime

from spambayes import storage
from spambayes import Options
from spambayes import mboxutils
from spambayes.tokenizer import tokenize

# Script name, substituted into the usage text via %(prog)s.
prog = os.path.basename(sys.argv[0])


def usage(msg=None):
    """Write the usage text to stderr, preceded by msg when one is given."""
    if msg is not None:
        sys.stderr.write("%s\n" % (msg,))
    sys.stderr.write(__doc__.strip() % globals() + "\n")


try:
    reversed
except NameError:
    # Fallback for interpreters that do not provide the reversed() builtin.
    def reversed(seq):
        """Return an iterator over a reversed copy of the sequence seq."""
        seq = seq[:]
        seq.reverse()
        return iter(seq)


def _next_message(messages):
    """Return the next item of the iterator messages, or None when exhausted."""
    # A for loop is used instead of .next()/next() so the same code runs on
    # every Python version.
    for msg in messages:
        return msg
    return None


def _train_on_miss(store, msg, is_spam, cutoff, tdict, verbose, outfile):
    """Score msg and train the classifier on it if it was misclassified.

    A ham message is a miss when it scores above cutoff, a spam message when
    it scores below cutoff.  Messages with neither a Message-ID nor a Subject
    header are always treated as hits and are never trained on.

    On a miss the message's Message-ID is recorded in tdict, the message is
    learned by store and, when outfile is not None, written to outfile.

    Returns a true value for a miss and a false value for a hit.
    """
    score = store.spamprob(tokenize(msg))
    selector = msg["message-id"] or msg["subject"]
    if is_spam:
        label = "spam"
        missed = score < cutoff
    else:
        label = "ham"
        missed = score > cutoff
    missed = missed and selector is not None

    if verbose:
        outcome = "hit"
        if missed:
            outcome = "miss"
        sys.stderr.write("%s %s: %.6f %s\n" % (outcome, label, score, selector))

    if missed:
        # NOTE(review): the key is None when the message has no Message-ID
        # (selector fell back to Subject); kept as in the original behaviour.
        tdict[msg["message-id"]] = True
        store.learn(tokenize(msg), is_spam)
        if outfile is not None:
            outfile.write(str(msg))
    return missed


def train(store, hambox, spambox, maxmsgs, tdict, reverse, verbose, cullext):
    """Make a single training pass over the ham and spam mailboxes.

    Messages are taken alternately, one ham and one spam per round, until
    both mailboxes are exhausted.  Each message is scored against store and
    only misclassified messages are learned.  A progress line is rewritten
    on stdout every round.

    store    -- classifier providing spamprob() and learn()
    hambox   -- name of the ham mailbox, handed to mboxutils.getmbox()
    spambox  -- name of the spam mailbox, handed to mboxutils.getmbox()
    maxmsgs  -- accepted for interface compatibility; not used by this pass
    tdict    -- dict updated in place; maps the Message-ID of every message
                trained on to True
    reverse  -- when true, walk both mailboxes from the last message
    verbose  -- when true, report every hit and miss on stderr
    cullext  -- when not None, extension appended to hambox/spambox to name
                the files that receive the messages used for training
    """
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    hambone = mboxutils.getmbox(hambox)
    spamcan = mboxutils.getmbox(spambox)

    # Without -c nothing is written at all, so no throwaway file (formerly
    # a hard-coded "/dev/null") has to be opened.
    newham = newspam = None
    try:
        if cullext is not None:
            newham = open(hambox + cullext, "w")
            newspam = open(spambox + cullext, "w")

        if reverse:
            hambone = reversed(list(hambone))
            spamcan = reversed(list(spamcan))
        else:
            # Make sure we hold real iterators even if getmbox() returned
            # a plain sequence.
            hambone = iter(hambone)
            spamcan = iter(spamcan)

        ham_misses = spam_misses = ham_hits = spam_hits = nmsgs = 0

        while True:
            ham = _next_message(hambone)
            if ham is not None:
                nmsgs += 1
            spam = _next_message(spamcan)
            if spam is not None:
                nmsgs += 1

            if ham is None and spam is None:
                break

            sys.stdout.write("\r%5d, ham: %5d/%5d, spam: %5d/%5d" %
                             (nmsgs, ham_misses, ham_hits+ham_misses,
                              spam_misses, spam_hits+spam_misses))
            sys.stdout.flush()

            if ham is not None:
                if _train_on_miss(store, ham, False, ham_cutoff,
                                  tdict, verbose, newham):
                    ham_misses += 1
                else:
                    ham_hits += 1

            if spam is not None:
                if _train_on_miss(store, spam, True, spam_cutoff,
                                  tdict, verbose, newspam):
                    spam_misses += 1
                else:
                    spam_hits += 1

        sys.stdout.write("\n")
    finally:
        # Close the cull files even when scoring or training raises.
        for outfile in (newham, newspam):
            if outfile is not None:
                outfile.close()


def main(args):
    """Parse the command line, build a fresh classifier and run one pass.

    args is the argument list without the program name.  Returns the process
    exit status: 0 on success (or after -h), 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:c:vR",
                                   ["help", "good=", "spam=",
                                    "database=", "pickle=", "verbose",
                                    "option=", "max=", "maxrounds=",
                                    "cullext=", "reverse", "ratio="])
    except getopt.GetoptError:
        # sys.exc_info() instead of "except X, e" / "except X as e" keeps
        # this valid on every Python version.
        usage(sys.exc_info()[1])
        return 1

    ham = spam = cullext = None
    maxmsgs = 0
    verbose = False
    reverse = False
    # Parsed for command-line compatibility only; nothing below reads it.
    sh_ratio = (1, 1)

    # -r/--maxrounds is accepted by getopt above but has no effect here.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-g", "--good"):
            ham = arg
        elif opt in ("-s", "--spam"):
            spam = arg
        elif opt in ("-d", "--database"):
            Options.options["Storage", "persistent_storage_file"] = arg
            Options.options["Storage", "persistent_use_database"] = "dbm"
        elif opt in ("-p", "--pickle"):
            Options.options["Storage", "persistent_storage_file"] = arg
            Options.options["Storage", "persistent_use_database"] = "pickle"
        elif opt in ("-c", "--cullext"):
            cullext = arg
        elif opt in ("-m", "--max"):
            maxmsgs = int(arg)
        elif opt in ("-R", "--reverse"):
            reverse = True
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '--ratio':
            arg = arg.split(":")
            sh_ratio = (int(arg[0]), int(arg[1]))

    if ham is None or spam is None:
        usage("require both ham and spam piles")
        return 1

    dbname, usedb = storage.database_type(opts)
    sys.stdout.write("creating %s (%s)\n" % (dbname, usedb))
    # Start from an empty classifier; a missing file is not an error.
    try:
        os.unlink(dbname)
    except OSError:
        pass

    store = storage.open_storage(dbname, usedb)

    tdict = {}
    train(store, ham, spam, maxmsgs, tdict, reverse, verbose, cullext)

    store.store()

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
💿 文件大小 1791 K
👤 上传用户 guigong
📂 所属分类 数学计算
🏷️ 相关标签

#python #邮件过滤
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -