📄 findbest.py

📁 用python实现的邮件过滤器
💻 PY
字号:
#!/usr/bin/env python'''Find the next "best" unsure message to train on.    %(prog)s [ -h ] [ -s ] [ -b N ] ham spam unsureGiven a number of unsure messages and a desire to keep your trainingdatabase small, the question naturally arises, "Which message should I addto my database next?".  A common approach is to sort the unsures by theirSpamBayes scores and train on the one which scores lowest.  This is areasonable approach, but there is no guarantee the lowest scoring unsure isin any way related to the other unsure messages.This script offers a different approach.  Given an existing pile of ham andspam, it trains on them to establish a baseline, then for each message inthe unsure pile, it trains on that message, scores the entire unsure pileagainst the resulting training database, then untrains on that message.  Foreach such message the following output is generated:    * spamprob of the candidate message    * number of other unsure messages which would score as spam if it was      added to the training database    * overall mean of all scored messages after training    * standard deviation of all scored messages after training    * message-id of the candidate messageWith no options, all candidate unsure messages are trained and scoredagainst.  At the end of the run, a file, "best.pck" is written out which isa dictionary keyed by the overall mean rounded to three decimal places.  Thevalues are lists of message-ids which generate that mean.Three options affect the behavior of the program.  If the -h flag is given,this help message is displayed and the program exits.  If the -s flag isgiven, no messages which score as spam are tested as candidates.  If the -bN flag is given, only the messages which generated the N highest means inthe last run without the -b flag are tested as candidates.  Because theprogram runtime can be very slow (O(n^2) in the number of unsure messages),if you have a fairly large pile of unsure messages, these options can speedthings up dramatically.  If the -b flag is used, a new "best.pck" file isnot written.  Typically you would run once without the -b flag, then severaltimes with the -b flag, adding one message to the spam pile after each run.After adding several messages to your spam file, you might then redistributethe unsure pile to move spams and hams to their respective folders, thenstart again with a smaller unsure pile.The ham, spam and unsure command line arguments can be anything suitable forfeeding to spambayes.mboxutils.getmbox().  The "best.pck" file is searchedfor and written to these files in this order:    * best.pck in the current directory    * $HOME/tmp/best.pck    * $HOME/best.pck[To do?  Someone might consider the reverse operation.  Given a pile of hamand spam, which message can be removed with the least impact?  What pile ofmail should that removal be tested against?]'''import sysimport osimport cPickle as pickleimport getoptimport mathfrom spambayes.mboxutils import getmboxfrom spambayes.classifier import Classifierfrom spambayes.hammie import Hammiefrom spambayes.tokenizer import tokenizefrom spambayes.Options import optionscls = Classifier()h = Hammie(cls)def counter(tag, i):    if tag:        sys.stdout.write("\r%s: %4d" % (tag, i))    else:        sys.stdout.write("\r%4d" % i)    sys.stdout.flush()def learn(mbox, h, is_spam):    i = 0    tag = is_spam and "Spam" or "Ham"    for msg in getmbox(mbox):        counter(tag, i)        i += 1        h.train(msg, is_spam)    printdef score(unsure, h, cls, scores, msgids=None, skipspam=False):    """See what effect on others each msg in unsure has"""    ham_cutoff = options["Categorization", "ham_cutoff"]    spam_cutoff = options["Categorization", "spam_cutoff"]    # compute a base - number of messages in unsure already in the    # region of interest    n = 0    total = 0.0    okalready = set()    add = okalready.add    for msg in getmbox(unsure):        prob = cls.spamprob(tokenize(msg))        n += 1        if prob >= spam_cutoff:            add(msg['message-id'])        else:            total += prob    first_mean = total/n    print len(okalready), "out of", n, "messages already score as spam"    print "initial mean spam prob: %.3f" % first_mean    print "%5s %3s %5s %5s %s" % ("prob", "new", "mean", "sdev", "msgid")    # one by one, train on each message and see what effect it has on    # the other messages in the mailbox    for msg in getmbox(unsure):        msgid = msg['message-id']        if msgids is not None and msgid not in msgids:            continue        msgprob = cls.spamprob(tokenize(msg))        if skipspam and msgprob >= spam_cutoff:            continue        n = j = 0        h.train(msg, True)        # see how many other messages in unsure now score as spam        total = 0.0        probs = []        for trial in getmbox(unsure):            # don't score messages which previously scored as spam            if trial['message-id'] in okalready:                continue            n += 1            if n % 10 == 0:                counter("", n)            prob = cls.spamprob(tokenize(trial))            probs.append(prob)            total += prob            if prob >= spam_cutoff:                j += 1        counter("", n)        h.untrain(msg, True)        mean = total/n        meankey = round(mean, 3)        scores.setdefault(meankey, []).append(msgid)        sdev = math.sqrt(sum([(mean-prob)**2 for prob in probs])/n)        print "\r%.3f %3d %.3f %.3f %s" % (msgprob, j, mean, sdev, msgid)prog = os.path.basename(sys.argv[0])def usage(msg=None):    if msg is not None:        print >> sys.stderr, msg    print >> sys.stderr, __doc__.strip() % globals()def main(args):    try:        opts, args = getopt.getopt(args, "b:sh")    except getopt.error, msg:        usage(msg)        return 1    best = 0    skipspam = False    for opt, arg in opts:        if opt == "-h":            usage()            return 0        if opt == "-b":            best = int(arg)        elif opt == "-s":            skipspam = True    if len(args) != 3:        usage("require ham, spam and unsure message piles")        return 1    ham, spam, unsure = args    choices = ["best.pck"]    if "HOME" in os.environ:        home = os.environ["HOME"]        choices.append(os.path.join(home, "tmp", "best.pck"))        choices.append(os.path.join(home, "best.pck"))    choices.append(None)    for bestfile in choices:        if bestfile is None:            break        if os.path.exists(bestfile):            break        try:            file(bestfile, "w")        except IOError:            pass        else:            os.unlink(bestfile)    if bestfile is None:        usage("can't find a place to write best.pck file")        return 1    print "establish base training"    learn(ham, h, False)    learn(spam, h, True)    print "scoring"    if best:        last_scores = pickle.load(file(bestfile))        last_scores = last_scores.items()        last_scores.sort()        msgids = set()        for (k, v) in last_scores[-best:]:            msgids.update(set(v))    else:        msgids = None            scores = {}    try:        score(unsure, h, cls, scores, msgids, skipspam)    except KeyboardInterrupt:        # allow early termination without loss of computed scores        pass    if not best:        pickle.dump(scores, file(bestfile, 'w'))    return 0if __name__ == "__main__":    sys.exit(main(sys.argv[1:]))
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -