⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stats.py

📁 用python实现的邮件过滤器
💻 PY
📖 第 1 页 / 共 2 页
字号:
#! /usr/bin/env python"""Stats.py - SpamBayes statistics class.Classes:    Stats - provides statistical information about previous activity.Abstract:    Provide statistics on the activity that spambayes has done - for    example the number of messages classified as each type, and the    number of messages trained as each type.  This information is    retrieved from the messageinfo database, so is as reliable as that    is <wink>.    This class provides information for both the web interface, the    Outlook plug-in, and sb_pop3dnd.To Do:    o People would like pretty graphs, so maybe that could be done.    o People have requested time-based statistics - mail per hour,      spam per hour, and so on.      Discussion on spambayes-dev indicated that this would be a lot      of work for not much gain; however, since we now have some      time data stored, it wouldn't be too bad, so maybe it can go in.    o Suggestions?"""# This module is part of the spambayes project, which is Copyright 2002-5# The Python Software Foundation and is covered by the Python Software# Foundation license.__author__ = "Tony Meyer <ta-meyer@ihug.co.nz>"__credits__ = "Kenny Pitt, Mark Hammond, all the spambayes folk."try:    True, Falseexcept NameError:    # Maintain compatibility with Python 2.2    True, False = 1, 0import timeimport typesfrom spambayes.message import STATS_START_KEYfrom spambayes.message import database_type, open_storage, Messagetry:    _except NameError:    _ = lambda arg: argclass Stats(object):    def __init__(self, options, messageinfo_db):        self.messageinfo_db = messageinfo_db        self.options = options        # Reset session stats.        self.Reset()        # Load persistent stats.        self.from_date = self.messageinfo_db.get_statistics_start_date()        self.CalculatePersistentStats()    def Reset(self):        self.num_ham = self.num_spam = self.num_unsure = 0        self.num_trained_spam = self.num_trained_spam_fn  = 0        self.num_trained_ham = self.num_trained_ham_fp = 0    def ResetTotal(self, permanently=False):        self.totals = {}        for stat in ["num_ham", "num_spam", "num_unsure",                     "num_trained_spam", "num_trained_spam_fn",                     "num_trained_ham", "num_trained_ham_fp",]:            self.totals[stat] = 0        if permanently:            # Reset the date.            self.from_date = time.time()            self.messageinfo_db.set_statistics_start_date(self.from_date)    def RecordClassification(self, score):        """Record that a message has been classified this session."""        if score >= self.options["Categorization", "spam_cutoff"]:            self.num_spam += 1        elif score >= self.options["Categorization", "ham_cutoff"]:            self.num_unsure += 1        else:            self.num_ham += 1    def RecordTraining(self, as_ham, old_score=None, old_class=None):        """Record that a message has been trained this session.        If old_score and old_class are None, then the message had not        previously been trained (e.g. using the "Train" box on the web        interface), and so cannot be considered a fp or fn).        If both old_score and old_class are specified, old_score is used.        """        # XXX Why, oh why, does this function have as_ham, when every        # XXX other function has isSpam???        if as_ham:            self.num_trained_ham += 1            # If we are recovering an item that is in the "spam" threshold,            # then record it as a "false positive"            if old_score is not None and \               old_score > self.options["Categorization", "spam_cutoff"]:                self.num_trained_ham_fp += 1            elif old_class == self.options["Headers", "header_spam_string"]:                self.num_trained_ham_fp += 1        else:            self.num_trained_spam += 1            # If we are deleting as Spam an item that was in our "good"            # range, then record it as a false negative.            if old_score is not None and \               old_score < self.options["Categorization", "ham_cutoff"]:                self.num_trained_spam_fn += 1            elif old_class == self.options["Headers", "header_ham_string"]:                self.num_trained_spam_fn += 1    def CalculatePersistentStats(self):        """Calculate the statistics totals (i.e. not this session).        This is done by running through the messageinfo database and        adding up the various information.  This could get quite time        consuming if the messageinfo database gets very large, so        some consideration should perhaps be made about what to do        then.        """        self.ResetTotal()        totals = self.totals        for msg_id in self.messageinfo_db.keys():            # Skip the date key.            if msg_id == STATS_START_KEY:                continue            m = Message(msg_id)            self.messageinfo_db.load_msg(m)            # Skip all old messages that don't have a date.            if m.date_modified is None:                continue            # Skip ones that are too old.            if self.from_date and m.date_modified < self.from_date:                continue            classification = m.GetClassification()            trained = m.GetTrained()                        if classification == self.options["Headers",                                              "header_spam_string"]:                # Classified as spam.                totals["num_spam"] += 1                if trained == False:                    # False positive (classified as spam, trained as ham)                    totals["num_trained_ham_fp"] += 1            elif classification == self.options["Headers",                                                "header_ham_string"]:                # Classified as ham.                totals["num_ham"] += 1                if trained == True:                    # False negative (classified as ham, trained as spam)                    totals["num_trained_spam_fn"] += 1            elif classification == self.options["Headers",                                                "header_unsure_string"]:                # Classified as unsure.                totals["num_unsure"] += 1                if trained == False:                    totals["num_trained_ham"] += 1                elif trained == True:                    totals["num_trained_spam"] += 1    def _CombineSessionAndTotal(self):        totals = self.totals        data = {}        data["num_ham"] = self.num_ham + totals["num_ham"]        data["num_spam"] = self.num_spam + totals["num_spam"]        data["num_unsure"] = self.num_unsure + totals["num_unsure"]        data["num_seen"] = data["num_ham"] + data["num_spam"] + \                           data["num_unsure"]        data["num_trained_ham"] = self.num_trained_ham + \                                  totals["num_trained_ham"]        data["num_trained_ham_fp"] = self.num_trained_ham_fp + \                                     totals["num_trained_ham_fp"]        data["num_trained_spam"] = self.num_trained_spam + \                                   totals["num_trained_spam"]        data["num_trained_spam_fn"] = self.num_trained_spam_fn + \                                      totals["num_trained_spam_fn"]        return data    def _CalculateAdditional(self, data):        data["perc_ham"] = 100.0 * data["num_ham"] / data["num_seen"]        data["perc_spam"] = 100.0 * data["num_spam"] / data["num_seen"]        data["perc_unsure"] = 100.0 * data["num_unsure"] / data["num_seen"]        data["num_ham_correct"] = data["num_ham"] - \                                  data["num_trained_spam_fn"]        data["num_spam_correct"] = data["num_spam"] - \                                   data["num_trained_ham_fp"]        data["num_correct"] = data["num_ham_correct"] + \

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -