stats.py

来自「用python实现的邮件过滤器」· Python 代码 · 共 391 行 · 第 1/2 页
391 行
#! /usr/bin/env python"""Stats.py - SpamBayes statistics class.Classes:    Stats - provides statistical information about previous activity.Abstract:    Provide statistics on the activity that spambayes has done - for    example the number of messages classified as each type, and the    number of messages trained as each type.  This information is    retrieved from the messageinfo database, so is as reliable as that    is <wink>.    This class provides information for both the web interface, the    Outlook plug-in, and sb_pop3dnd.To Do:    o People would like pretty graphs, so maybe that could be done.    o People have requested time-based statistics - mail per hour,      spam per hour, and so on.      Discussion on spambayes-dev indicated that this would be a lot      of work for not much gain; however, since we now have some      time data stored, it wouldn't be too bad, so maybe it can go in.    o Suggestions?"""# This module is part of the spambayes project, which is Copyright 2002-5# The Python Software Foundation and is covered by the Python Software# Foundation license.__author__ = "Tony Meyer <ta-meyer@ihug.co.nz>"__credits__ = "Kenny Pitt, Mark Hammond, all the spambayes folk."try:    True, Falseexcept NameError:    # Maintain compatibility with Python 2.2    True, False = 1, 0import timeimport typesfrom spambayes.message import STATS_START_KEYfrom spambayes.message import database_type, open_storage, Messagetry:    _except NameError:    _ = lambda arg: argclass Stats(object):    def __init__(self, options, messageinfo_db):        self.messageinfo_db = messageinfo_db        self.options = options        # Reset session stats.        self.Reset()        # Load persistent stats.        
self.from_date = self.messageinfo_db.get_statistics_start_date()        self.CalculatePersistentStats()    def Reset(self):        self.num_ham = self.num_spam = self.num_unsure = 0        self.num_trained_spam = self.num_trained_spam_fn  = 0        self.num_trained_ham = self.num_trained_ham_fp = 0    def ResetTotal(self, permanently=False):        self.totals = {}        for stat in ["num_ham", "num_spam", "num_unsure",                     "num_trained_spam", "num_trained_spam_fn",                     "num_trained_ham", "num_trained_ham_fp",]:            self.totals[stat] = 0        if permanently:            # Reset the date.            self.from_date = time.time()            self.messageinfo_db.set_statistics_start_date(self.from_date)    def RecordClassification(self, score):        """Record that a message has been classified this session."""        if score >= self.options["Categorization", "spam_cutoff"]:            self.num_spam += 1        elif score >= self.options["Categorization", "ham_cutoff"]:            self.num_unsure += 1        else:            self.num_ham += 1    def RecordTraining(self, as_ham, old_score=None, old_class=None):        """Record that a message has been trained this session.        If old_score and old_class are None, then the message had not        previously been trained (e.g. using the "Train" box on the web        interface), and so cannot be considered a fp or fn).        If both old_score and old_class are specified, old_score is used.        """        # XXX Why, oh why, does this function have as_ham, when every        # XXX other function has isSpam???        
if as_ham:            self.num_trained_ham += 1            # If we are recovering an item that is in the "spam" threshold,            # then record it as a "false positive"            if old_score is not None and \               old_score > self.options["Categorization", "spam_cutoff"]:                self.num_trained_ham_fp += 1            elif old_class == self.options["Headers", "header_spam_string"]:                self.num_trained_ham_fp += 1        else:            self.num_trained_spam += 1            # If we are deleting as Spam an item that was in our "good"            # range, then record it as a false negative.            if old_score is not None and \               old_score < self.options["Categorization", "ham_cutoff"]:                self.num_trained_spam_fn += 1            elif old_class == self.options["Headers", "header_ham_string"]:                self.num_trained_spam_fn += 1    def CalculatePersistentStats(self):        """Calculate the statistics totals (i.e. not this session).        This is done by running through the messageinfo database and        adding up the various information.  This could get quite time        consuming if the messageinfo database gets very large, so        some consideration should perhaps be made about what to do        then.        """        self.ResetTotal()        totals = self.totals        for msg_id in self.messageinfo_db.keys():            # Skip the date key.            if msg_id == STATS_START_KEY:                continue            m = Message(msg_id)            self.messageinfo_db.load_msg(m)            # Skip all old messages that don't have a date.            if m.date_modified is None:                continue            # Skip ones that are too old.            
if self.from_date and m.date_modified < self.from_date:                continue            classification = m.GetClassification()            trained = m.GetTrained()                        if classification == self.options["Headers",                                              "header_spam_string"]:                # Classified as spam.                totals["num_spam"] += 1                if trained == False:                    # False positive (classified as spam, trained as ham)                    totals["num_trained_ham_fp"] += 1            elif classification == self.options["Headers",                                                "header_ham_string"]:                # Classified as ham.                totals["num_ham"] += 1                if trained == True:                    # False negative (classified as ham, trained as spam)                    totals["num_trained_spam_fn"] += 1            elif classification == self.options["Headers",                                                "header_unsure_string"]:                # Classified as unsure.                
totals["num_unsure"] += 1                if trained == False:                    totals["num_trained_ham"] += 1                elif trained == True:                    totals["num_trained_spam"] += 1    def _CombineSessionAndTotal(self):        totals = self.totals        data = {}        data["num_ham"] = self.num_ham + totals["num_ham"]        data["num_spam"] = self.num_spam + totals["num_spam"]        data["num_unsure"] = self.num_unsure + totals["num_unsure"]        data["num_seen"] = data["num_ham"] + data["num_spam"] + \                           data["num_unsure"]        data["num_trained_ham"] = self.num_trained_ham + \                                  totals["num_trained_ham"]        data["num_trained_ham_fp"] = self.num_trained_ham_fp + \                                     totals["num_trained_ham_fp"]        data["num_trained_spam"] = self.num_trained_spam + \                                   totals["num_trained_spam"]        data["num_trained_spam_fn"] = self.num_trained_spam_fn + \                                      totals["num_trained_spam_fn"]        return data    def _CalculateAdditional(self, data):        data["perc_ham"] = 100.0 * data["num_ham"] / data["num_seen"]        data["perc_spam"] = 100.0 * data["num_spam"] / data["num_seen"]        data["perc_unsure"] = 100.0 * data["num_unsure"] / data["num_seen"]        data["num_ham_correct"] = data["num_ham"] - \                                  data["num_trained_spam_fn"]        data["num_spam_correct"] = data["num_spam"] - \                                   data["num_trained_ham_fp"]        data["num_correct"] = data["num_ham_correct"] + \
stats.py - 源码说明

本页面展示了「用python实现的邮件过滤器」中的 stats.py 源码文件，采用 Python 编程语言编写，共 391 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与python相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?