stats.py
来自「用python实现的邮件过滤器」· Python 代码 · 共 391 行 · 第 1/2 页
PY
391 行
data["num_spam_correct"] data["num_incorrect"] = data["num_trained_spam_fn"] + \ data["num_trained_ham_fp"] data["perc_correct"] = 100.0 * data["num_correct"] / \ data["num_seen"] data["perc_incorrect"] = 100.0 * data["num_incorrect"] / \ data["num_seen"] data["perc_fp"] = 100.0 * data["num_trained_ham_fp"] / \ data["num_seen"] data["perc_fn"] = 100.0 * data["num_trained_spam_fn"] / \ data["num_seen"] data["num_unsure_trained_ham"] = data["num_trained_ham"] - \ data["num_trained_ham_fp"] data["num_unsure_trained_spam"] = data["num_trained_spam"] - \ data["num_trained_spam_fn"] data["num_unsure_not_trained"] = data["num_unsure"] - \ data["num_unsure_trained_ham"] - \ data["num_unsure_trained_spam"] if data["num_unsure"]: data["perc_unsure_trained_ham"] = 100.0 * \ data["num_unsure_trained_ham"] / \ data["num_unsure"] data["perc_unsure_trained_spam"] = 100.0 * \ data["num_unsure_trained_spam"] / \ data["num_unsure"] data["perc_unsure_not_trained"] = 100.0 * \ data["num_unsure_not_trained"] / \ data["num_unsure"] data["total_ham"] = data["num_ham_correct"] + \ data["num_trained_ham"] data["total_spam"] = data["num_spam_correct"] + \ data["num_trained_spam"] if data["total_ham"]: data["perc_ham_incorrect"] = 100.0 * \ data["num_trained_ham_fp"] / \ data["total_ham"] data["perc_ham_unsure"] = 100.0 * \ data["num_unsure_trained_ham"] / \ data["total_ham"] data["perc_ham_incorrect_or_unsure"] = \ 100.0 * (data["num_trained_ham_fp"] + data["num_unsure_trained_ham"]) / \ data["total_ham"] if data["total_spam"]: data["perc_spam_correct"] = 100.0 * data["num_spam_correct"] / \ data["total_spam"] data["perc_spam_unsure"] = 100.0 * \ data["num_unsure_trained_spam"] / \ data["total_spam"] data["perc_spam_correct_or_unsure"] = \ 100.0 * (data["num_spam_correct"] + \ data["num_unsure_trained_spam"]) / \ data["total_spam"] fp_cost = self.options["TestDriver", "best_cutoff_fp_weight"] fn_cost = self.options["TestDriver", "best_cutoff_fn_weight"] unsure_cost = self.options["TestDriver", "best_cutoff_unsure_weight"] data["total_cost"] = data["num_trained_ham_fp"] * fp_cost + \ data["num_trained_spam_fn"] * fn_cost + \ data["num_unsure"] * unsure_cost # If there was no filtering done, what would the cost have been? # (Assuming that any spam in the inbox earns the cost of a fn) no_filter_cost = data["num_spam"] * fn_cost data["cost_savings"] = no_filter_cost - data["total_cost"] return data def _AddPercentStrings(self, data, dp): data["perc_ham_s"] = "%%(perc_ham).%df%%(perc)s" % (dp,) data["perc_spam_s"] = "%%(perc_spam).%df%%(perc)s" % (dp,) data["perc_unsure_s"] = "%%(perc_unsure).%df%%(perc)s" % (dp,) data["perc_correct_s"] = "%%(perc_correct).%df%%(perc)s" % (dp,) data["perc_incorrect_s"] = "%%(perc_incorrect).%df%%(perc)s" % (dp,) data["perc_fp_s"] = "%%(perc_fp).%df%%(perc)s" % (dp,) data["perc_fn_s"] = "%%(perc_fn).%df%%(perc)s" % (dp,) data["perc_spam_correct_s"] = "%%(perc_spam_correct).%df%%(perc)s" \ % (dp,) data["perc_spam_unsure_s"] = "%%(perc_spam_unsure).%df%%(perc)s" \ % (dp,) data["perc_spam_correct_or_unsure_s"] = \ "%%(perc_spam_correct_or_unsure).%df%%(perc)s" % (dp,) data["perc_ham_incorrect_s"] = "%%(perc_ham_incorrect).%df%%(perc)s" \ % (dp,) data["perc_ham_unsure_s"] = "%%(perc_ham_unsure).%df%%(perc)s" \ % (dp,) data["perc_ham_incorrect_or_unsure_s"] = \ "%%(perc_ham_incorrect_or_unsure).%df%%(perc)s" % (dp,) data["perc_unsure_trained_ham_s"] = \ "%%(perc_unsure_trained_ham).%df%%(perc)s" % (dp,) data["perc_unsure_trained_spam_s"] = "%%(perc_unsure_trained_spam).%df%%(perc)s" \ % (dp,) data["perc_unsure_not_trained_s"] = "%%(perc_unsure_not_trained).%df%%(perc)s" \ % (dp,) data["perc"] = "%" return data def GetStats(self, use_html=False, session_only=False, decimal_points=1): """Return a description of the statistics. If session_only is True, then only a description of the statistics since we were last reset. Otherwise, lifetime statistics (i.e. those including the ones loaded). Users probably care most about persistent statistics, so present those by default. If session-only stats are desired, then a special call to here can be made. The percentages will be accurate to the given number of decimal points. If use_html is True, then the returned data is marked up with appropriate HTML, otherwise it is plain text. """ chunks = [] push = chunks.append if session_only: data = {} data["num_seen"] = self.num_ham + self.num_spam + \ self.num_unsure data["num_ham"] = self.num_ham data["num_spam"] = self.num_spam data["num_unsure"] = self.num_unsure data["num_trained_ham"] = self.num_trained_ham data["num_trained_ham_fp"] = self.num_trained_ham_fp data["num_trained_spam"] = self.num_trained_spam data["num_trained_spam_fn"] = self.num_trained_spam_fn else: data = self._CombineSessionAndTotal() push(_("Messages classified: %d") % (data["num_seen"],)) if data["num_seen"] == 0: return chunks data = self._CalculateAdditional(data) format_dict = self._AddPercentStrings(data, decimal_points) # Possibly use HTML for tabs. if use_html: format_dict["tab"] = " " else: format_dict["tab"] = "\t" push((_("%(tab)sGood:%(tab)s%(num_ham)d (%(perc_ham_s)s)") \ % format_dict) % format_dict) push((_("%(tab)sSpam:%(tab)s%(num_spam)d (%(perc_spam_s)s)") \ % format_dict) % format_dict) push((_("%(tab)sUnsure:%(tab)s%(num_unsure)d (%(perc_unsure_s)s)") \ % format_dict) % format_dict) push("") push((_("Classified correctly:%(tab)s%(num_correct)d (%(perc_correct_s)s of total)") \ % format_dict) % format_dict) push((_("Classified incorrectly:%(tab)s%(num_incorrect)d (%(perc_incorrect_s)s of total)") \ % format_dict) % format_dict) if format_dict["num_incorrect"]: push((_("%(tab)sFalse positives:%(tab)s%(num_trained_ham_fp)d (%(perc_fp_s)s of total)") \ % format_dict) % format_dict) push((_("%(tab)sFalse negatives:%(tab)s%(num_trained_spam_fn)d (%(perc_fn_s)s of total)") \ % format_dict) % format_dict) push("") push(_("Manually classified as good:%(tab)s%(num_trained_ham)d") % format_dict) push(_("Manually classified as spam:%(tab)s%(num_trained_spam)d") % format_dict) push("") if format_dict["num_unsure"]: push((_("Unsures trained as good:%(tab)s%(num_unsure_trained_ham)d (%(perc_unsure_trained_ham_s)s of unsures)") \ % format_dict) % format_dict) push((_("Unsures trained as spam:%(tab)s%(num_unsure_trained_spam)d (%(perc_unsure_trained_spam_s)s of unsures)") \ % format_dict) % format_dict) push((_("Unsures not trained:%(tab)s%(tab)s%(num_unsure_not_trained)d (%(perc_unsure_not_trained_s)s of unsures)") \ % format_dict) % format_dict) push("") if format_dict["total_spam"]: push((_("Spam correctly identified:%(tab)s%(perc_spam_correct_s)s (+ %(perc_spam_unsure_s)s unsure)") \ % format_dict) % format_dict) if format_dict["total_ham"]: push((_("Good incorrectly identified:%(tab)s%(perc_ham_incorrect_s)s (+ %(perc_ham_unsure_s)s unsure)") \ % format_dict) % format_dict) if format_dict["total_spam"] or format_dict["total_ham"]: push("") push(_("Total cost of spam:%(tab)s$%(total_cost).2f") % format_dict) push(_("SpamBayes savings:%(tab)s$%(cost_savings).2f") % format_dict) return chunksif __name__=='__main__': s = Stats() print "\n".join(s.GetStats())
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?