⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tester.py

📁 用python实现的邮件过滤器
💻 PY
📖 第 1 页 / 共 2 页
字号:
# unit tester for the Outlook addin.## Note we are only attempting to test Outlook specific# functionality, such as filters, etc.## General process is to create test messages known to contain ham/spam# keywords, and tracking their progress through the filters.  We also# move this test message back around, and watch the incremental retrain# in action.  Also checks that the message correctly remains classified# after a message move.from __future__ import generatorsfrom win32com.client import constantsimport sysfrom time import sleepimport copyimport rfc822import cStringIOimport threadingfrom spambayes.storage import STATE_KEYimport msgstorefrom win32com.mapi import mapi, mapiutilimport pythoncomHAM="ham"SPAM="spam"UNSURE="unsure"TEST_SUBJECT = "SpamBayes addin auto-generated test message"class TestFailure(Exception):    passdef TestFailed(msg):    raise TestFailure(msg)def AssertRaises(exception, func, *args):    try:        func(*args)        raise TestFailed("Function '%s' should have raised '%r', but it worked!" % \                         (func, exception))    except:        exc_type = sys.exc_info()[0]        if exc_type == exception or issubclass(exc_type, exception):            return        raisefilter_event = threading.Event()def WaitForFilters():    # Must wait longer than normal, so when run with a timer we still work.    filter_event.clear()    for i in range(500):        pythoncom.PumpWaitingMessages()        if filter_event.isSet():            break        sleep(0.01)def DictExtractor(bayes):    for k, v in bayes.wordinfo.items():        yield k, vdef DBExtractor(bayes):    # We use bsddb3 now if we can    try:        import bsddb3 as bsddb        bsddb_error = bsddb.db.DBNotFoundError    except ImportError:        import bsddb        bsddb_error = bsddb.error    key = bayes.dbm.first()[0]    if key != STATE_KEY:        yield key, bayes._wordinfoget(key)    while True:        try:            key = bayes.dbm.next()[0]        except bsddb.error:            break        except bsddb_error:            break        if key != STATE_KEY:            yield key, bayes._wordinfoget(key)# Find the top 'n' words in the Spam database that are clearly# marked as either ham or spam.  Simply enumerates the# bayes word list looking for any word with zero count in the# non-requested category._top_ham = None_top_spam = Nonedef FindTopWords(bayes, num, get_spam):    global _top_spam, _top_ham    if get_spam and _top_spam: return _top_spam    if not get_spam and _top_ham: return _top_ham    items = []    try:        bayes.db # bsddb style        extractor = DBExtractor    except AttributeError:        extractor = DictExtractor    for word, info in extractor(bayes):        if info is None:            break        if ":" in word:            continue        if get_spam:            if info.hamcount==0:                items.append((info.spamcount, word, info))        else:            if info.spamcount==0:                items.append((info.hamcount, word, info))    items.sort()    items.reverse()    # Throw an error if we don't have enough tokens - otherwise    # the test itself may fail, which will be more confusing than this.    if len(items) < num:        TestFailed("Error: could not find %d words with Spam=%s - only found %d" % (num, get_spam, len(items)))    ret = {}    for n, word, info in items[:num]:        ret[word]=copy.copy(info)    if get_spam:        _top_spam = ret    else:        _top_ham = ret    return ret# A little driver/manager for our testsclass Driver:    def __init__(self, mgr):        if mgr is None:            import manager            mgr = manager.GetManager()        self.manager = mgr        # Remember the "spam" folder.        folder = mgr.message_store.GetFolder(mgr.config.filter.spam_folder_id)        self.folder_spam = folder.GetOutlookItem()        # Remember the "unsure" folder.        folder = mgr.message_store.GetFolder(mgr.config.filter.unsure_folder_id)        self.folder_unsure = folder.GetOutlookItem()        # And the drafts folder where new messages are created.        self.folder_drafts = mgr.outlook.Session.GetDefaultFolder(constants.olFolderDrafts)    def GetWatchFolderGenerator(self):        mgr = self.manager        gen = mgr.message_store.GetFolderGenerator(                                mgr.config.filter.watch_folder_ids,                                mgr.config.filter.watch_include_sub)        for f in gen:            yield f, f.GetOutlookItem()    def FindTestMessage(self, folder):        subject = TEST_SUBJECT        items = folder.Items        return items.Find("[Subject] = '%s'" % (subject,))    def CheckMessageFilteredFrom(self, folder):        # For hotmail accounts, the message may take a little time to actually        # be removed from the original folder (ie, it appears in the "dest"        # folder before it vanished.        for i in range(5):            if self.FindTestMessage(folder) is None:                break            for j in range(10):                sleep(.05)        else:            ms_folder = self.manager.message_store.GetFolder(folder)            TestFailed("The test message remained in folder '%s'" % ms_folder.GetFQName())    def _CleanTestMessageFromFolder(self, folder):        subject = TEST_SUBJECT        num = 0        # imap/hotmail etc only soft delete, and I see no way to differentiate        # force the user to purge them manually        for i in range(50):            msg = self.FindTestMessage(folder)            if msg is None:                break            msg.Delete()        else:            raise TestFailed("Old test messages appear to still exist.  These may" \                             "be 'soft-deleted' - you will need to purge them manually")        if num:            print "Cleaned %d test messages from folder '%s'" % (num, folder.Name)    def CleanAllTestMessages(self):        self._CleanTestMessageFromFolder(self.folder_spam)        self._CleanTestMessageFromFolder(self.folder_unsure)        self._CleanTestMessageFromFolder(self.folder_drafts)        for msf, of in self.GetWatchFolderGenerator():            self._CleanTestMessageFromFolder(of)    def CreateTestMessageInFolder(self, spam_status, folder):        msg, words = self.CreateTestMessage(spam_status)        msg.Save() # Put into "Drafts".        assert self.FindTestMessage(self.folder_drafts) is not None        # Move it to the specified folder        msg.Move(folder)        # And now find it in the specified folder        return self.FindTestMessage(folder), words    def CreateTestMessage(self, spam_status):        words = {}        bayes = self.manager.classifier_data.bayes        if spam_status != SPAM:            words.update(FindTopWords(bayes, 50, False))        if spam_status != HAM:            words.update(FindTopWords(bayes, 50, True))        # Create a new blank message with our words        msg = self.manager.outlook.CreateItem(0)        msg.Body = "\n".join(words.keys())        msg.Subject = TEST_SUBJECT        return msg, wordsdef check_words(words, bayes, spam_offset, ham_offset):    for word, existing_info in words.items():        new_info = bayes._wordinfoget(word)        if existing_info.spamcount+spam_offset != new_info.spamcount or \           existing_info.hamcount+ham_offset != new_info.hamcount:            TestFailed("Word check for '%s failed. "                       "old spam/ham=%d/%d, new spam/ham=%d/%d,"                       "spam_offset=%d, ham_offset=%d" % \                       (word,                        existing_info.spamcount, existing_info.hamcount,                        new_info.spamcount, new_info.hamcount,                        spam_offset, ham_offset))# The tests themselves.# The "spam" test is huge - we do standard filter tests, but# also do incremental retrain tests.def TestSpamFilter(driver):    bayes = driver.manager.classifier_data.bayes    nspam = bayes.nspam    nham = bayes.nham    original_bayes = copy.copy(driver.manager.classifier_data.bayes)    # for each watch folder, create a spam message, and do the training thang    for msf_watch, folder_watch in driver.GetWatchFolderGenerator():        print "Performing Spam test on watch folder '%s'..." % msf_watch.GetFQName()        # Create a spam message in the Inbox - it should get immediately filtered        msg, words = driver.CreateTestMessageInFolder(SPAM, folder_watch)        # sleep to ensure filtering.        WaitForFilters()        # It should no longer be in the Inbox.        driver.CheckMessageFilteredFrom(folder_watch)        # It should be in the "sure is spam" folder.        spam_msg = driver.FindTestMessage(driver.folder_spam)        if spam_msg is None:            TestFailed("The test message vanished from the Inbox, but didn't appear in Spam")        # Check that none of the above caused training.        if nspam != bayes.nspam:            TestFailed("Something caused a new spam message to appear")        if nham != bayes.nham:            TestFailed("Something caused a new ham message to appear")        check_words(words, bayes, 0, 0)        # Now move the message back to the inbox - it should get trained.        store_msg = driver.manager.message_store.GetMessage(spam_msg)        driver.manager.classifier_data.message_db.load_msg(store_msg)        import train        if train.been_trained_as_ham(store_msg):            TestFailed("This new spam message should not have been trained as ham yet")        if train.been_trained_as_spam(store_msg):            TestFailed("This new spam message should not have been trained as spam yet")        spam_msg.Move(folder_watch)        WaitForFilters()        spam_msg = driver.FindTestMessage(folder_watch)        if spam_msg is None:            TestFailed("The message appears to have been filtered out of the watch folder")        store_msg = driver.manager.message_store.GetMessage(spam_msg)        driver.manager.classifier_data.message_db.load_msg(store_msg)        need_untrain = True        try:            if nspam != bayes.nspam:                TestFailed("There were not the same number of spam messages after a re-train")            if nham+1 != bayes.nham:                TestFailed("There was not one more ham messages after a re-train")            if train.been_trained_as_spam(store_msg):                TestFailed("This new spam message should not have been trained as spam yet")            if not train.been_trained_as_ham(store_msg):                TestFailed("This new spam message should have been trained as ham now")            # word infos should have one extra ham            check_words(words, bayes, 0, 1)            # Now move it back to the Spam folder.            # This should see the message un-trained as ham, and re-trained as Spam            spam_msg.Move(driver.folder_spam)            WaitForFilters()            spam_msg = driver.FindTestMessage(driver.folder_spam)            if spam_msg is None:                TestFailed("Could not find the message in the Spam folder")            store_msg = driver.manager.message_store.GetMessage(spam_msg)            driver.manager.classifier_data.message_db.load_msg(store_msg)            if nspam +1 != bayes.nspam:                TestFailed("There should be one more spam now")            if nham != bayes.nham:                TestFailed("There should be the same number of hams again")            if not train.been_trained_as_spam(store_msg):                TestFailed("This new spam message should have been trained as spam by now")            if train.been_trained_as_ham(store_msg):                TestFailed("This new spam message should have been un-trained as ham")            # word infos should have one extra spam, no extra ham            check_words(words, bayes, 1, 0)            # Move the message to another folder, and make sure we still            # identify it correctly as having been trained.            # Move to the "unsure" folder, just cos we know about it, and            # we know that no special watching of this folder exists.            spam_msg.Move(driver.folder_unsure)            spam_msg = driver.FindTestMessage(driver.folder_unsure)            if spam_msg is None:                TestFailed("Could not find the message in the Unsure folder")            store_msg = driver.manager.message_store.GetMessage(spam_msg)            driver.manager.classifier_data.message_db.load_msg(store_msg)            if not train.been_trained_as_spam(store_msg):                TestFailed("Message was not identified as Spam after moving")            # word infos still be 'spam'            check_words(words, bayes, 1, 0)            # Now undo the damage we did.            was_spam = train.untrain_message(store_msg, driver.manager.classifier_data)            driver.manager.classifier_data.message_db.load_msg(store_msg)            if not was_spam:                TestFailed("Untraining this message did not indicate it was spam")            if train.been_trained_as_spam(store_msg) or \               train.been_trained_as_ham(store_msg):                TestFailed("Untraining this message kept it has ham/spam")            need_untrain = False        finally:            if need_untrain:                train.untrain_message(store_msg, driver.manager.classifier_data)        # Check all the counts are back where we started.        if nspam != bayes.nspam:            TestFailed("Spam count didn't get back to the same")        if nham != bayes.nham:            TestFailed("Ham count didn't get back to the same")        check_words(words, bayes, 0, 0)        if bayes.wordinfo != original_bayes.wordinfo:            TestFailed("The bayes object's 'wordinfo' did not compare the same at the end of all this!")        if bayes.probcache != original_bayes.probcache:            TestFailed("The bayes object's 'probcache' did not compare the same at the end of all this!")

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -