📄 tester.py
字号:
# Unit tester for the Outlook addin.
#
# Note we are only attempting to test Outlook specific
# functionality, such as filters, etc.
#
# General process is to create test messages known to contain ham/spam
# keywords, and track their progress through the filters.  We also
# move this test message back around, and watch the incremental retrain
# in action.  Also checks that the message correctly remains classified
# after a message move.
from __future__ import generators

from win32com.client import constants
import sys
from time import sleep
import copy
import rfc822
import cStringIO
import threading

from spambayes.storage import STATE_KEY

import msgstore

from win32com.mapi import mapi, mapiutil
import pythoncom

# Classification names used to select which kind of test message to build.
HAM = "ham"
SPAM = "spam"
UNSURE = "unsure"

# Subject line stamped on every generated message; the tests locate their
# message again by searching folders for this exact subject.
TEST_SUBJECT = "SpamBayes addin auto-generated test message"


class TestFailure(Exception):
    """Raised (via TestFailed) when a test expectation is not met."""
    pass


def TestFailed(msg):
    """Abort the current test by raising TestFailure(msg).  Never returns."""
    raise TestFailure(msg)


def AssertRaises(exception, func, *args):
    """Check that calling func(*args) raises `exception` (or a subclass).

    Returns None when the expected exception is raised.  Any other
    exception raised by `func` propagates unchanged.  If `func` returns
    normally, TestFailure is raised.
    """
    try:
        func(*args)
    except exception:
        return
    # The failure is reported *outside* the try block on purpose: when it
    # was raised inside, a catch-all handler also caught the TestFailure
    # itself, so AssertRaises(Exception, f) silently passed even though f
    # raised nothing.
    TestFailed("Function '%s' should have raised '%r', but it worked!" %
               (func, exception))


# Polled by WaitForFilters().  Nothing in this part of the file sets it;
# presumably the addin's filter code signals it when filtering completes.
filter_event = threading.Event()


def WaitForFilters():
    """Pump waiting COM messages until `filter_event` is set.

    Gives up quietly after 500 polls of 10ms each (roughly 5 seconds,
    plus whatever time the message pumping itself takes).
    """
    # Must wait longer than normal, so when run with a timer we still work.
    filter_event.clear()
    for attempt in range(500):
        pythoncom.PumpWaitingMessages()
        if filter_event.isSet():
            break
        sleep(0.01)


def DictExtractor(bayes):
    """Yield (word, info) pairs from the in-memory `bayes.wordinfo` mapping."""
    for word, info in bayes.wordinfo.items():
        yield word, info


def DBExtractor(bayes):
    """Yield (key, info) pairs by walking the `bayes.dbm` database.

    The STATE_KEY record is skipped; `info` is whatever
    `bayes._wordinfoget(key)` returns for each remaining key.
    """
    # We use bsddb3 now if we can
    try:
        import bsddb3 as bsddb
        bsddb_error = bsddb.db.DBNotFoundError
    except ImportError:
        import bsddb
        bsddb_error = bsddb.error
    key = bayes.dbm.first()[0]
    if key != STATE_KEY:
        yield key, bayes._wordinfoget(key)
    while True:
        try:
            key = bayes.dbm.next()[0]
        except (bsddb.error, bsddb_error):
            # Either error is taken to mean we ran off the end of the db.
            break
        if key != STATE_KEY:
            yield key, bayes._wordinfoget(key)

# Find the top 'n' words in the Spam database that are clearly
# marked as either ham or spam.
# Simply enumerates the bayes word list looking for any word with zero
# count in the non-requested category.
_top_ham = None
_top_spam = None


def FindTopWords(bayes, num, get_spam):
    """Return a dict of the `num` strongest single-category words.

    bayes    -- classifier whose word list is enumerated.  If it has a
                `db` attribute the bsddb-style extractor is used, otherwise
                the `wordinfo` dictionary is walked.
    num      -- how many words are wanted.
    get_spam -- true to pick words never seen in ham (ranked by spam count),
                false to pick words never seen in spam (ranked by ham count).

    The result maps each word to a copy of its word-info record as it was
    when first computed.  Results are cached per category in module
    globals; note that the cache does not take `num` into account.
    Raises TestFailure if fewer than `num` suitable words exist.
    """
    global _top_spam, _top_ham
    if get_spam and _top_spam:
        return _top_spam
    if not get_spam and _top_ham:
        return _top_ham
    try:
        bayes.db  # bsddb style
        extractor = DBExtractor
    except AttributeError:
        extractor = DictExtractor

    items = []
    for word, info in extractor(bayes):
        if info is None:
            break
        # Tokens containing a colon are skipped.
        if ":" in word:
            continue
        if get_spam:
            if info.hamcount == 0:
                items.append((info.spamcount, word, info))
        else:
            if info.spamcount == 0:
                items.append((info.hamcount, word, info))
    # Highest counts first.
    items.sort()
    items.reverse()
    # Throw an error if we don't have enough tokens - otherwise
    # the test itself may fail, which will be more confusing than this.
    if len(items) < num:
        TestFailed("Error: could not find %d words with Spam=%s - only found %d" % (num, get_spam, len(items)))
    ret = {}
    for count, word, info in items[:num]:
        # Copy, so later training does not change our "before" snapshot.
        ret[word] = copy.copy(info)
    if get_spam:
        _top_spam = ret
    else:
        _top_ham = ret
    return ret


# A little driver/manager for our tests
class Driver:
    """Holds the addin manager and the Outlook folders the tests work with."""

    def __init__(self, mgr):
        """Remember `mgr` (or the global manager if None) and key folders."""
        if mgr is None:
            import manager
            mgr = manager.GetManager()
        self.manager = mgr
        # Remember the "spam" folder.
        folder = mgr.message_store.GetFolder(mgr.config.filter.spam_folder_id)
        self.folder_spam = folder.GetOutlookItem()
        # Remember the "unsure" folder.
        folder = mgr.message_store.GetFolder(mgr.config.filter.unsure_folder_id)
        self.folder_unsure = folder.GetOutlookItem()
        # And the drafts folder where new messages are created.
        self.folder_drafts = mgr.outlook.Session.GetDefaultFolder(constants.olFolderDrafts)

    def GetWatchFolderGenerator(self):
        """Yield (message-store folder, Outlook folder) for each watch folder."""
        mgr = self.manager
        gen = mgr.message_store.GetFolderGenerator(
            mgr.config.filter.watch_folder_ids,
            mgr.config.filter.watch_include_sub)
        for store_folder in gen:
            yield store_folder, store_folder.GetOutlookItem()

    def FindTestMessage(self, folder):
        """Return the result of searching `folder` for TEST_SUBJECT.

        Callers treat a None result as "no test message in this folder".
        """
        return folder.Items.Find("[Subject] = '%s'" % (TEST_SUBJECT,))

    def CheckMessageFilteredFrom(self, folder):
        """Fail the test unless the test message disappears from `folder`.

        Checks up to 5 times, pausing about half a second after each
        unsuccessful check.
        """
        # For hotmail accounts, the message may take a little time to actually
        # be removed from the original folder (ie, it appears in the "dest"
        # folder before it vanishes from the source).
        for attempt in range(5):
            if self.FindTestMessage(folder) is None:
                break
            for pause in range(10):
                sleep(.05)
        else:
            # Never saw the folder without the message.
            ms_folder = self.manager.message_store.GetFolder(folder)
            TestFailed("The test message remained in folder '%s'" % ms_folder.GetFQName())

    def _CleanTestMessageFromFolder(self, folder):
        """Delete every test message found in `folder` (at most 50 tries).

        Prints a summary when anything was deleted.  Raises TestFailure if
        a test message can still be found after 50 deletions.
        """
        num = 0
        # imap/hotmail etc only soft delete, and I see no way to differentiate
        # force the user to purge them manually
        for attempt in range(50):
            msg = self.FindTestMessage(folder)
            if msg is None:
                break
            msg.Delete()
            # Count the deletion so the summary below is actually reported.
            num += 1
        else:
            TestFailed("Old test messages appear to still exist. These may "
                       "be 'soft-deleted' - you will need to purge them manually")
        if num:
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2, and also parses on Python 3.
            print("Cleaned %d test messages from folder '%s'" % (num, folder.Name))

    def CleanAllTestMessages(self):
        """Remove test messages from spam, unsure, drafts and watch folders."""
        self._CleanTestMessageFromFolder(self.folder_spam)
        self._CleanTestMessageFromFolder(self.folder_unsure)
        self._CleanTestMessageFromFolder(self.folder_drafts)
        for msf, of in self.GetWatchFolderGenerator():
            self._CleanTestMessageFromFolder(of)

    def CreateTestMessageInFolder(self, spam_status, folder):
        """Create a test message and move it into `folder`.

        Returns (message as found in `folder`, words used in its body).
        """
        msg, words = self.CreateTestMessage(spam_status)
        msg.Save()  # Put into "Drafts".
        assert self.FindTestMessage(self.folder_drafts) is not None
        # Move it to the specified folder
        msg.Move(folder)
        # And now find it in the specified folder
        return self.FindTestMessage(folder), words

    def CreateTestMessage(self, spam_status):
        """Build (but do not save) a message made of strongly-typed words.

        HAM uses the top 50 ham-only words, SPAM the top 50 spam-only
        words, and any other status (e.g. UNSURE) uses both sets.
        Returns (message, dict of word -> word info).
        """
        words = {}
        bayes = self.manager.classifier_data.bayes
        if spam_status != SPAM:
            words.update(FindTopWords(bayes, 50, False))
        if spam_status != HAM:
            words.update(FindTopWords(bayes, 50, True))
        # Create a new blank message with our words
        msg = self.manager.outlook.CreateItem(0)
        msg.Body = "\n".join(words.keys())
        msg.Subject = TEST_SUBJECT
        return msg, words


def check_words(words, bayes, spam_offset, ham_offset):
    """Verify each word's counts moved by exactly the given offsets.

    words       -- dict of word -> word info snapshot taken earlier.
    bayes       -- classifier queried (via _wordinfoget) for current counts.
    spam_offset -- expected change in each word's spam count.
    ham_offset  -- expected change in each word's ham count.

    Raises TestFailure on the first word that does not match, or that
    can no longer be found at all.
    """
    for word, existing_info in words.items():
        new_info = bayes._wordinfoget(word)
        if new_info is None:
            # Report a test failure rather than dying with AttributeError.
            TestFailed("Word check for '%s' failed. "
                       "The word is no longer in the database" % (word,))
        if existing_info.spamcount + spam_offset != new_info.spamcount or \
           existing_info.hamcount + ham_offset != new_info.hamcount:
            TestFailed("Word check for '%s' failed. "
                       "old spam/ham=%d/%d, new spam/ham=%d/%d, "
                       "spam_offset=%d, ham_offset=%d" %
                       (word,
                        existing_info.spamcount, existing_info.hamcount,
                        new_info.spamcount, new_info.hamcount,
                        spam_offset, ham_offset))


# The tests themselves.
# The "spam" test is huge - we do standard filter tests, but
# also do incremental retrain tests.
def TestSpamFilter(driver):
    """Run the spam filter / incremental retrain test on every watch folder.

    For each watch folder: drop a spam message in, check it is filtered to
    the spam folder without training, move it back (expect ham training),
    move it to spam again (expect spam training), move it to the unsure
    folder (training state must survive), then untrain and check that all
    counts are back where they started.  Raises TestFailure on any mismatch.
    """
    bayes = driver.manager.classifier_data.bayes
    nspam = bayes.nspam
    nham = bayes.nham
    # NOTE(review): copy.copy() is a shallow copy, so this snapshot
    # presumably shares 'wordinfo'/'probcache' with the live object, which
    # would make the final comparisons trivially true - confirm.
    original_bayes = copy.copy(driver.manager.classifier_data.bayes)
    # for each watch folder, create a spam message, and do the training thang
    for msf_watch, folder_watch in driver.GetWatchFolderGenerator():
        print("Performing Spam test on watch folder '%s'..." % msf_watch.GetFQName())
        # Create a spam message in the Inbox - it should get immediately filtered
        msg, words = driver.CreateTestMessageInFolder(SPAM, folder_watch)
        # sleep to ensure filtering.
        WaitForFilters()
        # It should no longer be in the Inbox.
        driver.CheckMessageFilteredFrom(folder_watch)
        # It should be in the "sure is spam" folder.
        spam_msg = driver.FindTestMessage(driver.folder_spam)
        if spam_msg is None:
            TestFailed("The test message vanished from the Inbox, but didn't appear in Spam")
        # Check that none of the above caused training.
        if nspam != bayes.nspam:
            TestFailed("Something caused a new spam message to appear")
        if nham != bayes.nham:
            TestFailed("Something caused a new ham message to appear")
        check_words(words, bayes, 0, 0)

        # Now move the message back to the inbox - it should get trained.
        store_msg = driver.manager.message_store.GetMessage(spam_msg)
        driver.manager.classifier_data.message_db.load_msg(store_msg)
        import train
        if train.been_trained_as_ham(store_msg):
            TestFailed("This new spam message should not have been trained as ham yet")
        if train.been_trained_as_spam(store_msg):
            TestFailed("This new spam message should not have been trained as spam yet")
        spam_msg.Move(folder_watch)
        WaitForFilters()
        spam_msg = driver.FindTestMessage(folder_watch)
        if spam_msg is None:
            TestFailed("The message appears to have been filtered out of the watch folder")
        store_msg = driver.manager.message_store.GetMessage(spam_msg)
        driver.manager.classifier_data.message_db.load_msg(store_msg)
        # From here on the message has been trained; make sure that we
        # untrain it again even if one of the checks below fails.
        need_untrain = True
        try:
            if nspam != bayes.nspam:
                TestFailed("There were not the same number of spam messages after a re-train")
            if nham+1 != bayes.nham:
                TestFailed("There was not one more ham messages after a re-train")
            if train.been_trained_as_spam(store_msg):
                TestFailed("This new spam message should not have been trained as spam yet")
            if not train.been_trained_as_ham(store_msg):
                TestFailed("This new spam message should have been trained as ham now")
            # word infos should have one extra ham
            check_words(words, bayes, 0, 1)
            # Now move it back to the Spam folder.
            # This should see the message un-trained as ham, and re-trained as Spam
            spam_msg.Move(driver.folder_spam)
            WaitForFilters()
            spam_msg = driver.FindTestMessage(driver.folder_spam)
            if spam_msg is None:
                TestFailed("Could not find the message in the Spam folder")
            store_msg = driver.manager.message_store.GetMessage(spam_msg)
            driver.manager.classifier_data.message_db.load_msg(store_msg)
            if nspam +1 != bayes.nspam:
                TestFailed("There should be one more spam now")
            if nham != bayes.nham:
                TestFailed("There should be the same number of hams again")
            if not train.been_trained_as_spam(store_msg):
                TestFailed("This new spam message should have been trained as spam by now")
            if train.been_trained_as_ham(store_msg):
                TestFailed("This new spam message should have been un-trained as ham")
            # word infos should have one extra spam, no extra ham
            check_words(words, bayes, 1, 0)
            # Move the message to another folder, and make sure we still
            # identify it correctly as having been trained.
            # Move to the "unsure" folder, just cos we know about it, and
            # we know that no special watching of this folder exists.
            spam_msg.Move(driver.folder_unsure)
            spam_msg = driver.FindTestMessage(driver.folder_unsure)
            if spam_msg is None:
                TestFailed("Could not find the message in the Unsure folder")
            store_msg = driver.manager.message_store.GetMessage(spam_msg)
            driver.manager.classifier_data.message_db.load_msg(store_msg)
            if not train.been_trained_as_spam(store_msg):
                TestFailed("Message was not identified as Spam after moving")

            # word infos still be 'spam'
            check_words(words, bayes, 1, 0)

            # Now undo the damage we did.
            was_spam = train.untrain_message(store_msg, driver.manager.classifier_data)
            driver.manager.classifier_data.message_db.load_msg(store_msg)
            if not was_spam:
                TestFailed("Untraining this message did not indicate it was spam")
            if train.been_trained_as_spam(store_msg) or \
               train.been_trained_as_ham(store_msg):
                TestFailed("Untraining this message kept it has ham/spam")
            need_untrain = False
        finally:
            if need_untrain:
                train.untrain_message(store_msg, driver.manager.classifier_data)

        # Check all the counts are back where we started.
        if nspam != bayes.nspam:
            TestFailed("Spam count didn't get back to the same")
        if nham != bayes.nham:
            TestFailed("Ham count didn't get back to the same")
        check_words(words, bayes, 0, 0)

    if bayes.wordinfo != original_bayes.wordinfo:
        TestFailed("The bayes object's 'wordinfo' did not compare the same at the end of all this!")
    if bayes.probcache != original_bayes.probcache:
        TestFailed("The bayes object's 'probcache' did not compare the same at the end of all this!")
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -