📄 export.py
字号:
# Exports your ham and spam folders to a standard SpamBayes test directory.import sys, os, shutilfrom manager import GetManagerNUM_BUCKETS = 10DEFAULT_DIRECTORY = "..\\testtools\\Data"import remime_header_re = re.compile(r""" ^ content- (type | transfer-encoding) : [^\n]* \n ([ \t] [^\n]* \n)* # suck up adjacent continuation lines""", re.VERBOSE | re.MULTILINE | re.IGNORECASE)# Return # of msgs in folder (a MAPIMsgStoreFolder).def count_messages(folder): result = 0 for msg in folder.GetMessageGenerator(): result += 1 return result# Return triple (num_spam_messages,# num_ham_messages,# ["Set1", "Set2", ...])# where the list contains one entry for each bucket.def BuildBuckets(manager, num_buckets): store = manager.message_store config = manager.config num_ham = num_spam = 0 for folder in store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub): num_spam += count_messages(folder) for folder in store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub): num_ham += count_messages(folder) dirs = ["Set%d" % i for i in range(1, num_buckets + 1)] return num_spam, num_ham, dirs# Return the text of msg (a MAPIMsgStoreMsg object) as a string.# There are subtleties, alas.def get_text(msg): email_object = msg.GetEmailPackageObject() try: # Don't use str(msg) instead -- that inserts an information- # free "Unix From" line at the top of each msg. return email_object.as_string() except: # Fudge. GetEmailPackageObject() strips MIME headers by default. # I'm not exactly sure why, but I have some spam with what looks to # be ill-formed MIME, such that the email pkg's .as_string() (or # str() -- same thing, really) gets fatally confused when the MIME # headers are stripped, dying with an internal # # string payload expected: <type 'list'> # # TypeError. Ignore the exception and try again. pass # This is what our ShowClues() does, and that's never had a problem # getting a string from these problem messages. email_object = msg.GetEmailPackageObject(strip_mime_headers=False) text = email_object.as_string() # If we leave the Content-Type and Content-Transfer-Encoding headers in # now, the email package can get confused when it tries to parse this # string. So, alas, strip 'em by hand. i = text.find('\n\n') # boundary between headers and body if i < 0: # no body i = len(text) - 2 headers, body = text[:i+2], text[i+2:] ##print 'before:\n', text headers = mime_header_re.sub('', headers) # remove troublesome headers text = headers + body ##print 'after:\n', text # A sanity check, to make sure the email pkg can still parse this mess. # If it can't, it will raise some exception. I haven't seen this # happen yet. Getting into this section is rare (less than 1% of my spam # so far), so the expense doesn't bother me. import email email.message_from_string(text) return text# Export the messages from the folders in folder_ids, as text files, into# the subdirectories whose names are given in buckets, under the directory# 'root' (which is .../Ham or .../Spam). Each message is placed in a# bucket subdirectory chosen at random (among all bucket subdirectories).# Returns the total number of .txt files created (== the number of msgs# successfully exported).def _export_folders(manager, root, buckets, folder_ids, include_sub): from random import choice num = 0 store = manager.message_store for folder in store.GetFolderGenerator(folder_ids, include_sub): print "", folder.name for message in folder.GetMessageGenerator(): this_dir = os.path.join(root, choice(buckets)) # filename is the EID.txt try: msg_text = get_text(message) except KeyboardInterrupt: raise except: print "Failed to get message text for '%s': %s" \ % (message.GetSubject(), sys.exc_info()[1]) continue fname = os.path.join(this_dir, message.GetID()[1]) + ".txt" f = open(fname, "w") f.write(msg_text) f.close() num += 1 return num# This does all the work. 'directory' is the parent directory for the# generated Ham and Spam sub-folders.def export(directory, num_buckets): print "Loading bayes manager..." manager = GetManager() config = manager.config num_spam, num_ham, buckets = BuildBuckets(manager, num_buckets) print "Have", num_spam, "spam and", num_ham, "ham to export,", print "spread over", len(buckets), "directories." for sub in "Spam", "Ham": if os.path.exists(os.path.join(directory, sub)): shutil.rmtree(os.path.join(directory, sub)) for b in buckets + ["reservoir"]: d = os.path.join(directory, sub, b) os.makedirs(d) print "Exporting spam..." num = _export_folders(manager, os.path.join(directory, "Spam"), buckets, config.training.spam_folder_ids, config.training.spam_include_sub) print "Exported", num, "spam messages." print "Exporting ham..." num = _export_folders(manager, os.path.join(directory, "Ham"), buckets, config.training.ham_folder_ids, config.training.ham_include_sub) print "Exported", num, "ham messages."def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], "hqn:") except getopt.error, d: usage(d) quiet = 0 num_buckets = NUM_BUCKETS for opt, val in opts: if opt == '-h': usage() elif opt == '-q': quiet = 1 elif opt == '-n': num_buckets = int(val) else: assert 0, "internal error on option '%s'" % opt if len(args) > 1: usage("Only one directory name can be specified.") elif args: directory = args[0] else: directory = os.path.join(os.path.dirname(sys.argv[0]), DEFAULT_DIRECTORY) if num_buckets < 1: usage("-n must be at least 1.") directory = os.path.abspath(directory) print "This program will export your Outlook Ham and Spam folders" print "to the directory '%s'" % directory if os.path.exists(directory): print "*******" print "WARNING: all existing files in '%s' will be deleted" % directory print "*******" if not quiet: raw_input("Press enter to continue, or Ctrl+C to abort.") export(directory, num_buckets)# Display errormsg (if specified), a blank line, and usage information; then# exit with status 1 (usage doesn't return).def usage(errormsg=None): if errormsg: print str(errormsg) print print """ \Usage: %s [-h] [-q] [-n nsets] [directory]-h : help - display this msg and stop-q : quiet - don't prompt for confirmation.-n : number of Set subdirectories in the Ham and Spam dirs, default=%dExport Spam and Ham training folders defined in the Outlook Plugin to a testdirectory. The directory structure is as defined in the parentREADME-DEVEL.txt file, in the "Standard Test Data Setup" section. Files aredistributed randomly among the Set subdirectories. You should probably userebal.py afterwards to even them out.If 'directory' is not specified, '%s' is assumed.If 'directory' exists, it will be recursively deleted beforethe export (but you will be asked to confirm unless -q is given).""" \ % (os.path.basename(sys.argv[0]), NUM_BUCKETS, DEFAULT_DIRECTORY) sys.exit(1)if __name__=='__main__': main()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -