⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 export.py

📁 用python实现的邮件过滤器
💻 PY
字号:
# Exports your ham and spam folders to a standard SpamBayes test directory.
#
# The syntax below is deliberately restricted to forms that are valid on
# both Python 2 and Python 3 (single-argument parenthesised print,
# sys.exc_info() instead of "except X, e").

import sys, os, shutil
import re

from manager import GetManager

# Default number of "SetN" bucket subdirectories created under Ham and Spam.
NUM_BUCKETS = 10
# Default export root, taken relative to the directory holding this script.
DEFAULT_DIRECTORY = "..\\testtools\\Data"

# Matches a Content-Type or Content-Transfer-Encoding header line together
# with any continuation lines that follow it.
mime_header_re = re.compile(r"""
    ^ content- (type | transfer-encoding) : [^\n]* \n
    ([ \t] [^\n]* \n)*  # suck up adjacent continuation lines
""", re.VERBOSE | re.MULTILINE | re.IGNORECASE)


# Return # of msgs in folder (a MAPIMsgStoreFolder).
# The generator is consumed item by item; nothing else is done with the
# messages themselves.
def count_messages(folder):
    result = 0
    for msg in folder.GetMessageGenerator():
        result += 1
    return result


# Return triple (num_spam_messages,
#                num_ham_messages,
#                ["Set1", "Set2", ...])
# where the list contains one entry for each bucket.
# 'manager' supplies the message store and the training config that names
# the spam/ham folders; 'num_buckets' is the number of "SetN" names to build.
def BuildBuckets(manager, num_buckets):
    store = manager.message_store
    config = manager.config
    num_ham = num_spam = 0
    for folder in store.GetFolderGenerator(config.training.spam_folder_ids,
                                           config.training.spam_include_sub):
        num_spam += count_messages(folder)
    for folder in store.GetFolderGenerator(config.training.ham_folder_ids,
                                           config.training.ham_include_sub):
        num_ham += count_messages(folder)

    dirs = ["Set%d" % i for i in range(1, num_buckets + 1)]
    return num_spam, num_ham, dirs


# Return the text of msg (a MAPIMsgStoreMsg object) as a string.
# There are subtleties, alas.
# Any exception raised by the fallback path (including the final sanity
# parse) propagates to the caller.
def get_text(msg):
    email_object = msg.GetEmailPackageObject()
    try:
        # Don't use str(msg) instead -- that inserts an information-
        # free "Unix From" line at the top of each msg.
        return email_object.as_string()
    except KeyboardInterrupt:
        # Never swallow a user abort; _export_folders() relies on seeing it.
        raise
    except:
        # Fudge.  GetEmailPackageObject() strips MIME headers by default.
        # I'm not exactly sure why, but I have some spam with what looks to
        # be ill-formed MIME, such that the email pkg's .as_string() (or
        # str() -- same thing, really) gets fatally confused when the MIME
        # headers are stripped, dying with an internal
        #
        #    string payload expected: <type 'list'>
        #
        # TypeError.  Ignore the exception and try again.
        pass

    # This is what our ShowClues() does, and that's never had a problem
    # getting a string from these problem messages.
    email_object = msg.GetEmailPackageObject(strip_mime_headers=False)
    text = email_object.as_string()

    # If we leave the Content-Type and Content-Transfer-Encoding headers in
    # now, the email package can get confused when it tries to parse this
    # string.  So, alas, strip 'em by hand.
    i = text.find('\n\n')  # boundary between headers and body
    if i < 0:
        # no body
        i = len(text) - 2
    headers, body = text[:i+2], text[i+2:]
    ##print 'before:\n', text
    headers = mime_header_re.sub('', headers) # remove troublesome headers
    text = headers + body
    ##print 'after:\n', text

    # A sanity check, to make sure the email pkg can still parse this mess.
    # If it can't, it will raise some exception.  I haven't seen this
    # happen yet.  Getting into this section is rare (less than 1% of my spam
    # so far), so the expense doesn't bother me.
    import email
    email.message_from_string(text)

    return text


# Export the messages from the folders in folder_ids, as text files, into
# the subdirectories whose names are given in buckets, under the directory
# 'root' (which is .../Ham or .../Spam).  Each message is placed in a
# bucket subdirectory chosen at random (among all bucket subdirectories).
# Returns the total number of .txt files created (== the number of msgs
# successfully exported).
# A message whose text cannot be obtained is reported and skipped;
# KeyboardInterrupt is always allowed to propagate.
def _export_folders(manager, root, buckets, folder_ids, include_sub):
    from random import choice

    num = 0
    store = manager.message_store
    for folder in store.GetFolderGenerator(folder_ids, include_sub):
        print(" %s" % (folder.name,))
        for message in folder.GetMessageGenerator():
            this_dir = os.path.join(root, choice(buckets))
            # filename is the EID.txt
            try:
                msg_text = get_text(message)
            except KeyboardInterrupt:
                raise
            except:
                print("Failed to get message text for '%s': %s"
                      % (message.GetSubject(), sys.exc_info()[1]))
                continue

            fname = os.path.join(this_dir, message.GetID()[1]) + ".txt"
            f = open(fname, "w")
            try:
                f.write(msg_text)
            finally:
                # Close the handle even if the write fails part-way.
                f.close()
            num += 1
    return num


# This does all the work.  'directory' is the parent directory for the
# generated Ham and Spam sub-folders.  Existing Ham and Spam trees under
# 'directory' are removed first, then recreated with one subdirectory per
# bucket plus a "reservoir" subdirectory.
def export(directory, num_buckets):
    print("Loading bayes manager...")
    manager = GetManager()
    config = manager.config

    num_spam, num_ham, buckets = BuildBuckets(manager, num_buckets)
    print("Have %s spam and %s ham to export, spread over %s directories."
          % (num_spam, num_ham, len(buckets)))

    for sub in "Spam", "Ham":
        if os.path.exists(os.path.join(directory, sub)):
            shutil.rmtree(os.path.join(directory, sub))
        for b in buckets + ["reservoir"]:
            d = os.path.join(directory, sub, b)
            os.makedirs(d)

    print("Exporting spam...")
    num = _export_folders(manager,
                          os.path.join(directory, "Spam"),
                          buckets,
                          config.training.spam_folder_ids,
                          config.training.spam_include_sub)
    print("Exported %s spam messages." % (num,))

    print("Exporting ham...")
    num = _export_folders(manager,
                          os.path.join(directory, "Ham"),
                          buckets,
                          config.training.ham_folder_ids,
                          config.training.ham_include_sub)
    print("Exported %s ham messages." % (num,))


# Command-line entry point: parse options, confirm with the user (unless -q
# was given), then run export().  Bad options or arguments go through
# usage(), which exits.
def main():
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hqn:")
    except getopt.error:
        # usage() exits, so opts/args are never used unbound.
        usage(sys.exc_info()[1])

    quiet = 0
    num_buckets = NUM_BUCKETS
    for opt, val in opts:
        if opt == '-h':
            usage()
        elif opt == '-q':
            quiet = 1
        elif opt == '-n':
            try:
                num_buckets = int(val)
            except ValueError:
                # Report a bad count like any other option error instead of
                # dying with a traceback.
                usage("-n requires an integer, not '%s'." % (val,))
        else:
            assert 0, "internal error on option '%s'" % opt

    if len(args) > 1:
        usage("Only one directory name can be specified.")
    elif args:
        directory = args[0]
    else:
        directory = os.path.join(os.path.dirname(sys.argv[0]),
                                 DEFAULT_DIRECTORY)

    if num_buckets < 1:
        usage("-n must be at least 1.")

    directory = os.path.abspath(directory)
    print("This program will export your Outlook Ham and Spam folders")
    print("to the directory '%s'" % directory)
    if os.path.exists(directory):
        print("*******")
        print("WARNING: all existing files in '%s' will be deleted" % directory)
        print("*******")
    if not quiet:
        # raw_input is the Python 2 name; fall back to input where it is gone.
        try:
            prompt = raw_input
        except NameError:
            prompt = input
        prompt("Press enter to continue, or Ctrl+C to abort.")
    export(directory, num_buckets)


# Display errormsg (if specified), a blank line, and usage information; then
# exit with status 1 (usage doesn't return).
def usage(errormsg=None):
    if errormsg:
        print(str(errormsg))
        print("")

    print(""" \
Usage: %s [-h] [-q] [-n nsets] [directory]

-h : help - display this msg and stop
-q : quiet - don't prompt for confirmation.
-n : number of Set subdirectories in the Ham and Spam dirs, default=%d

Export Spam and Ham training folders defined in the Outlook Plugin to a test
directory.  The directory structure is as defined in the parent
README-DEVEL.txt file, in the "Standard Test Data Setup" section.  Files are
distributed randomly among the Set subdirectories.  You should probably use
rebal.py afterwards to even them out.

If 'directory' is not specified, '%s' is assumed.

If 'directory' exists, it will be recursively deleted before
the export (but you will be asked to confirm unless -q is given)."""
          % (os.path.basename(sys.argv[0]),
             NUM_BUCKETS,
             DEFAULT_DIRECTORY))
    sys.exit(1)


if __name__=='__main__':
    main()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -