splitn.py

来自「用python实现的邮件过滤器」· Python 代码 · 共 114 行

PY
114
字号
#! /usr/bin/env python"""Split an mbox into N random mboxes.Usage: %(program)s [-h] [-s seed] [-v] -n N sourcembox outfilebaseOptions:    -h / --help        Print this help message and exit    -s seed        Seed the random number generator with seed (an integer).        By default, use system time at startup to seed.    -v        Verbose.  Displays a period for each 100 messages parsed.        May display other stuff.    -n N        The number of output mboxes desired.  This is required.Arguments:    sourcembox        The mbox to split.    outfilebase        The base path + name prefix for each of the N output files.        Output mboxes have names of the form            outfilebase + ("%%d.mbox" %% i)Example:    %(program)s -s 123 -n5 spam.mbox rspamproduces 5 mboxes, named rspam1.mbox through rspam5.mbox.  Each containsa random selection of the messages in spam.mbox, and together they containevery message in spam.mbox exactly once.  Each has approximately the samenumber of messages.  spam.mbox is not altered.  In addition, the seed forthe random number generator is forced to 123, so that while the split israndom, it's reproducible."""import sysimport randomimport mailboximport emailimport getoptfrom spambayes import mboxutilstry:    True, Falseexcept NameError:    # Maintain compatibility with Python 2.2    True, False = 1, 0program = sys.argv[0]def usage(code, msg=''):    print >> sys.stderr, __doc__ % globals()    if msg:        print >> sys.stderr, msg    sys.exit(code)def main():    try:        opts, args = getopt.getopt(sys.argv[1:], 'hn:s:v', ['help'])    except getopt.error, msg:        usage(1, msg)    n = None    verbose = False    for opt, arg in opts:        if opt in ('-h', '--help'):            usage(0)        elif opt == '-s':            random.seed(int(arg))        elif opt == '-n':            n = int(arg)        elif opt == '-v':            verbose = True    if n is None or n <= 1:        usage(1, "an -n value > 1 is required")    if len(args) != 2:        usage(1, "input mbox name and output base path are required")    inputpath, outputbasepath = args    infile = file(inputpath, 'rb')    outfiles = [file(outputbasepath + ("%d.mbox" % i), 'wb')                for i in range(1, n+1)]    mbox = mailbox.PortableUnixMailbox(infile, mboxutils.get_message)    counter = 0    for msg in mbox:        i = random.randrange(n)        astext = str(msg)        outfiles[i].write(astext)        counter += 1        if verbose:            if counter % 100 == 0:                print '.',    if verbose:        print        print counter, "messages split into", n, "files"    infile.close()    for f in outfiles:        f.close()if __name__ == '__main__':    main()

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?