storage.py

来自「用python实现的邮件过滤器」· Python 代码 · 共 1,060 行 · 第 1/3 页
1,060 行
#! /usr/bin/env python'''storage.py - Spambayes database management framework.Classes:    PickledClassifier - Classifier that uses a pickle db    DBDictClassifier - Classifier that uses a shelve db    PGClassifier - Classifier that uses postgres    mySQLClassifier - Classifier that uses mySQL    CBDClassifier - Classifier that uses CDB    ZODBClassifier - Classifier that uses ZODB    ZEOClassifier - Classifier that uses ZEO    Trainer - Classifier training observer    SpamTrainer - Trainer for spam    HamTrainer - Trainer for hamAbstract:    *Classifier are subclasses of Classifier (classifier.Classifier)    that add automatic state store/restore function to the Classifier class.    All SQL based classifiers are subclasses of SQLClassifier, which is a    subclass of Classifier.    PickledClassifier is a Classifier class that uses a cPickle    datastore.  This database is relatively small, but slower than other    databases.    DBDictClassifier is a Classifier class that uses a database    store.    Trainer is concrete class that observes a Corpus and trains a    Classifier object based upon movement of messages between corpora  When    an add message notification is received, the trainer trains the    database with the message, as spam or ham as appropriate given the    type of trainer (spam or ham).  When a remove message notification    is received, the trainer untrains the database as appropriate.    SpamTrainer and HamTrainer are convenience subclasses of Trainer, that    initialize as the appropriate type of TrainerTo Do:    o Suggestions?    '''# This module is part of the spambayes project, which is Copyright 2002-5# The Python Software Foundation and is covered by the Python Software# Foundation license.### Note to authors - please direct all prints to sys.stderr.  
In some### situations prints to sys.stdout will garble the message (e.g., in### hammiefilter).__author__ = "Neale Pickett <neale@woozle.org>, \Tim Stone <tim@fourstonesExpressions.com>"__credits__ = "All the spambayes contributors."try:    True, Falseexcept NameError:    # Maintain compatibility with Python 2.2    True, False = 1, 0    def bool(val):        return not not valimport osimport sysimport timeimport typesfrom spambayes import classifierfrom spambayes.Options import options, get_pathname_optionimport cPickle as pickleimport errnoimport shelvefrom spambayes import cdbfrom spambayes import dbmstorage# Make shelve use binary pickles by default.oldShelvePickler = shelve.Picklerdef binaryDefaultPickler(f, binary=1):    return oldShelvePickler(f, binary)shelve.Pickler = binaryDefaultPicklerPICKLE_TYPE = 1NO_UPDATEPROBS = False   # Probabilities will not be autoupdated with trainingUPDATEPROBS = True       # Probabilities will be autoupdated with trainingclass PickledClassifier(classifier.Classifier):    '''Classifier object persisted in a pickle'''    def __init__(self, db_name):        classifier.Classifier.__init__(self)        self.db_name = db_name        self.load()    def load(self):        '''Load this instance from the pickle.'''        # This is a bit strange, because the loading process        # creates a temporary instance of PickledClassifier, from which        # this object's state is copied.  This is a nuance of the way        # that pickle does its job.        # Tim sez:  that's because this is an unusual way to use pickle.        # Note that nothing non-trivial is actually copied, though:        # assignment merely copies a pointer.  The actual wordinfo etc        # objects are shared between tempbayes and self, and the tiny        # tempbayes object is reclaimed when load() returns.        
if options["globals", "verbose"]:            print >> sys.stderr, 'Loading state from',self.db_name,'pickle'        tempbayes = None        try:            fp = open(self.db_name, 'rb')        except IOError, e:            if e.errno != errno.ENOENT: raise        else:            tempbayes = pickle.load(fp)            fp.close()        if tempbayes:            # Copy state from tempbayes.  The use of our base-class            # __setstate__ is forced, in case self is of a subclass of            # PickledClassifier that overrides __setstate__.            classifier.Classifier.__setstate__(self,                                               tempbayes.__getstate__())            if options["globals", "verbose"]:                print >> sys.stderr, ('%s is an existing pickle,'                                      ' with %d ham and %d spam') \                      % (self.db_name, self.nham, self.nspam)        else:            # new pickle            if options["globals", "verbose"]:                print >> sys.stderr, self.db_name,'is a new pickle'            self.wordinfo = {}            self.nham = 0            self.nspam = 0    def store(self):        '''Store self as a pickle'''        if options["globals", "verbose"]:            print >> sys.stderr, 'Persisting',self.db_name,'as a pickle'        # Be as defensive as possible; keep always a safe copy.        tmp = self.db_name + '.tmp'        try:             fp = open(tmp, 'wb')             pickle.dump(self, fp, PICKLE_TYPE)             fp.close()         except IOError, e:             if options["globals", "verbose"]:                 print >> sys.stderr, 'Failed update: ' + str(e)            if fp is not None:                 os.remove(tmp)             raise        try:            # With *nix we can just rename, and (as long as permissions            # are correct) the old file will vanish.  
With win32, this            # won't work - the Python help says that there may not be            # a way to do an atomic replace, so we rename the old one,            # put the new one there, and then delete the old one.  If            # something goes wrong, there is at least a copy of the old            # one.            os.rename(tmp, self.db_name)        except OSError:            os.rename(self.db_name, self.db_name + '.bak')            os.rename(tmp, self.db_name)            os.remove(self.db_name + '.bak')    def close(self):        # we keep no resources open - nothing to do        pass# Values for our changed words mapWORD_DELETED = "D"WORD_CHANGED = "C"STATE_KEY = 'saved state'class DBDictClassifier(classifier.Classifier):    '''Classifier object persisted in a caching database'''    def __init__(self, db_name, mode='c'):        '''Constructor(database name)'''        classifier.Classifier.__init__(self)        self.statekey = STATE_KEY        self.mode = mode        self.db_name = db_name        self.load()    def close(self):        # Close our underlying database.  Better not assume all databases        # have close functions!        def noop(): pass        getattr(self.db, "close", noop)()        getattr(self.dbm, "close", noop)()        # should not be a need to drop the 'dbm' or 'db' attributes.        # but we do anyway, because it makes it more clear what has gone        # wrong if we try to keep using the database after we have closed        # it.        
if hasattr(self, "db"):            del self.db        if hasattr(self, "dbm"):            del self.dbm        if options["globals", "verbose"]:            print >> sys.stderr, 'Closed',self.db_name,'database'    def load(self):        '''Load state from database'''        if options["globals", "verbose"]:            print >> sys.stderr, 'Loading state from',self.db_name,'database'        self.dbm = dbmstorage.open(self.db_name, self.mode)        self.db = shelve.Shelf(self.dbm)        if self.db.has_key(self.statekey):            t = self.db[self.statekey]            if t[0] != classifier.PICKLE_VERSION:                raise ValueError("Can't unpickle -- version %s unknown" % t[0])            (self.nspam, self.nham) = t[1:]            if options["globals", "verbose"]:                print >> sys.stderr, ('%s is an existing database,'                                      ' with %d spam and %d ham') \                      % (self.db_name, self.nspam, self.nham)        else:            # new database            if options["globals", "verbose"]:                print >> sys.stderr, self.db_name,'is a new database'            self.nspam = 0            self.nham = 0        self.wordinfo = {}        self.changed_words = {} # value may be one of the WORD_ constants    def store(self):        '''Place state into persistent store'''        if options["globals", "verbose"]:            print >> sys.stderr, 'Persisting',self.db_name,'state in database'        # Iterate over our changed word list.        # This is *not* thread-safe - another thread changing our        # changed_words could mess us up a little.  Possibly a little        # lock while we copy and reset self.changed_words would be appropriate.        # For now, just do it the naive way.        
for key, flag in self.changed_words.iteritems():            if flag is WORD_CHANGED:                val = self.wordinfo[key]                self.db[key] = val.__getstate__()            elif flag is WORD_DELETED:                assert key not in self.wordinfo, \                       "Should not have a wordinfo for words flagged for delete"                # Word may be deleted before it was ever written.                try:                    del self.db[key]                except KeyError:                    pass            else:                raise RuntimeError, "Unknown flag value"        # Reset the changed word list.        self.changed_words = {}        # Update the global state, then do the actual save.        self._write_state_key()        self.db.sync()    def _write_state_key(self):        self.db[self.statekey] = (classifier.PICKLE_VERSION,                                  self.nspam, self.nham)    def _post_training(self):        """This is called after training on a wordstream.  We ensure that the        database is in a consistent state at this point by writing the state        key."""        self._write_state_key()    def _wordinfoget(self, word):        if isinstance(word, unicode):            word = word.encode("utf-8")        try:            return self.wordinfo[word]        except KeyError:            ret = None            if self.changed_words.get(word) is not WORD_DELETED:                r = self.db.get(word)                if r:                    ret = self.WordInfoClass()                    ret.__setstate__(r)                    self.wordinfo[word] = ret            return ret    def _wordinfoset(self, word, record):        # "Singleton" words (i.e. 
words that only have a single instance)        # take up more than 1/2 of the database, but are rarely used        # so we don't put them into the wordinfo cache, but write them        # directly to the database        # If the word occurs again, then it will be brought back in and        # never be a singleton again.        # This seems to reduce the memory footprint of the DBDictClassifier by        # as much as 60%!!!  This also has the effect of reducing the time it        # takes to store the database        if isinstance(word, unicode):            word = word.encode("utf-8")        if record.spamcount + record.hamcount <= 1:            self.db[word] = record.__getstate__()            try:                del self.changed_words[word]            except KeyError:                # This can happen if, e.g., a new word is trained as ham                # twice, then untrained once, all before a store().                pass            try:                del self.wordinfo[word]            except KeyError:                pass        else:            self.wordinfo[word] = record            self.changed_words[word] = WORD_CHANGED    def _wordinfodel(self, word):        if isinstance(word, unicode):            word = word.encode("utf-8")        del self.wordinfo[word]        self.changed_words[word] = WORD_DELETED    def _wordinfokeys(self):        wordinfokeys = self.db.keys()        del wordinfokeys[wordinfokeys.index(self.statekey)]        return wordinfokeysclass SQLClassifier(classifier.Classifier):    def __init__(self, db_name):        '''Constructor(database name)'''        classifier.Classifier.__init__(self)        self.statekey = STATE_KEY        self.db_name = db_name        self.load()    def close(self):        '''Release all database resources'''        # As we (presumably) aren't as constrained as we are by file locking,        # don't force sub-classes to override        pass    def load(self):        '''Load state from the database'''        raise 
NotImplementedError, "must be implemented in subclass"    def store(self):
storage.py - 源码说明

本页面展示了「用python实现的邮件过滤器」中的 storage.py 源码文件，采用 Python 编程语言编写，共 1,060 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与python相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?