📄 storage.py
字号:
#! /usr/bin/env python'''storage.py - Spambayes database management framework.Classes: PickledClassifier - Classifier that uses a pickle db DBDictClassifier - Classifier that uses a shelve db PGClassifier - Classifier that uses postgres mySQLClassifier - Classifier that uses mySQL CBDClassifier - Classifier that uses CDB ZODBClassifier - Classifier that uses ZODB ZEOClassifier - Classifier that uses ZEO Trainer - Classifier training observer SpamTrainer - Trainer for spam HamTrainer - Trainer for hamAbstract: *Classifier are subclasses of Classifier (classifier.Classifier) that add automatic state store/restore function to the Classifier class. All SQL based classifiers are subclasses of SQLClassifier, which is a subclass of Classifier. PickledClassifier is a Classifier class that uses a cPickle datastore. This database is relatively small, but slower than other databases. DBDictClassifier is a Classifier class that uses a database store. Trainer is concrete class that observes a Corpus and trains a Classifier object based upon movement of messages between corpora When an add message notification is received, the trainer trains the database with the message, as spam or ham as appropriate given the type of trainer (spam or ham). When a remove message notification is received, the trainer untrains the database as appropriate. SpamTrainer and HamTrainer are convenience subclasses of Trainer, that initialize as the appropriate type of TrainerTo Do: o Suggestions? '''# This module is part of the spambayes project, which is Copyright 2002-5# The Python Software Foundation and is covered by the Python Software# Foundation license.### Note to authors - please direct all prints to sys.stderr. In some### situations prints to sys.stdout will garble the message (e.g., in### hammiefilter).__author__ = "Neale Pickett <neale@woozle.org>, \Tim Stone <tim@fourstonesExpressions.com>"__credits__ = "All the spambayes contributors."try: True, Falseexcept NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def bool(val): return not not valimport osimport sysimport timeimport typesfrom spambayes import classifierfrom spambayes.Options import options, get_pathname_optionimport cPickle as pickleimport errnoimport shelvefrom spambayes import cdbfrom spambayes import dbmstorage# Make shelve use binary pickles by default.oldShelvePickler = shelve.Picklerdef binaryDefaultPickler(f, binary=1): return oldShelvePickler(f, binary)shelve.Pickler = binaryDefaultPicklerPICKLE_TYPE = 1NO_UPDATEPROBS = False # Probabilities will not be autoupdated with trainingUPDATEPROBS = True # Probabilities will be autoupdated with trainingclass PickledClassifier(classifier.Classifier): '''Classifier object persisted in a pickle''' def __init__(self, db_name): classifier.Classifier.__init__(self) self.db_name = db_name self.load() def load(self): '''Load this instance from the pickle.''' # This is a bit strange, because the loading process # creates a temporary instance of PickledClassifier, from which # this object's state is copied. This is a nuance of the way # that pickle does its job. # Tim sez: that's because this is an unusual way to use pickle. # Note that nothing non-trivial is actually copied, though: # assignment merely copies a pointer. The actual wordinfo etc # objects are shared between tempbayes and self, and the tiny # tempbayes object is reclaimed when load() returns. if options["globals", "verbose"]: print >> sys.stderr, 'Loading state from',self.db_name,'pickle' tempbayes = None try: fp = open(self.db_name, 'rb') except IOError, e: if e.errno != errno.ENOENT: raise else: tempbayes = pickle.load(fp) fp.close() if tempbayes: # Copy state from tempbayes. The use of our base-class # __setstate__ is forced, in case self is of a subclass of # PickledClassifier that overrides __setstate__. classifier.Classifier.__setstate__(self, tempbayes.__getstate__()) if options["globals", "verbose"]: print >> sys.stderr, ('%s is an existing pickle,' ' with %d ham and %d spam') \ % (self.db_name, self.nham, self.nspam) else: # new pickle if options["globals", "verbose"]: print >> sys.stderr, self.db_name,'is a new pickle' self.wordinfo = {} self.nham = 0 self.nspam = 0 def store(self): '''Store self as a pickle''' if options["globals", "verbose"]: print >> sys.stderr, 'Persisting',self.db_name,'as a pickle' # Be as defensive as possible; keep always a safe copy. tmp = self.db_name + '.tmp' try: fp = open(tmp, 'wb') pickle.dump(self, fp, PICKLE_TYPE) fp.close() except IOError, e: if options["globals", "verbose"]: print >> sys.stderr, 'Failed update: ' + str(e) if fp is not None: os.remove(tmp) raise try: # With *nix we can just rename, and (as long as permissions # are correct) the old file will vanish. With win32, this # won't work - the Python help says that there may not be # a way to do an atomic replace, so we rename the old one, # put the new one there, and then delete the old one. If # something goes wrong, there is at least a copy of the old # one. os.rename(tmp, self.db_name) except OSError: os.rename(self.db_name, self.db_name + '.bak') os.rename(tmp, self.db_name) os.remove(self.db_name + '.bak') def close(self): # we keep no resources open - nothing to do pass# Values for our changed words mapWORD_DELETED = "D"WORD_CHANGED = "C"STATE_KEY = 'saved state'class DBDictClassifier(classifier.Classifier): '''Classifier object persisted in a caching database''' def __init__(self, db_name, mode='c'): '''Constructor(database name)''' classifier.Classifier.__init__(self) self.statekey = STATE_KEY self.mode = mode self.db_name = db_name self.load() def close(self): # Close our underlying database. Better not assume all databases # have close functions! def noop(): pass getattr(self.db, "close", noop)() getattr(self.dbm, "close", noop)() # should not be a need to drop the 'dbm' or 'db' attributes. # but we do anyway, because it makes it more clear what has gone # wrong if we try to keep using the database after we have closed # it. if hasattr(self, "db"): del self.db if hasattr(self, "dbm"): del self.dbm if options["globals", "verbose"]: print >> sys.stderr, 'Closed',self.db_name,'database' def load(self): '''Load state from database''' if options["globals", "verbose"]: print >> sys.stderr, 'Loading state from',self.db_name,'database' self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) if self.db.has_key(self.statekey): t = self.db[self.statekey] if t[0] != classifier.PICKLE_VERSION: raise ValueError("Can't unpickle -- version %s unknown" % t[0]) (self.nspam, self.nham) = t[1:] if options["globals", "verbose"]: print >> sys.stderr, ('%s is an existing database,' ' with %d spam and %d ham') \ % (self.db_name, self.nspam, self.nham) else: # new database if options["globals", "verbose"]: print >> sys.stderr, self.db_name,'is a new database' self.nspam = 0 self.nham = 0 self.wordinfo = {} self.changed_words = {} # value may be one of the WORD_ constants def store(self): '''Place state into persistent store''' if options["globals", "verbose"]: print >> sys.stderr, 'Persisting',self.db_name,'state in database' # Iterate over our changed word list. # This is *not* thread-safe - another thread changing our # changed_words could mess us up a little. Possibly a little # lock while we copy and reset self.changed_words would be appropriate. # For now, just do it the naive way. for key, flag in self.changed_words.iteritems(): if flag is WORD_CHANGED: val = self.wordinfo[key] self.db[key] = val.__getstate__() elif flag is WORD_DELETED: assert key not in self.wordinfo, \ "Should not have a wordinfo for words flagged for delete" # Word may be deleted before it was ever written. try: del self.db[key] except KeyError: pass else: raise RuntimeError, "Unknown flag value" # Reset the changed word list. self.changed_words = {} # Update the global state, then do the actual save. self._write_state_key() self.db.sync() def _write_state_key(self): self.db[self.statekey] = (classifier.PICKLE_VERSION, self.nspam, self.nham) def _post_training(self): """This is called after training on a wordstream. We ensure that the database is in a consistent state at this point by writing the state key.""" self._write_state_key() def _wordinfoget(self, word): if isinstance(word, unicode): word = word.encode("utf-8") try: return self.wordinfo[word] except KeyError: ret = None if self.changed_words.get(word) is not WORD_DELETED: r = self.db.get(word) if r: ret = self.WordInfoClass() ret.__setstate__(r) self.wordinfo[word] = ret return ret def _wordinfoset(self, word, record): # "Singleton" words (i.e. words that only have a single instance) # take up more than 1/2 of the database, but are rarely used # so we don't put them into the wordinfo cache, but write them # directly to the database # If the word occurs again, then it will be brought back in and # never be a singleton again. # This seems to reduce the memory footprint of the DBDictClassifier by # as much as 60%!!! This also has the effect of reducing the time it # takes to store the database if isinstance(word, unicode): word = word.encode("utf-8") if record.spamcount + record.hamcount <= 1: self.db[word] = record.__getstate__() try: del self.changed_words[word] except KeyError: # This can happen if, e.g., a new word is trained as ham # twice, then untrained once, all before a store(). pass try: del self.wordinfo[word] except KeyError: pass else: self.wordinfo[word] = record self.changed_words[word] = WORD_CHANGED def _wordinfodel(self, word): if isinstance(word, unicode): word = word.encode("utf-8") del self.wordinfo[word] self.changed_words[word] = WORD_DELETED def _wordinfokeys(self): wordinfokeys = self.db.keys() del wordinfokeys[wordinfokeys.index(self.statekey)] return wordinfokeysclass SQLClassifier(classifier.Classifier): def __init__(self, db_name): '''Constructor(database name)''' classifier.Classifier.__init__(self) self.statekey = STATE_KEY self.db_name = db_name self.load() def close(self): '''Release all database resources''' # As we (presumably) aren't as constrained as we are by file locking, # don't force sub-classes to override pass def load(self): '''Load state from the database''' raise NotImplementedError, "must be implemented in subclass" def store(self):
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -