📄 storage.py
字号:
'''Save state to the database''' self._set_row(self.statekey, self.nspam, self.nham) def cursor(self): '''Return a new db cursor''' raise NotImplementedError, "must be implemented in subclass" def fetchall(self, c): '''Return all rows as a dict''' raise NotImplementedError, "must be implemented in subclass" def commit(self, c): '''Commit the current transaction - may commit at db or cursor''' raise NotImplementedError, "must be implemented in subclass" def create_bayes(self): '''Create a new bayes table''' c = self.cursor() c.execute(self.table_definition) self.commit(c) def _get_row(self, word): '''Return row matching word''' try: c = self.cursor() c.execute("select * from bayes" " where word=%s", (word,)) except Exception, e: print >> sys.stderr, "error:", (e, word) raise rows = self.fetchall(c) if rows: return rows[0] else: return {} def _set_row(self, word, nspam, nham): c = self.cursor() if self._has_key(word): c.execute("update bayes" " set nspam=%s,nham=%s" " where word=%s", (nspam, nham, word)) else: c.execute("insert into bayes" " (nspam, nham, word)" " values (%s, %s, %s)", (nspam, nham, word)) self.commit(c) def _delete_row(self, word): c = self.cursor() c.execute("delete from bayes" " where word=%s", (word,)) self.commit(c) def _has_key(self, key): c = self.cursor() c.execute("select word from bayes" " where word=%s", (key,)) return len(self.fetchall(c)) > 0 def _wordinfoget(self, word): if isinstance(word, unicode): word = word.encode("utf-8") row = self._get_row(word) if row: item = self.WordInfoClass() item.__setstate__((row["nspam"], row["nham"])) return item else: return self.WordInfoClass() def _wordinfoset(self, word, record): if isinstance(word, unicode): word = word.encode("utf-8") self._set_row(word, record.spamcount, record.hamcount) def _wordinfodel(self, word): if isinstance(word, unicode): word = word.encode("utf-8") self._delete_row(word) def _wordinfokeys(self): c = self.cursor() c.execute("select word from bayes") rows = self.fetchall(c) return [r[0] for r in rows]class PGClassifier(SQLClassifier): '''Classifier object persisted in a Postgres database''' def __init__(self, db_name): self.table_definition = ("create table bayes (" " word bytea not null default ''," " nspam integer not null default 0," " nham integer not null default 0," " primary key(word)" ")") SQLClassifier.__init__(self, db_name) def cursor(self): return self.db.cursor() def fetchall(self, c): return c.dictfetchall() def commit(self, c): self.db.commit() def load(self): '''Load state from database''' import psycopg if options["globals", "verbose"]: print >> sys.stderr, 'Loading state from',self.db_name,'database' self.db = psycopg.connect(self.db_name) c = self.cursor() try: c.execute("select count(*) from bayes") except psycopg.ProgrammingError: self.db.rollback() self.create_bayes() if self._has_key(self.statekey): row = self._get_row(self.statekey) self.nspam = row["nspam"] self.nham = row["nham"] if options["globals", "verbose"]: print >> sys.stderr, ('%s is an existing database,' ' with %d spam and %d ham') \ % (self.db_name, self.nspam, self.nham) else: # new database if options["globals", "verbose"]: print >> sys.stderr, self.db_name,'is a new database' self.nspam = 0 self.nham = 0class mySQLClassifier(SQLClassifier): '''Classifier object persisted in a mySQL database It is assumed that the database already exists, and that the mySQL server is currently running.''' def __init__(self, data_source_name): self.table_definition = ("create table bayes (" " word varchar(255) not null default ''," " nspam integer not null default 0," " nham integer not null default 0," " primary key(word)" ");") self.host = "localhost" self.username = "root" self.password = "" db_name = "spambayes" source_info = data_source_name.split() for info in source_info: if info.startswith("host"): self.host = info[5:] elif info.startswith("user"): self.username = info[5:] elif info.startswith("pass"): self.password = info[5:] elif info.startswith("dbname"): db_name = info[7:] SQLClassifier.__init__(self, db_name) def cursor(self): return self.db.cursor() def fetchall(self, c): return c.fetchall() def commit(self, c): self.db.commit() def load(self): '''Load state from database''' import MySQLdb if options["globals", "verbose"]: print >> sys.stderr, 'Loading state from',self.db_name,'database' self.db = MySQLdb.connect(host=self.host, db=self.db_name, user=self.username, passwd=self.password) c = self.cursor() try: c.execute("select count(*) from bayes") except MySQLdb.ProgrammingError: try: self.db.rollback() except MySQLdb.NotSupportedError: # Server doesn't support rollback, so just assume that # we can keep going and create the db. This should only # happen once, anyway. pass self.create_bayes() if self._has_key(self.statekey): row = self._get_row(self.statekey) self.nspam = int(row[1]) self.nham = int(row[2]) if options["globals", "verbose"]: print >> sys.stderr, ('%s is an existing database,' ' with %d spam and %d ham') \ % (self.db_name, self.nspam, self.nham) else: # new database if options["globals", "verbose"]: print >> sys.stderr, self.db_name,'is a new database' self.nspam = 0 self.nham = 0 def _wordinfoget(self, word): if isinstance(word, unicode): word = word.encode("utf-8") row = self._get_row(word) if row: item = self.WordInfoClass() item.__setstate__((row[1], row[2])) return item else: return Noneclass CDBClassifier(classifier.Classifier): """A classifier that uses a CDB database. A CDB wordinfo database is quite small and fast but is slow to update. It is appropriate if training is done rarely (e.g. monthly or weekly using archived ham and spam). """ def __init__(self, db_name): classifier.Classifier.__init__(self) self.db_name = db_name self.statekey = STATE_KEY self.load() def _WordInfoFactory(self, counts): # For whatever reason, WordInfo's cannot be created with # constructor ham/spam counts, so we do the work here. # Since we're doing the work, we accept the ham/spam count # in the form of a comma-delimited string, as that's what # we get. ham, spam = counts.split(',') wi = classifier.WordInfo() wi.hamcount = int(ham) wi.spamcount = int(spam) return wi # Stolen from sb_dbexpimp.py # Heaven only knows what encoding non-ASCII stuff will be in # Try a few common western encodings and punt if they all fail def uunquote(self, s): for encoding in ("utf-8", "cp1252", "iso-8859-1"): try: return unicode(s, encoding) except UnicodeDecodeError: pass # punt return s def load(self): if os.path.exists(self.db_name): db = open(self.db_name, "rb") data = dict(cdb.Cdb(db)) db.close() self.nham, self.nspam = [int(i) for i in \ data[self.statekey].split(',')] self.wordinfo = dict([(self.uunquote(k), self._WordInfoFactory(v)) \ for k, v in data.iteritems() \ if k != self.statekey]) if options["globals", "verbose"]: print >> sys.stderr, ('%s is an existing CDB,' ' with %d ham and %d spam') \ % (self.db_name, self.nham, self.nspam) else: if options["globals", "verbose"]: print >> sys.stderr, self.db_name, 'is a new CDB' self.wordinfo = {} self.nham = 0 self.nspam = 0 def store(self): items = [(self.statekey, "%d,%d" % (self.nham, self.nspam))] for word, wi in self.wordinfo.iteritems(): if isinstance(word, types.UnicodeType): word = word.encode("utf-8") items.append((word, "%d,%d" % (wi.hamcount, wi.spamcount))) db = open(self.db_name, "wb") cdb.cdb_make(db, items) db.close() def close(self): # We keep no resources open - nothing to do. pass# If ZODB isn't available, then this class won't be useable, but we# still need to be able to import this module. So we pretend that all# is ok.try: from persistent import Persistentexcept ImportError: try: from ZODB import Persistent except ImportError: Persistent = objectclass _PersistentClassifier(classifier.Classifier, Persistent): def __init__(self): import ZODB from BTrees.OOBTree import OOBTree classifier.Classifier.__init__(self) self.wordinfo = OOBTree()class ZODBClassifier(object): # Allow subclasses to override classifier class. ClassifierClass = _PersistentClassifier def __init__(self, db_name, mode='c'): self.db_filename = db_name self.db_name = os.path.basename(db_name) self.closed = True self.mode = mode self.load() def __getattr__(self, att): # We pretend that we are a classifier subclass. if hasattr(self, "classifier") and hasattr(self.classifier, att): return getattr(self.classifier, att) raise AttributeError("ZODBClassifier object has no attribute '%s'" % (att,)) def __setattr__(self, att, value): # For some attributes, we change the classifier instead. if att in ("nham", "nspam") and hasattr(self, "classifier"): setattr(self.classifier, att, value) else:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -