db.py
来自「Harvestman-最新版本」· Python 代码 · 共 134 行
PY
134 行
# -*- coding: utf-8 -*-
"""
db.py - Provides HarvestManDbManager class which takes care
of creating and managing the user's crawl database. The
crawl database is an sqlite database created as
$HOME/.harvestman/db/crawls.db where $HOME is the home
directory of the user. The crawls database is updated with
meta-data of every crawl after a crawl is completed.

Created by Anand B Pillai <abpillai at gmail dot com> Mar 26 2008

Copyright (C) 2008 Anand B Pillai.
"""

import os
import sys
import time

from harvestman.lib.common.common import objects, extrainfo, logconsole


def adapt_datetime(ts):
    """Return *ts* converted to seconds since the epoch (a float).

    *ts* must provide a ``timetuple()`` method (for example a
    ``datetime.datetime``). The tuple is handed to ``time.mktime``, which
    interprets it as local time.
    """
    return time.mktime(ts.timetuple())


class HarvestManDbManager(object):
    """ Class performing the creation/management of crawl databases.

    All methods are classmethods; the class is used as a namespace and is
    never instantiated in this module. Every database method silently does
    nothing when the ``sqlite3`` module cannot be imported.
    """

    # Id of the project row most recently written by add_project_record().
    # add_stats_record() uses it as the primary key of the statistics row.
    projid = 0

    @classmethod
    def try_import(cls):
        """Return the ``sqlite3`` module, or None if it is not importable."""
        try:
            import sqlite3
        except ImportError:
            return None
        return sqlite3

    @classmethod
    def _dbfile(cls):
        """Return the path of crawls.db inside the configured userdbdir."""
        return os.path.join(objects.config.userdbdir, "crawls.db")

    @classmethod
    def create_user_database(cls):
        """Create crawls.db with the ``projects`` and ``project_stats`` tables.

        Returns None without doing anything when sqlite3 is unavailable.
        Errors raised by sqlite3 (for instance when the tables already
        exist) propagate to the caller; the connection is closed either way.
        """
        sqlite3 = cls.try_import()
        if sqlite3 is None:
            return

        logconsole("Creating user's crawl database file in %s..." % objects.config.userdbdir)

        conn = sqlite3.connect(cls._dbfile())
        try:
            c = conn.cursor()
            try:
                # Create table for projects
                # NOTE: dropping the table first was disabled because it was
                # causing a problem in darwin:
                # c.execute("drop table if exists projects")
                c.execute(
                    "create table projects "
                    "(id integer primary key autoincrement default 0, "
                    "time real, name text, url str, config str)"
                )

                # Create table for project statistics
                # We are storing the information for
                # 1. number of urls scanned
                # 2. number of urls processed (fetched/crawled)
                # 3. number of URLs which were crawl-filtered
                # 4. number of urls failed to fetch
                # 5. number of urls with 404 errors
                # 6. number of URLs which hit the cache
                # 7. number of servers scanned
                # 8. number of unique directories scanned
                # 9. number of files saved
                # 10. Amount of data fetched in bytes
                # 11. the total time for the crawl.
                # NOTE: dropping the table first was disabled because it was
                # causing a problem in darwin:
                # c.execute("drop table project_stats")
                c.execute(
                    "create table project_stats "
                    "(project_id integer primary key, urls integer, "
                    "procurls integer, filteredurls integer, "
                    "failedurls integer, brokenurls integer, "
                    "cacheurls integer, servers integer, "
                    "directories integer, files integer, "
                    "data real, duration text)"
                )
                # Harmless if the statements were already auto-committed;
                # makes persisting the schema explicit.
                conn.commit()
            finally:
                c.close()
        finally:
            # Closing only the cursor leaves the database handle open.
            conn.close()

    @classmethod
    def add_project_record(cls):
        """Insert a row describing the current project into ``projects``.

        The row holds the current time, the project name, the start URL and
        ``repr()`` of a copy of ``objects.config``. Afterwards the largest
        ``id`` in the table is stored in ``cls.projid``.

        Returns None without doing anything when sqlite3 is unavailable.
        """
        sqlite3 = cls.try_import()
        if sqlite3 is None:
            return

        extrainfo('Writing project record to crawls database...')

        # The configuration is stored as the repr() of a copy of the config.
        cfg = objects.config.copy()

        conn = sqlite3.connect(cls._dbfile())
        try:
            c = conn.cursor()
            try:
                c.execute("insert into projects (time, name, url, config) values(?,?,?,?)",
                          (time.time(), cfg.project, cfg.url, repr(cfg)))
                conn.commit()
                # Fetch the most recent project id and save it as projid
                c.execute("select max(id) from projects")
                cls.projid = c.fetchone()[0]
            finally:
                c.close()
        finally:
            conn.close()

        extrainfo("Done.")

    @classmethod
    def add_stats_record(cls, statsd):
        """Insert the statistics of the current crawl into ``project_stats``.

        *statsd* is a mapping that must contain the keys 'links',
        'processed', 'filtered', 'fatal', 'broken', 'filesinrepos',
        'extservers', 'extdirs', 'files', 'bytes' and 'fetchtime'; a missing
        key raises KeyError. The row is keyed by ``cls.projid``.

        Returns None without doing anything when sqlite3 is unavailable.
        """
        sqlite3 = cls.try_import()
        if sqlite3 is None:
            return

        logconsole('Writing project statistics to crawl database...')

        conn = sqlite3.connect(cls._dbfile())
        try:
            c = conn.cursor()
            try:
                # Values are positional and follow the column order of the
                # project_stats table created in create_user_database().
                t = (cls.projid,               # project_id
                     statsd['links'],          # urls
                     statsd['processed'],      # procurls
                     statsd['filtered'],       # filteredurls
                     statsd['fatal'],          # failedurls
                     statsd['broken'],         # brokenurls
                     statsd['filesinrepos'],   # cacheurls
                     statsd['extservers'] + 1,  # servers
                     statsd['extdirs'] + 1,    # directories
                     statsd['files'],          # files
                     statsd['bytes'],          # data
                     '%.2f' % statsd['fetchtime'])  # duration (stored as text)
                c.execute("insert into project_stats values(?,?,?,?,?,?,?,?,?,?,?,?)", t)
                conn.commit()
            finally:
                c.close()
        finally:
            conn.close()


if __name__ == "__main__":
    HarvestManDbManager.create_user_database()
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?