📄 datamgr.py
# -- coding: utf-8
""" datamgr.py - Data manager module for HarvestMan.
    This module is part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    Oct 13 2006    Anand    Removed data lock since it is not required - Python GIL
                            automatically locks byte operations.

    Feb 2 2007     Anand    Re-added function parse_style_sheet which went missing.

    Feb 26 2007    Anand    Fixed bug in check_duplicate_download for stylesheets.
                            Also rewrote logic.

    Mar 05 2007    Anand    Added method get_last_modified_time_and_data to support
                            server-side cache checking using HTTP 304. Fixed a small
                            bug in css url handling.

    Apr 19 2007    Anand    Made to work with URL collections. Moved url mapping
                            dictionary here. Moved CSS parsing logic to pageparser
                            module.

    Feb 13 2008    Anand    Replaced URL dictionary with disk caching binary search
                            tree. Other changes done later -> Got rid of many redundant
                            lists which were wasting memory. Need to trim this further.

    Feb 14 2008    Anand    Many changes. Replaced/removed datastructures. Merged cache
                            updating functions. Details in doc/Datastructures.txt .

    April 4 2008   Anand    Added update_url method and corresponding update method in
                            bst.py to update state of URLs after download. Added
                            statement to print broken links information at end.

    Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import os, sys
import shutil
import time
import math
import re
import sha
import copy
import random
import shelve
import tarfile
import zlib
import threading

# Utils
from harvestman.lib import utils
from harvestman.lib import urlparser

from harvestman.lib.mirrors import HarvestManMirrorManager
from harvestman.lib.db import HarvestManDbManager

from harvestman.lib.urlthread import HarvestManUrlThreadPool
from harvestman.lib.connector import *
from harvestman.lib.methodwrapper import MethodWrapperMetaClass

from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.bst import BST
from harvestman.lib.common.pydblite import Base

# Defining pluggable functions
__plugins__ = { 'download_url_plugin': 'HarvestManDataManager:download_url',
                'post_download_setup_plugin': 'HarvestManDataManager:post_download_setup',
                'print_project_info_plugin': 'HarvestManDataManager:print_project_info',
                'dump_url_tree_plugin': 'HarvestManDataManager:dump_url_tree'}

# Defining functions with callbacks
__callbacks__ = { 'download_url_callback': 'HarvestManDataManager:download_url',
                  'post_download_setup_callback' : 'HarvestManDataManager:post_download_setup' }

class HarvestManDataManager(object):
    """ The data manager cum indexer class """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'datamgr'

    def __init__(self):
        self.reset()

    def reset(self):
        # URLs which failed with any error
        self._numfailed = 0
        # URLs which failed even after a re-download
        self._numfailed2 = 0
        # URLs which were retried
        self._numretried = 0
        self.cache = None
        self.savedfiles = 0
        self.reposfiles = 0
        self.cachefiles = 0
        self.filteredfiles = 0
        # Config object
        self._cfg = objects.config
        # Dictionary of servers crawled and
        # their meta-data. Meta-data is
        # a dictionary which currently
        # has only one entry.
        # i.e accept-ranges.
        self._serversdict = {}
        # byte count
        self.bytes = 0L
        # saved bytes count
        self.savedbytes = 0L
        # Redownload flag
        self._redownload = False
        # Mirror manager
        self.mirrormgr = HarvestManMirrorManager.getInstance()
        # Condition object for synchronization
        self.cond = threading.Condition(threading.Lock())
        self._urldb = None
        self.collections = None

    def initialize(self):
        """ Do initializations per project """

        # Url thread group class for multithreaded downloads
        if self._cfg.usethreads:
            self._urlThreadPool = HarvestManUrlThreadPool()
            self._urlThreadPool.spawn_threads()
        else:
            self._urlThreadPool = None

        # URL database, a BST with disk-caching
        self._urldb = BST()
        # Collections database, a BST with disk-caching
        self.collections = BST()
        # For testing, don't set this otherwise we might
        # be left with many orphaned .bidx... folders!
        if not self._cfg.testing:
            self._urldb.set_auto(2)
            self.collections.set_auto(2)

        # Load any mirrors
        self.mirrormgr.load_mirrors(self._cfg.mirrorfile)
        # Set mirror search flag
        self.mirrormgr.mirrorsearch = self._cfg.mirrorsearch

    def get_urldb(self):
        return self._urldb

    def add_url(self, urlobj):
        """ Add urlobject urlobj to the local dictionary """

        # print 'Adding %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.insert(urlobj.index, urlobj)

    def update_url(self, urlobj):
        """ Update urlobject urlobj in the local dictionary """

        # print 'Adding %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.update(urlobj.index, urlobj)

    def get_url(self, index):

        # return self._urldict[str(index)]
        return self._urldb.lookup(index)

    def get_original_url(self, urlobj):

        # Return the original URL object for
        # duplicate URLs. This is useful for
        # processing URL objects obtained from
        # the collection object, because many
        # of them might be duplicate and would
        # not have any post-download information
        # such a headers etc.
        if urlobj.refindex != -1:
            return self.get_url(urlobj.refindex)
        else:
            # Return the same URL object to avoid
            # an <if None> check on the caller
            return urlobj

    def get_proj_cache_filename(self):
        """ Return the cache filename for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            cachedir = os.path.join(self._cfg.projdir, "hm-cache")
            cachefilename = os.path.join(cachedir, 'cache')

            return cachefilename
        else:
            return ''

    def get_proj_cache_directory(self):
        """ Return the cache directory for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            return os.path.join(self._cfg.projdir, "hm-cache")
        else:
            return ''

    def get_server_dictionary(self):
        return self._serversdict

    def supports_range_requests(self, urlobj):
        """ Check whether the given url object supports range requests """

        # Look up its server in the dictionary
        server = urlobj.get_full_domain()
        if server in self._serversdict:
            d = self._serversdict[server]
            return d.get('accept-ranges', False)

        return False

    def read_project_cache(self):
        """ Try to read the project cache file """

        # Get cache filename
        info('Reading Project Cache...')
        cachereader = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        obj, found = cachereader.read_project_cache()
        self._cfg.cachefound = found
        self.cache = obj
        if not found:
            # Fresh cache - create structure...
            self.cache.create('url','last_modified','etag', 'updated','location','checksum',
                              'content_length','data','headers')

            # Create an index on URL
            self.cache.create_index('url')
        else:
            pass

    def write_file_from_cache(self, urlobj):
        """ Write file from url cache. This works only if the cache
        dictionary of this url has a key named 'data' """

        ret = False

        # print 'Inside write_file_from_cache...'
        url = urlobj.get_full_url()
        content = self.cache._url[url]
        if len(content):
            # Value itself is a dictionary
            item = content[0]
            if not item.has_key('data'):
                return ret
            else:
                urldata = item['data']
                if urldata:
                    fileloc = item['location']
                    # Write file
                    extrainfo("Updating file from cache=>", fileloc)
                    try:
                        if SUCCESS(self.create_local_directory(os.path.dirname(fileloc))):
                            f=open(fileloc, 'wb')
                            f.write(zlib.decompress(urldata))
                            f.close()
                            ret = True
                    except (IOError, zlib.error), e:
                        error("Error:",e)

        return ret

    def update_cache_for_url(self, urlobj, filename, urldata, contentlen, lastmodified, tag):
        """ Method to update the cache information for the URL 'url'
        associated to file 'filename' on the disk """

        # if page caching is disabled, skip this...
        if not objects.config.pagecache:
            return

        url = urlobj.get_full_url()
        if urldata:
            csum = sha.new(urldata).hexdigest()
        else:
            csum = ''

        # Update all cache keys
        content = self.cache._url[url]
        if content:
            rec = content[0]
            self.cache.update(rec, checksum=csum, location=filename, content_length=contentlen,
                              last_modified=lastmodified, etag=tag, updated=True)
            if self._cfg.datacache:
                self.cache.update(rec, data=zlib.compress(urldata))
        else:
            # Insert as new values
            if self._cfg.datacache:
                self.cache.insert(url=url, checksum=csum, location=filename, content_length=contentlen,
                                  last_modified=lastmodified, etag=tag, updated=True,
                                  data=zlib.compress(urldata))
            else:
                self.cache.insert(url=url, checksum=csum, location=filename, content_length=contentlen,
                                  last_modified=lastmodified, etag=tag, updated=True)

    def get_url_cache_data(self, urlobj):
        """ Get cached data for the URL from disk """

        # This is returned as Unix time, i.e number of
        # seconds since Epoch.

        # This will be called from connector to avoid downloading
        # URL data using HTTP 304. However, we support this only
        # if we have data for the URL.
        if (not self._cfg.pagecache) or (not self._cfg.datacache):
            return ''

        url = urlobj.get_full_url()

        content = self.cache._url[url]
        if content:
            item = content[0]
            # Check if we have the data for the URL
            data = item.get('data','')
            if data:
                try:
                    return zlib.decompress(data)
                except zlib.error, e:
                    error('Error:',e)
                    return ''

        return ''

    def get_last_modified_time(self, urlobj):
        """ Return last-modified-time and data of the given URL if it
        was found in the cache """

        # This is returned as Unix time, i.e number of
        # seconds since Epoch.

        # This will be called from connector to avoid downloading
        # URL data using HTTP 304.
        if (not self._cfg.pagecache):
            return ''

        url = urlobj.get_full_url()
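The cache methods above (update_cache_for_url, get_url_cache_data, write_file_from_cache) all follow one pattern: page data is checksummed with SHA-1 and stored zlib-compressed in the cache record, then decompressed on the way back out. The fragment below is a minimal, self-contained sketch of that round trip in the same Python 2 idiom as the module; the dictionary fake_cache and the helper names store/fetch are illustrative stand-ins for the pydblite-backed self.cache, not part of HarvestMan itself.

    # Sketch (Python 2) of the checksum + compress round trip used by
    # update_cache_for_url() / get_url_cache_data(). 'fake_cache' merely
    # stands in for the pydblite Base record store used above.
    import sha
    import zlib

    fake_cache = {}

    def store(url, urldata, filename):
        # Checksum the raw data, compress it for storage
        fake_cache[url] = {'checksum': sha.new(urldata).hexdigest(),
                           'location': filename,
                           'content_length': len(urldata),
                           'data': zlib.compress(urldata)}

    def fetch(url):
        # Return the decompressed data, or '' if not cached
        item = fake_cache.get(url)
        if item and item.get('data'):
            try:
                return zlib.decompress(item['data'])
            except zlib.error:
                return ''
        return ''

    if __name__ == '__main__':
        store('http://www.example.com/index.html', '<html>hello</html>', 'index.html')
        print fetch('http://www.example.com/index.html')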