📄 datamgr.py
# -- coding: utf-8
""" datamgr.py - Data manager module for HarvestMan.
    This module is part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    Oct 13 2006    Anand    Removed data lock since it is not required - Python GIL
                            automatically locks byte operations.

    Feb 2 2007     Anand    Re-added function parse_style_sheet which went missing.

    Feb 26 2007    Anand    Fixed bug in check_duplicate_download for stylesheets.
                            Also rewrote logic.

    Mar 05 2007    Anand    Added method get_last_modified_time_and_data to support
                            server-side cache checking using HTTP 304. Fixed a small
                            bug in css url handling.

    Apr 19 2007    Anand    Made to work with URL collections. Moved url mapping
                            dictionary here. Moved CSS parsing logic to pageparser
                            module.

    Feb 13 2008    Anand    Replaced URL dictionary with disk caching binary search
                            tree. Other changes done later -> Got rid of many redundant
                            lists which were wasting memory. Need to trim this further.

    Feb 14 2008    Anand    Many changes. Replaced/removed datastructures. Merged cache
                            updating functions. Details in doc/Datastructures.txt .

    April 4 2008   Anand    Added update_url method and corresponding update method in
                            bst.py to update state of URLs after download. Added
                            statement to print broken links information at end.

    Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import os, sys
import shutil
import time
import math
import re
import sha
import copy
import random
import shelve
import tarfile
import zlib
import threading

# Utils
from harvestman.lib import utils
from harvestman.lib import urlparser

from harvestman.lib.mirrors import HarvestManMirrorManager
from harvestman.lib.db import HarvestManDbManager

from harvestman.lib.urlthread import HarvestManUrlThreadPool
from harvestman.lib.connector import *
from harvestman.lib.methodwrapper import MethodWrapperMetaClass

from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.bst import BST
from harvestman.lib.common.pydblite import Base

# Defining pluggable functions
__plugins__ = { 'download_url_plugin': 'HarvestManDataManager:download_url',
                'post_download_setup_plugin': 'HarvestManDataManager:post_download_setup',
                'print_project_info_plugin': 'HarvestManDataManager:print_project_info',
                'dump_url_tree_plugin': 'HarvestManDataManager:dump_url_tree'}

# Defining functions with callbacks
__callbacks__ = { 'download_url_callback': 'HarvestManDataManager:download_url',
                  'post_download_setup_callback' : 'HarvestManDataManager:post_download_setup' }

class HarvestManDataManager(object):
    """ The data manager cum indexer class """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'datamgr'

    def __init__(self):
        self.reset()

    def reset(self):
        # URLs which failed with any error
        self._numfailed = 0
        # URLs which failed even after a re-download
        self._numfailed2 = 0
        # URLs which were retried
        self._numretried = 0
        self.cache = None
        self.savedfiles = 0
        self.reposfiles = 0
        self.cachefiles = 0
        self.filteredfiles = 0
        # Config object
        self._cfg = objects.config
        # Dictionary of servers crawled and
        # their meta-data. Meta-data is
        # a dictionary which currently
        # has only one entry.
        # i.e accept-ranges.
        self._serversdict = {}
        # byte count
        self.bytes = 0L
        # saved bytes count
        self.savedbytes = 0L
        # Redownload flag
        self._redownload = False
        # Mirror manager
        self.mirrormgr = HarvestManMirrorManager.getInstance()
        # Condition object for synchronization
        self.cond = threading.Condition(threading.Lock())
        self._urldb = None
        self.collections = None

    def initialize(self):
        """ Do initializations per project """

        # Url thread group class for multithreaded downloads
        if self._cfg.usethreads:
            self._urlThreadPool = HarvestManUrlThreadPool()
            self._urlThreadPool.spawn_threads()
        else:
            self._urlThreadPool = None

        # URL database, a BST with disk-caching
        self._urldb = BST()
        # Collections database, a BST with disk-caching
        self.collections = BST()
        # For testing, don't set this otherwise we might
        # be left with many orphaned .bidx... folders!
        if not self._cfg.testing:
            self._urldb.set_auto(2)
            self.collections.set_auto(2)

        # Load any mirrors
        self.mirrormgr.load_mirrors(self._cfg.mirrorfile)
        # Set mirror search flag
        self.mirrormgr.mirrorsearch = self._cfg.mirrorsearch

    def get_urldb(self):
        return self._urldb

    def add_url(self, urlobj):
        """ Add urlobject urlobj to the local dictionary """

        # print 'Adding %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.insert(urlobj.index, urlobj)

    def update_url(self, urlobj):
        """ Update urlobject urlobj in the local dictionary """

        # print 'Adding %s with index %d' % (urlobj.get_full_url(), urlobj.index)
        self._urldb.update(urlobj.index, urlobj)

    def get_url(self, index):

        # return self._urldict[str(index)]
        return self._urldb.lookup(index)

    def get_original_url(self, urlobj):

        # Return the original URL object for
        # duplicate URLs. This is useful for
        # processing URL objects obtained from
        # the collection object, because many
        # of them might be duplicate and would
        # not have any post-download information
        # such a headers etc.
        if urlobj.refindex != -1:
            return self.get_url(urlobj.refindex)
        else:
            # Return the same URL object to avoid
            # an <if None> check on the caller
            return urlobj

    def get_proj_cache_filename(self):
        """ Return the cache filename for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            cachedir = os.path.join(self._cfg.projdir, "hm-cache")
            cachefilename = os.path.join(cachedir, 'cache')

            return cachefilename
        else:
            return ''

    def get_proj_cache_directory(self):
        """ Return the cache directory for the current project """

        # Note that this function does not actually build the cache directory.
        # Get the cache file path
        if self._cfg.projdir and self._cfg.project:
            return os.path.join(self._cfg.projdir, "hm-cache")
        else:
            return ''

    def get_server_dictionary(self):
        return self._serversdict

    def supports_range_requests(self, urlobj):
        """ Check whether the given url object supports range requests """

        # Look up its server in the dictionary
        server = urlobj.get_full_domain()
        if server in self._serversdict:
            d = self._serversdict[server]
            return d.get('accept-ranges', False)

        return False

    def read_project_cache(self):
        """ Try to read the project cache file """

        # Get cache filename
        info('Reading Project Cache...')
        cachereader = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        obj, found = cachereader.read_project_cache()
        self._cfg.cachefound = found
        self.cache = obj
        if not found:
            # Fresh cache - create structure...
            self.cache.create('url','last_modified','etag', 'updated','location','checksum',
                              'content_length','data','headers')

            # Create an index on URL
            self.cache.create_index('url')
        else:
            pass

    def write_file_from_cache(self, urlobj):
        """ Write file from url cache. This works only if the cache
        dictionary of this url has a key named 'data' """

        ret = False

        # print 'Inside write_file_from_cache...'
        url = urlobj.get_full_url()
        content = self.cache._url[url]
        if len(content):
            # Value itself is a dictionary
            item = content[0]
            if not item.has_key('data'):
                return ret
            else:
                urldata = item['data']
                if urldata:
                    fileloc = item['location']
                    # Write file
                    extrainfo("Updating file from cache=>", fileloc)
                    try:
                        if SUCCESS(self.create_local_directory(os.path.dirname(fileloc))):
                            f=open(fileloc, 'wb')
                            f.write(zlib.decompress(urldata))
                            f.close()
                            ret = True
                    except (IOError, zlib.error), e:
                        error("Error:",e)

        return ret

    def update_cache_for_url(self, urlobj, filename, urldata, contentlen, lastmodified, tag):
        """ Method to update the cache information for the URL 'url'
        associated to file 'filename' on the disk """

        # if page caching is disabled, skip this...
        if not objects.config.pagecache:
            return

        url = urlobj.get_full_url()
        if urldata:
            csum = sha.new(urldata).hexdigest()
        else:
            csum = ''

        # Update all cache keys
        content = self.cache._url[url]
        if content:
            rec = content[0]
            self.cache.update(rec, checksum=csum, location=filename, content_length=contentlen,
                              last_modified=lastmodified, etag=tag, updated=True)
            if self._cfg.datacache:
                self.cache.update(rec, data=zlib.compress(urldata))
        else:
            # Insert as new values
            if self._cfg.datacache:
                self.cache.insert(url=url, checksum=csum, location=filename, content_length=contentlen,
                                  last_modified=lastmodified, etag=tag, updated=True,
                                  data=zlib.compress(urldata))
            else:
                self.cache.insert(url=url, checksum=csum, location=filename, content_length=contentlen,
                                  last_modified=lastmodified, etag=tag, updated=True)

    def get_url_cache_data(self, urlobj):
        """ Get cached data for the URL from disk """

        # This is returned as Unix time, i.e number of
        # seconds since Epoch.

        # This will be called from connector to avoid downloading
        # URL data using HTTP 304. However, we support this only
        # if we have data for the URL.
        if (not self._cfg.pagecache) or (not self._cfg.datacache):
            return ''

        url = urlobj.get_full_url()

        content = self.cache._url[url]
        if content:
            item = content[0]
            # Check if we have the data for the URL
            data = item.get('data','')
            if data:
                try:
                    return zlib.decompress(data)
                except zlib.error, e:
                    error('Error:',e)
                    return ''

        return ''

    def get_last_modified_time(self, urlobj):
        """ Return last-modified-time and data of the given URL if it
        was found in the cache """

        # This is returned as Unix time, i.e number of
        # seconds since Epoch.

        # This will be called from connector to avoid downloading
        # URL data using HTTP 304.
        if (not self._cfg.pagecache):
            return ''

        url = urlobj.get_full_url()
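The cache methods above (update_cache_for_url, get_url_cache_data, write_file_from_cache) all follow one pattern: page data is checksummed with SHA-1 and stored zlib-compressed in the cache record, then decompressed on the way back out. The fragment below is a minimal, self-contained sketch of that round trip in the same Python 2 idiom as the module; the dictionary fake_cache and the helper names store/fetch are illustrative stand-ins for the pydblite-backed self.cache, not part of HarvestMan itself.

    # Sketch (Python 2) of the checksum + compress round trip used by
    # update_cache_for_url() / get_url_cache_data(). 'fake_cache' merely
    # stands in for the pydblite Base record store used above.
    import sha
    import zlib

    fake_cache = {}

    def store(url, urldata, filename):
        # Checksum the raw data, compress it for storage
        fake_cache[url] = {'checksum': sha.new(urldata).hexdigest(),
                           'location': filename,
                           'content_length': len(urldata),
                           'data': zlib.compress(urldata)}

    def fetch(url):
        # Return the decompressed data, or '' if not cached
        item = fake_cache.get(url)
        if item and item.get('data'):
            try:
                return zlib.decompress(item['data'])
            except zlib.error:
                return ''
        return ''

    if __name__ == '__main__':
        store('http://www.example.com/index.html', '<html>hello</html>', 'index.html')
        print fetch('http://www.example.com/index.html')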