📄 datamgr.py
""" HarvestManDataManager.py - Data manager module for HarvestMan.
This software is part of the HarvestMan(R) program.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
Copyright (C) 2004-2005 Anand B Pillai.
Created July 29 2003    Anand    Modifications for 1.1 release
                                 (new module).

Modification History

 Oct 1 2003      Anand    Fixed bugs with localizing anchor type links.
 Jan 2 2004      Anand    1.3.1 bug fix version.
 Jun 4-9 2004    Anand    1.4 development.
                          * Fixed a bug in localising links;
                            directory urls need to be skipped.
 Jun 14 2004     Anand    1.3.9 release.
"""
import os, sys
import time
import math
import binascii
import shutil
from threading import Lock, RLock, Condition
# Utils
import utils
from urlthread import harvestManUrlThreadPool
from connector import *
from common import *
# __metaclass__ = object
class harvestManDataManager:
""" The data manager cum indexer class """
def __init__(self):
self._numfailed = 0
self._projectcache = []
self._downloaddict = { '_savedfiles': [],
'_deletedfiles': [],
'_failedurls' : [],
'_cacheinfo' : [],
'_invalidurls': [],
'_validurls' : [],
}
# Used for localizing links
self._linksdict = {}
# Cache for connectors
self._connectorcache = {}
# byte count
self._bytes = 0L
# Redownload flag
self._redownload = False
# data lock
self._dataLock = Condition(RLock())
self._dataLock2 = Condition(RLock())
# Url thread group class for multithreaded downloads
self._urlThreadPool = harvestManUrlThreadPool()
self._urlThreadPool.spawn_threads()
def get_proj_cache_filename(self):
""" Return the cache filename for the current project """
# Note that this function does not actually build the cache directory.
configobj = GetObject('config')
# Get the cache file path
cachedir = os.path.join(configobj.projdir, "hm-cache")
cachefilename = os.path.join(cachedir, configobj.project + ".hmc")
return cachefilename
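# Illustrative sketch of the path built above, using hypothetical
# config values: with projdir='/home/user/projects/myproj' and
# project='myproj', the method would return
#
#   /home/user/projects/myproj/hm-cache/myproj.hmc
#
# i.e. the cache file always lives in an 'hm-cache' subdirectory of
# the project directory and carries the '.hmc' extension.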
def get_links_dictionary(self):
return self._linksdict
def read_project_cache(self):
""" Try to read the project cache file """
# Get cache filename
moreinfo('Reading Project Cache...')
cachereader = utils.HarvestManCacheManager(self.get_proj_cache_filename())
obj = cachereader.read_project_cache()
if obj is not None:
self._projectcache = obj
return
def is_url_cache_uptodate(self, url, filename, contentlen, urldata):
""" Check with project cache and find out if the
content needs update """
# Sep 16 2003: fixed a bug here; we also need to check
# that the file exists.
# If page caching is not enabled, return False
# straightaway!
if not GetObject('config').pagecache:
return False
# Return True if the cache is up to date (no update needed)
# and False if the cache is out of date (update needed).
# NOTE: We compare the md5 checksum of the file's data
# with the md5 checksum stored in the cache file.
if contentlen==0:
return False
# Look up the dictionary containing the url cache info
import md5
m1 = md5.new()
m1.update(urldata)
digest1 = m1.digest()
# Assume a priori that the cache is not up to date
uptodate=False
keyfound = False
fileverified=False
# Reference to dictionary in the cache list
cachekey = {}
for x in self._projectcache:
if x.has_key('url'):
if x['url'] == url:
cachekey = x
keyfound = True
x['updated']=False
fileloc = x['location']
if os.path.exists(fileloc) and os.path.abspath(fileloc) == os.path.abspath(filename):
fileverified=True
if x.has_key('checksum'):
# This value is stored encrypted in the cache file (and
# hence in the dictionary). To compare it, we decrypt it
# and compare the hex digests of the stored checksum and
# of the freshly downloaded data.
cachemd5 = bin_decrypt(x['checksum'])
if binascii.hexlify(cachemd5) == binascii.hexlify(digest1) and fileverified:
uptodate=True
break
else:
break
# If the cache is not up to date, update the keys of this cache entry
if not uptodate:
cachekey['checksum'] = bin_crypt(digest1)
cachekey['location'] = filename
cachekey['content-length'] = contentlen
cachekey['updated'] = True
# If key was not found, we need to create a new
# entry and append it to the list, otherwise the
# cache dictionary is modified in place.
if not keyfound:
cachekey['url'] = url
self._projectcache.append(cachekey)
# If both checksums are equal, return True
# else return False.
return uptodate
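# A minimal sketch of the checksum test used above; 'cache_entry' and
# 'urldata' are hypothetical stand-ins for a cached dictionary and the
# freshly fetched page data. A page is up to date when the hex form of
# the decrypted cached digest matches the hex digest of the new data
# and the saved file is still on disk:
#
#   import md5, binascii
#   new_digest = md5.new(urldata).digest()
#   cached_digest = bin_decrypt(cache_entry['checksum'])
#   uptodate = (binascii.hexlify(cached_digest) ==
#               binascii.hexlify(new_digest)) and fileverified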
def is_url_uptodate(self, url, filename, lmt):
""" New function to check whether the url cache is out
of date by comparing last modified time """
# If page caching is not enabled, return False
# straightaway!
if not GetObject('config').pagecache:
return False
# Assume a priori that the cache is not up to date
uptodate=False
keyfound = False
fileverified=False
# Reference to dictionary in the cache list
cachekey = {}
for x in self._projectcache:
if x.has_key('url'):
if x['url'] == url:
cachekey = x
keyfound = True
x['updated']=False
fileloc = x['location']
if os.path.exists(fileloc) and os.path.abspath(fileloc) == os.path.abspath(filename):
fileverified=True
if x.has_key('last-modified'):
# Get current modified time
cmt = x['last-modified']
# If the latest page has a modified time greater than the
# cached one, the cached copy is out of date; otherwise it
# is up to date.
if lmt<=cmt:
uptodate=True
else:
break
break
# If the cache is not up to date, update the keys of this cache entry
if not uptodate:
cachekey['location'] = filename
cachekey['last-modified'] = lmt
cachekey['updated'] = True
# If key was not found, we need to create a new
# entry and append it to the list, otherwise the
# cache dictionary is modified in place.
if not keyfound:
cachekey['url'] = url
self._projectcache.append(cachekey)
# Return True if the cached copy is up to date,
# else return False.
return uptodate
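# A minimal sketch of the last-modified test used above, with
# hypothetical timestamps. 'lmt' is the last-modified time reported
# for the live page and 'cmt' the one recorded in the cache entry;
# the cached copy is treated as current only when the live page is
# not newer:
#
#   cmt = cache_entry['last-modified']   # e.g. 1086912000
#   uptodate = (lmt <= cmt)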
def conditional_cache_set(self):
""" A utility function to conditionally enable/disable
the cache mechanism """
# If the cache file exists for this project, disable
# cache, else enable it.
cfg = GetObject('config')
cachefilename = self.get_proj_cache_filename()
if os.path.exists(cachefilename) and os.path.getsize(cachefilename):
cfg.pagecache = False
else:
cfg.pagecache = True
def does_cache_need_update(self):
""" Find out if project cache needs update """
# If any of the dictionary entries has the key
# value for 'updated' set to True, the cache needs
# update, else not.
needsupdate=False
for x in self._projectcache:
if x.has_key('updated'):
if x['updated']:
needsupdate=x['updated']
break
return needsupdate
def fill_project_cache(self, name, value, fileloc):
""" This function fills the project cache from an
existing project cache file """
# if the tagname is 'location', add this entry to the list
if name=='location':
cachedict={}
cachedict[name]=value
self._projectcache.append(cachedict)
return 0
# Otherwise, lookup the location key in the list of dictionaries
# and add this new name/value pair in the dictionary
else:
ok=False
for x in self._projectcache:
if x.has_key('location'):
# We don't want to overwrite the key's value
if not x.has_key(name):
if x['location'] == fileloc:
# Add the new key/value pair
# print name, value
ok=True
x[name]= value
break
if ok:
return 0
else:
return -1
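# A minimal sketch of how fill_project_cache builds up the cache list,
# with hypothetical values. A 'location' tag starts a new entry; every
# other tag is attached to the entry whose 'location' matches fileloc:
#
#   fill_project_cache('location', '/proj/files/index.html',
#                      '/proj/files/index.html')
#   fill_project_cache('url', 'http://www.foo.com/index.html',
#                      '/proj/files/index.html')
#   # self._projectcache ->
#   #   [{'location': '/proj/files/index.html',
#   #     'url': 'http://www.foo.com/index.html'}]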
def post_download_setup(self):
""" Actions to perform after project is complete """
configobj = GetObject('config')
if configobj.retryfailed:
self._numfailed = len(self._downloaddict['_failedurls'])
moreinfo(' ')
# try downloading again
# mod: made this multithreaded
moreinfo('Redownloading failed links...')
if self._numfailed:
self._redownload=True
for urlobj in self._downloaddict['_failedurls']:
self.download_url( urlobj )
# bugfix: Moved the time calculation code here.
if sys.platform == 'win32' or os.name=='nt':
t2=time.clock()
else:
t2=time.time()
configobj.endtime = t2
# Write cache file
if configobj.pagecache and self.does_cache_need_update():
cachewriter = utils.HarvestManCacheManager( self.get_proj_cache_filename() )
cachewriter.write_project_cache(self._projectcache)
# Localise links in the downloaded files; don't do this if
# just-in-time (jit) localisation is enabled.
if configobj.localise and not configobj.jitlocalise:
self.localise_links()
# Get handle to rules checker object
ruleschecker = GetObject('ruleschecker')
# dump downloaded urls to a text file
if configobj.urlslistfile:
# Get urls list file
ruleschecker.dump_urls( configobj.urlslistfile )
# print stats of the project
nlinks, nservers, ndirs = ruleschecker.get_stats()
nfailed = self._numfailed
numstillfailed = len(self._downloaddict['_failedurls'])
numfiles = len(self._downloaddict['_savedfiles'])
numfilesincache = len(self._downloaddict['_cacheinfo'])
numretried = self._numfailed - numstillfailed
fetchtime = float((math.modf((configobj.endtime-configobj.starttime)*100.0)[1])/100.0)
statsd = { 'links' : nlinks,
'extservers' : nservers,
'extdirs' : ndirs,
'failed' : nfailed,
'fatal' : numstillfailed,
'files' : numfiles,
'filesincache' : numfilesincache,
'retries' : numretried,
'fetchtime' : fetchtime,
}
self.print_project_info(statsd)
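# A worked example of the fetchtime expression above, with a
# hypothetical elapsed time: math.modf(x) returns (fractional part,
# integer part), so the formula truncates the elapsed seconds to two
# decimal places.
#
#   elapsed = 52.6789                       # endtime - starttime
#   math.modf(elapsed * 100.0)[1] / 100.0   # -> 52.67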
def update_bytes(self, count):
""" Update the global byte count """
try:
self._dataLock.acquire()
self._bytes += count
finally:
self._dataLock.release()
def update_dead_links(self, url):
""" Add this link to the 404 (dead links) database """
try:
self._dataLock.acquire()
try:
self._downloaddict['_invalidurls'].index(url)
except:
self._downloaddict['_invalidurls'].append(url)
finally:
self._dataLock.release()
def update_valid_links(self, url):
""" Add this link to the working links database """
try:
self._dataLock.acquire()
try:
self._downloaddict['_validurls'].index(url)
except:
self._downloaddict['_validurls'].append(url)
finally:
self._dataLock.release()
def update_connector_cache(self, conn, urlobj):
""" Add this connector to the connector cache """
try:
self._dataLock.acquire()
self._connectorcache[urlobj] = conn
finally:
self._dataLock.release()
def is_a_dead_link(self, url):
""" Check whether the passed url is a dead (404) link """
dead = False
try:
self._downloaddict['_invalidurls'].index( url )
dead = True
except:
pass
return dead
def update_failed_files(self, urlObject):
""" Add the passed information to the failed files list """
if self._redownload: return -1
# From version 1.1, we try to pass around instances of
# urlPathParser objects instead of urls or filenames or
# their tuples. Many functions have changed, this is one
# of them.
# It makes sense to add the complete urlPathParser object,
# since we can get all information from it later on.
try:
self._dataLock.acquire()
try:
self._downloaddict['_failedurls'].index(urlObject)
except:
self._downloaddict['_failedurls'].append(urlObject)
finally:
self._dataLock.release()
def update_file_stats(self, urlObject, status):
""" Add the passed information to the saved file list """
try:
self._dataLock.acquire()