# datamgr.py -- data manager module of HarvestMan (excerpt)
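# Note: the methods below belong to HarvestMan's data manager class (it is
# registered and looked up via GetObject('datamanager')). The class statement,
# the module imports (os, time, math) and the shared helpers GetObject, info,
# moreinfo, extrainfo, debug, plural and wasOrWere are assumed to be defined
# above this excerpt.
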
def update_file_stats(self, urlObject, status):
    """ Update the saved files/cache information for this url object """
    try:
        # The acquire here matches the release in the 'finally' clause below
        self._dataLock.acquire()
        configobj = GetObject('config')
        if urlObject is None: return -1
        # Bug: we should be getting this url as rooturl and not
        # the base url of this url.
        filename, rooturl = urlObject.get_full_filename(), urlObject.get_full_url()
        ok=False
        # Status == 1 or 2 means look up in "_savedfiles"
        # Status == 3 means look up in "_cacheinfo"
        lookuplist=[]
        if status == 1 or status == 2:
            lookuplist = self._downloaddict['_savedfiles']
        elif status == 3:
            lookuplist = self._downloaddict['_cacheinfo']
        else:
            return -1
        for x in lookuplist:
            # Already added (compare the whole filename, not just its first character)
            if x == filename:
                ok=True
                break
        if not ok:
            lookuplist.append( filename )
    finally:
        self._dataLock.release()

    lsaved = len(self._downloaddict['_savedfiles'])
    lmax = configobj.maxfiles
    if lsaved >= lmax:
        moreinfo('Specified file limit of', lmax, 'reached!')
        # Get tracker queue object
        tq = GetObject('trackerqueue')
        tq.killTrackers()
    # See if some tracker still managed to download files
    # while we were killing; if so, delete the extra files.
    lsaved = len(self._downloaddict['_savedfiles'])
    if lsaved > lmax:
        diff = lsaved - lmax
        savedcopy = (self._downloaddict['_savedfiles'])[0:]
        for x in range(0, diff):
            # 2 bugs: fixed a bug where the deletion
            # was not controlled
            lastfile = savedcopy[lsaved - x - 1]
            # Sometimes files may not be there; attempt
            # to delete only if the file is there (makes sense)
            if os.path.exists(lastfile):
                try:
                    extrainfo('Deleting file ', lastfile)
                    os.remove(lastfile)
                    (self._downloaddict['_deletedfiles']).append(lastfile)
                    # Bugfix: if a file is deleted, remove it from the saved list too.
                    self._downloaddict['_savedfiles'].remove(lastfile)
                except (OSError, IndexError, ValueError), e:
                    print e

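# Note on the download dictionary used above: '_savedfiles', '_cacheinfo' and
# '_deletedfiles' are treated here as lists of local filenames. That layout is
# inferred from the code in this excerpt, not from separate documentation.
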
def update_links(self, filename, urlobj):
    """ Update the links dictionary for this html file """
    try:
        self._dataLock2.acquire()
        if self._linksdict.has_key(filename):
            links = self._linksdict[filename]
            # Add the url object only if it is not already present
            if urlobj not in links:
                links.append(urlobj)
        else:
            self._linksdict[filename] = [urlobj]
    finally:
        self._dataLock2.release()

def thread_download(self, urlObj):
    """ Download this url object in a separate thread """
    # Add this task to the url thread pool
    self._urlThreadPool.push( urlObj )

def has_download_threads(self):
    """ Return true if there are any download sub-threads
    running, else return false """
    num_threads = self._urlThreadPool.has_busy_threads()
    if num_threads:
        return True
    return False

def last_download_thread_report_time(self):
    """ Get the time stamp of the last completed
    download (sub) thread """
    return self._urlThreadPool.last_thread_report_time()

def kill_download_threads(self):
    """ Terminate all the download threads """
    self._urlThreadPool.end_all_threads()

def create_local_directory(self, urlObj):
    """ Create the directories on the disk for downloading
    this url object """
    directory = urlObj.get_local_directory()
    try:
        if not os.path.isdir( directory ):
            if not os.path.exists( directory ):
                os.makedirs( directory )
                extrainfo("Created => ", directory)
                return 0
    except OSError, e:
        moreinfo("OS Exception", e)
        return -1

    return 0

def download_url(self, urlobj):
    try:
        data=""
        configobj = GetObject('config')
        if not configobj.usethreads or urlobj.is_webpage():
            # Connector object
            # New feature: use cached connections.
            # Check the connector cache dictionary for a previous
            # connection for this url.
            new_connector = False
            server = urlobj.get_domain()
            try:
                conn = self._connectorcache[urlobj]
            except KeyError:
                conn_factory = GetObject('connectorfactory')
                # This call will block if we exceed the number of connections
                # moreinfo("Creating connector for url ", urlobj.get_full_url())
                conn = conn_factory.create_connector( server )
                new_connector = True

            res = conn.save_url( urlobj, new_connector )
            # Remove the connector from the factory
            if new_connector:
                conn_factory.remove_connector(conn, server)

            # Return values for res
            # 0 => error, file not downloaded
            # 1 => file downloaded ok
            # 2 => file downloaded with filename modification
            # 3 => file was not downloaded because cache was uptodate
            filename = urlobj.get_full_filename()
            if res:
                if res==2:
                    # There was a filename modification, so get the new filename
                    filename = GetObject('modfilename')
                else:
                    filename = urlobj.get_full_filename()
                if res==1:
                    moreinfo("Saved to ", filename)
                self.update_file_stats( urlobj, res )
                # Get the data fetched and return it
                if urlobj.is_webpage():
                    data=conn.get_data()
            else:
                fetchurl = urlobj.get_full_url()
                extrainfo("Failed to download url", fetchurl)
                # We dont re-fetch fatal errors
                err = conn.get_error()
                # print 'Error number => ', err['number']
                if not err['fatal']:
                    self.update_failed_files(urlobj)
        else:
            self.thread_download( urlobj )

        return data
    finally:
        pass

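# Illustrative usage (a sketch only; the crawler/tracker classes live in other
# HarvestMan modules and the exact call sequence may differ):
#
#   dmgr = GetObject('datamanager')
#   dmgr.create_local_directory(urlobj)
#   data = dmgr.download_url(urlobj)
#
# where 'urlobj' is a HarvestMan url object exposing get_full_url(),
# get_full_filename(), get_domain() and is_webpage() as used above.
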
def is_file_downloaded(self, filename):
    """ Find if the <filename> is present in the
    saved files list """
    yes=0
    abspath1=os.path.abspath(filename)
    # Use a separate loop variable so the 'filename' argument is not shadowed
    for savedfile in self._downloaddict['_savedfiles']:
        abspath2=os.path.abspath(savedfile)
        if abspath1==abspath2:
            yes=1
            break
    return yes

def localise_links(self):
    """ Localise all links (urls) of the downloaded html pages """
    info('Localising links of downloaded html files ...')
    dmgr = GetObject('datamanager')
    for filename in dmgr.get_links_dictionary().keys():
        debug("Localising links for ", filename)
        self.localise_file_links(filename)

def localise_file_links(self, filename):
    """ Localise links for this file """
    # Open the file
    try:
        f = open(filename, 'r')
    except IOError, e:
        debug('IOException: ', str(e))
        return -1

    configobj = GetObject('config')
    data = ''
    while 1:
        try:
            l = f.readline()
            if l == '': break
            # Bugfix: skip the BASE HREF line
            if l.lower().find("<base href") != -1: continue
            data += l
        except (IOError, EOFError), e:
            print e
            return -1
    # Close the file
    f.close()

    dmgr = GetObject('datamanager')
    links = (dmgr.get_links_dictionary())[filename]

    for u in links:
        url_object = u
        # Bug fix: dont localise cgi links
        if url_object.is_cgi() or not url_object.is_filename_url(): continue
        typ = url_object.get_type()
        v = url_object.get_url()
        configobj = GetObject('config')
        # If the link is relative, and we dont need
        # absolute localisation, we dont need to do
        # anything.
        if configobj.localise == 2:
            if url_object.is_relative_path() and not url_object.is_relative_to_server():
                continue
        fullfilename = os.path.abspath( url_object.get_full_filename() )
        urlfilename = ''
        # Modification: localisation w.r.t relative pathnames
        if configobj.localise == 2:
            urlfilename = url_object.get_relative_filename()
        elif configobj.localise == 1:
            urlfilename = fullfilename
        # Modification: get any filename mappings from the
        # HarvestManGlobals module (probably a dynamically generated file
        # which was renamed)
        try:
            oldnewmappings = GetObject('oldnewmappings')
            newfilename = oldnewmappings[fullfilename]
            if configobj.localise == 2:
                urlfilename = (os.path.split(newfilename))[1]
            elif configobj.localise == 1:
                urlfilename = os.path.abspath(newfilename)
        except KeyError:
            # No mapping for this file; keep the filename computed above
            pass
        # Replace '\\' with '/'
        urlfilename = urlfilename.replace('\\', '/')
        newurl = ''
        # Bug: if we cannot get the filenames, replace
        # relative url paths with full url paths so that
        # the user can connect to them.
        if not os.path.exists(fullfilename):
            # moreinfo("Path does not exist !", fullfilename)
            # For relative links, replace them with the
            # full url path
            fullurlpath = url_object.get_full_url_sans_port()
            newurl = "href=\"" + fullurlpath + "\""
        else:
            # Replace url with urlfilename
            # Bug: fix for anchor links
            if typ == 'anchor':
                urlfilename += url_object.get_anchor()
            if configobj.localise == 1:
                newurl = "href=\"" + "file://" + urlfilename + "\""
            else:
                newurl = "href=\"" + urlfilename + "\""
        # Replace both lowercase and uppercase forms of the link in the data
        oldurl1 = "href=\"" + v + "\""
        oldurl2 = "HREF=\"" + v + "\""
        data = data.replace(oldurl1, newurl)
        data = data.replace(oldurl2, newurl)

    try:
        fw = open(filename, 'w')
    except IOError, e:
        debug('IOException: ', str(e))
        return -1
    fw.write(data)
    fw.close()

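# Illustrative effect of the rewriting above (hypothetical paths):
# a page containing href="http://www.foo.com/img/pic.gif" becomes, when the
# file exists on disk,
#   href="file:///home/user/project/www.foo.com/img/pic.gif"  with localise == 1
#   href="../img/pic.gif"                                      with localise == 2
# and falls back to the full remote url when the local file is missing.
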
def print_project_info(self, statsd):
    """ Print project information """
    nlinks = statsd['links']
    nservers = statsd['extservers'] + 1
    nfiles = statsd['files']
    ndirs = statsd['extdirs'] + 1
    numfailed = statsd['failed']
    nretried = statsd['retries']
    fatal = statsd['fatal']
    fetchtime = statsd['fetchtime']
    nfilesincache = statsd['filesincache']
    # Bug fix, download time to be calculated
    # precisely...
    cfg = GetObject('config')
    dnldtime = fetchtime

    strings = [('link', nlinks), ('server', nservers), ('file', nfiles), ('file', nfilesincache),
               ('directory', ndirs), ('link', numfailed), ('link', fatal),
               ('link', nretried)]
    fns = map(plural, strings)
    info(' ')

    if fetchtime and nfiles:
        # Force float division so whole-number times do not truncate the rate
        fps = float(nfiles)/dnldtime
        fps = float((math.modf(fps*100.0))[1]/100.0)
    else:
        fps = 0.0

    bytes = self._bytes
    ratespec = 'KB/sec'
    if bytes and dnldtime:
        bps = (float(bytes)/dnldtime)/100.0
        bps = float((math.modf(bps*100.0))[1]/1000.0)
        if bps < 1.0:
            bps *= 1000.0
            ratespec = 'bytes/sec'
    else:
        bps = 0.0

    configobj = GetObject('config')
    info('HarvestMan mirror', configobj.project, 'completed in', fetchtime, 'seconds.')
    if nlinks: info(nlinks, fns[0], 'scanned in', nservers, fns[1], ',', nfiles, fns[2], 'written.')
    else: info('No links scanned, no file written.\n')
    if nfilesincache:
        info(nfilesincache, fns[3], wasOrWere(nfilesincache), 'already uptodate in the cache for this project and', wasOrWere(nfilesincache), 'not updated.')
    if fatal: info(fatal, fns[6], 'had fatal errors and failed to download.')
    if bytes: info(bytes, 'bytes received at the rate of', bps, ratespec, '.\n')

    nlocked = GetObject('trackerqueue').get_locked_instances()
    debug('(Thread locking situation was avoided ', nlocked, 'times.)')

    # Get current time stamp
    s = time.localtime()
    tz = (time.tzname)[0]
    format = '%b %d %Y ' + tz + ' %H:%M:%S'
    tstamp = time.strftime(format, s)

    # Write stats to a stats file
    statsfile = configobj.project + '.hst'
    statsfile = os.path.abspath(os.path.join(configobj.projdir, statsfile))
    print 'Writing stats file ', statsfile, '...'
    # Append to the file's contents
    sf = open(statsfile, 'a')
    # Write url, file count, links count, time taken,
    # files per second, failed file count & time stamp
    infostr = 'url:' + configobj.url + ','
    infostr += 'files:' + str(nfiles) + ','
    infostr += 'links:' + str(nlinks) + ','
    infostr += 'dirs:' + str(ndirs) + ','
    infostr += 'failed:' + str(numfailed) + ','
    infostr += 'refetched:' + str(nretried) + ','
    infostr += 'fatal:' + str(fatal) + ','
    infostr += 'elapsed:' + str(fetchtime) + ','
    infostr += 'fps:' + str(fps) + ','
    infostr += 'bps:' + str(bps) + ','
    infostr += 'timestamp:' + tstamp
    infostr += '\n'
    sf.write(infostr)
    sf.close()
    print 'Done.'
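
# The record appended to the '<project>.hst' file above has this shape
# (illustrative values only):
# url:http://www.example.com,files:120,links:450,dirs:12,failed:3,refetched:5,fatal:1,elapsed:318.6,fps:0.37,bps:14.2,timestamp:Jan 01 2005 IST 10:30:00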