📄 datamgr.py
字号:
oldurlre2 = "".join(('href','=','\\"?',v,'\\"?')) oldurlre = re.compile("".join(('(',oldurlre1,'|',oldurlre2,')'))) except Exception, e: debug("Error:",str(e)) continue http_strs=('href','src') for item in http_strs: try: oldurl = "".join((item, "=\"", v, "\"")) if oldurl != newurl: data = re.sub(oldurlre, newurl, data,1) except: pass try: fw.write(data) fw.close() except IOError, e: logconsole(e) return HARVESTMAN_FAIL return HARVESTMAN_OK def print_project_info(self, statsd): """ Print project information """ nlinks = statsd['links'] nservers = statsd['extservers'] + 1 nfiles = statsd['files'] ndirs = statsd['extdirs'] + 1 numfailed = statsd['failed'] nretried = statsd['retries'] fatal = statsd['fatal'] fetchtime = statsd['fetchtime'] nfilesincache = statsd['filesincache'] nfilesinrepos = statsd['filesinrepos'] nbroken = statsd['broken'] # Bug fix, download time to be calculated # precisely... dnldtime = fetchtime strings = [('link', nlinks), ('server', nservers), ('file', nfiles), ('file', nfilesinrepos), ('directory', ndirs), ('link', numfailed), ('link', fatal), ('link', nretried), ('file', nfilesincache), ('link', nbroken) ] fns = map(plural, strings) info(' ') bytes = self.bytes savedbytes = self.savedbytes ratespec='KB/sec' if bytes and dnldtime: bps = float(bytes/dnldtime)/1024.0 if bps<1.0: bps *= 1000.0 ratespec='bytes/sec' bps = '%.2f' % bps else: bps = '0.0' fetchtime = float((math.modf(fetchtime*100.0)[1])/100.0) if self._cfg.simulate: info("HarvestMan crawl simulation of",self._cfg.project,"completed in",fetchtime,"seconds.") else: info('HarvestMan mirror',self._cfg.project,'completed in',fetchtime,'seconds.') if nlinks: info(nlinks,fns[0],'scanned in',nservers,fns[1],'.') else: info('No links parsed.') if nfiles: info(nfiles,fns[2],'written.') else:info('No file written.') if nfilesinrepos: info(nfilesinrepos,fns[3],wasOrWere(nfilesinrepos),'already uptodate in the repository for this project and',wasOrWere(nfilesinrepos),'not updated.') if nfilesincache: info(nfilesincache,fns[8],wasOrWere(nfilesincache),'updated from the project cache.') if nbroken: info(nbroken,fns[9],wasOrWere(nbroken),'were broken.') if fatal: info(fatal,fns[5],'had fatal errors and failed to download.') if bytes: info(bytes,' bytes received at the rate of',bps,ratespec,'.') if savedbytes: info(savedbytes,' bytes were written to disk.\n') info('*** Log Completed ***\n') # get current time stamp s=time.localtime() tz=(time.tzname)[0] format='%b %d %Y '+tz+' %H:%M:%S' tstamp=time.strftime(format, s) if not self._cfg.simulate: # Write statistics to the crawl database HarvestManDbManager.add_stats_record(statsd) logconsole('Done.') # No longer writing a stats file... # Write stats to a stats file #statsfile = self._cfg.project + '.hst' #statsfile = os.path.abspath(os.path.join(self._cfg.projdir, statsfile)) #logconsole('Writing stats file ', statsfile , '...') # Append to files contents #sf=open(statsfile, 'a') # Write url, file count, links count, time taken, # files per second, failed file count & time stamp #infostr='url:'+self._cfg.url+',' #infostr +='files:'+str(nfiles)+',' #infostr +='links:'+str(nlinks)+',' #infostr +='dirs:'+str(ndirs)+',' #infostr +='failed:'+str(numfailed)+',' #infostr +='refetched:'+str(nretried)+',' #infostr +='fatal:'+str(fatal)+',' #infostr +='elapsed:'+str(fetchtime)+',' #infostr +='fps:'+str(fps)+',' #infostr +='kbps:'+str(bps)+',' #infostr +='timestamp:'+tstamp #infostr +='\n' #sf.write(infostr) #sf.close() def dump_urltree(self): """ Dump url tree to a file """ # This creats an html file with # each url and its children below # it. Each url is a hyperlink to # itself on the net if the file # is an html file. # urltreefile is <projdir>/urls.html urlfile = os.path.join(self._cfg.projdir, 'urltree.html') try: if os.path.exists(urlfile): os.remove(urlfile) except OSError, e: logconsole(e) info('Dumping url tree to file', urlfile) fextn = ((os.path.splitext(urlfile))[1]).lower() try: f=open(urlfile, 'w') if fextn in ('', '.txt'): self.dump_urltree_textmode(f) elif fextn in ('.htm', '.html'): self.dump_urltree_htmlmode(f) f.close() except Exception, e: logconsole(e) return DUMP_URL_ERROR debug("Done.") return DUMP_URL_OK def dump_urltree_textmode(self, stream): """ Dump urls in text mode """ for node in self.collections.preorder(): coll = node.get() idx = 0 links = [self.get_url(index) for index in coll.getAllURLs()] children = [] for link in links: if not link: continue # Get base link, only for first # child url, since base url will # be same for all child urls. if idx==0: children = [] base_url = link.get_parent_url().get_full_url() stream.write(base_url + '\n') childurl = link.get_full_url() if childurl and childurl not in children: stream.write("".join(('\t',childurl,'\n'))) children.append(childurl) idx += 1 def dump_urltree_htmlmode(self, stream): """ Dump urls in html mode """ # Write html header stream.write('<html>\n') stream.write('<head><title>') stream.write('Url tree generated by HarvestMan - Project %s' % self._cfg.project) stream.write('</title></head>\n') stream.write('<body>\n') stream.write('<p>\n') stream.write('<ol>\n') for node in self.collections.preorder(): coll = node.get() idx = 0 links = [self.get_url(index) for index in coll.getAllURLs()] children = [] for link in links: if not link: continue # Get base link, only for first # child url, since base url will # be same for all child urls. if idx==0: children = [] base_url = link.get_parent_url().get_full_url() stream.write('<li>') stream.write("".join(("<a href=\"",base_url,"\"/>",base_url,"</a>"))) stream.write('</li>\n') stream.write('<p>\n') stream.write('<ul>\n') childurl = link.get_full_url() if childurl and childurl not in children: stream.write('<li>') stream.write("".join(("<a href=\"",childurl,"\"/>",childurl,"</a>"))) stream.write('</li>\n') children.append(childurl) idx += 1 # Close the child list stream.write('</ul>\n') stream.write('</p>\n') # Close top level list stream.write('</ol>\n') stream.write('</p>\n') stream.write('</body>\n') stream.write('</html>\n') def get_url_threadpool(self): """ Return the URL thread-pool object """ return self._urlThreadPoolclass HarvestManController(threading.Thread): """ A controller class for managing exceptional conditions such as file limits. Right now this is written with the sole aim of managing file & time limits, but could get extended in future releases. """ def __init__(self): self._dmgr = objects.datamgr self._tq = objects.queuemgr self._cfg = objects.config self._exitflag = False self._starttime = 0 threading.Thread.__init__(self, None, None, 'HarvestMan Control Class') def run(self): """ Run in a loop looking for exceptional conditions """ while not self._exitflag: # Wake up every half second and look # for exceptional conditions time.sleep(1.0) if self._cfg.timelimit != -1: if self._manage_time_limits()==CONTROLLER_EXIT: break if self._cfg.maxfiles: if self._manage_file_limits()==CONTROLLER_EXIT: break if self._cfg.maxbytes: if self._manage_maxbytes_limits()==CONTROLLER_EXIT: break def stop(self): """ Stop this thread """ self._exitflag = True def terminator(self): """ The function which terminates the program in case of an exceptional condition """ # This somehow got deleted in HarvestMan 1.4.5 self._tq.endloop(True) def _manage_time_limits(self): """ Manage limits on time for the project """ t2=time.time() timediff = float((math.modf((t2-self._cfg.starttime)*100.0)[1])/100.0) timemax = self._cfg.timelimit if timediff >= timemax -1: info('Specified time limit of',timemax ,'seconds reached!') self.terminator() return CONTROLLER_EXIT return HARVESTMAN_OK def _manage_file_limits(self): """ Manage limits on maximum file count """ lsaved = self._dmgr.savedfiles lmax = self._cfg.maxfiles if lsaved >= lmax: info('Specified file limit of',lmax ,'reached!') self.terminator() return CONTROLLER_EXIT return HARVESTMAN_OK def _manage_maxbytes_limits(self): """ Manage limits on maximum bytes a crawler should download in total per job. """ lsaved = self._dmgr.savedbytes lmax = self._cfg.maxbytes # Let us check for a closer hit of 90%... if (lsaved >=0.90*lmax): info('Specified maxbytes limit of',lmax ,'reached!') self.terminator() return CONTROLLER_EXIT return HARVESTMAN_OK
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -