📄 datamgr.py

📁 Harvestman-最新版本
💻 PY
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
                    oldurlre2 = "".join(('href','=','\\"?',v,'\\"?'))                    oldurlre = re.compile("".join(('(',oldurlre1,'|',oldurlre2,')')))                except Exception, e:                    debug("Error:",str(e))                    continue                                http_strs=('href','src')                            for item in http_strs:                    try:                        oldurl = "".join((item, "=\"", v, "\""))                        if oldurl != newurl:                            data = re.sub(oldurlre, newurl, data,1)                    except:                        pass        try:            fw.write(data)            fw.close()        except IOError, e:            logconsole(e)            return HARVESTMAN_FAIL        return HARVESTMAN_OK    def print_project_info(self, statsd):        """ Print project information """        nlinks = statsd['links']        nservers = statsd['extservers'] + 1        nfiles = statsd['files']        ndirs = statsd['extdirs'] + 1        numfailed = statsd['failed']        nretried = statsd['retries']        fatal = statsd['fatal']        fetchtime = statsd['fetchtime']        nfilesincache = statsd['filesincache']        nfilesinrepos = statsd['filesinrepos']        nbroken = statsd['broken']                # Bug fix, download time to be calculated        # precisely...        dnldtime = fetchtime        strings = [('link', nlinks), ('server', nservers),                   ('file', nfiles), ('file', nfilesinrepos),                   ('directory', ndirs), ('link', numfailed), ('link', fatal),                   ('link', nretried), ('file', nfilesincache), ('link', nbroken) ]        fns = map(plural, strings)        info(' ')        bytes = self.bytes        savedbytes = self.savedbytes                ratespec='KB/sec'        if bytes and dnldtime:            bps = float(bytes/dnldtime)/1024.0            if bps<1.0:                bps *= 1000.0                ratespec='bytes/sec'            bps = '%.2f' % bps        else:            bps = '0.0'        fetchtime = float((math.modf(fetchtime*100.0)[1])/100.0)        if self._cfg.simulate:            info("HarvestMan crawl simulation of",self._cfg.project,"completed in",fetchtime,"seconds.")        else:            info('HarvestMan mirror',self._cfg.project,'completed in',fetchtime,'seconds.')                    if nlinks: info(nlinks,fns[0],'scanned in',nservers,fns[1],'.')        else: info('No links parsed.')        if nfiles: info(nfiles,fns[2],'written.')        else:info('No file written.')                if nfilesinrepos:            info(nfilesinrepos,fns[3],wasOrWere(nfilesinrepos),'already uptodate in the repository for this project and',wasOrWere(nfilesinrepos),'not updated.')        if nfilesincache:            info(nfilesincache,fns[8],wasOrWere(nfilesincache),'updated from the project cache.')        if nbroken: info(nbroken,fns[9],wasOrWere(nbroken),'were broken.')        if fatal: info(fatal,fns[5],'had fatal errors and failed to download.')        if bytes: info(bytes,' bytes received at the rate of',bps,ratespec,'.')        if savedbytes: info(savedbytes,' bytes were written to disk.\n')                info('*** Log Completed ***\n')                # get current time stamp        s=time.localtime()        tz=(time.tzname)[0]        format='%b %d %Y '+tz+' %H:%M:%S'        tstamp=time.strftime(format, s)        if not self._cfg.simulate:            # Write statistics to the crawl database            HarvestManDbManager.add_stats_record(statsd)            logconsole('Done.')            # No longer writing a stats file...            # Write stats to a stats file            #statsfile = self._cfg.project + '.hst'            #statsfile = os.path.abspath(os.path.join(self._cfg.projdir, statsfile))            #logconsole('Writing stats file ', statsfile , '...')            # Append to files contents            #sf=open(statsfile, 'a')            # Write url, file count, links count, time taken,            # files per second, failed file count & time stamp            #infostr='url:'+self._cfg.url+','            #infostr +='files:'+str(nfiles)+','            #infostr +='links:'+str(nlinks)+','            #infostr +='dirs:'+str(ndirs)+','            #infostr +='failed:'+str(numfailed)+','            #infostr +='refetched:'+str(nretried)+','            #infostr +='fatal:'+str(fatal)+','            #infostr +='elapsed:'+str(fetchtime)+','            #infostr +='fps:'+str(fps)+','            #infostr +='kbps:'+str(bps)+','            #infostr +='timestamp:'+tstamp            #infostr +='\n'                        #sf.write(infostr)            #sf.close()    def dump_urltree(self):        """ Dump url tree to a file """        # This creats an html file with        # each url and its children below        # it. Each url is a hyperlink to        # itself on the net if the file        # is an html file.        # urltreefile is <projdir>/urls.html        urlfile = os.path.join(self._cfg.projdir, 'urltree.html')                try:            if os.path.exists(urlfile):                os.remove(urlfile)        except OSError, e:            logconsole(e)        info('Dumping url tree to file', urlfile)        fextn = ((os.path.splitext(urlfile))[1]).lower()                        try:            f=open(urlfile, 'w')            if fextn in ('', '.txt'):                self.dump_urltree_textmode(f)            elif fextn in ('.htm', '.html'):                self.dump_urltree_htmlmode(f)            f.close()        except Exception, e:            logconsole(e)            return DUMP_URL_ERROR        debug("Done.")        return DUMP_URL_OK    def dump_urltree_textmode(self, stream):        """ Dump urls in text mode """        for node in self.collections.preorder():            coll = node.get()            idx = 0            links = [self.get_url(index) for index in coll.getAllURLs()]            children = []                        for link in links:                if not link: continue                # Get base link, only for first                # child url, since base url will                # be same for all child urls.                if idx==0:                    children = []                    base_url = link.get_parent_url().get_full_url()                    stream.write(base_url + '\n')                childurl = link.get_full_url()                if childurl and childurl not in children:                    stream.write("".join(('\t',childurl,'\n')))                    children.append(childurl)                idx += 1    def dump_urltree_htmlmode(self, stream):        """ Dump urls in html mode """        # Write html header        stream.write('<html>\n')        stream.write('<head><title>')        stream.write('Url tree generated by HarvestMan - Project %s'                     % self._cfg.project)        stream.write('</title></head>\n')        stream.write('<body>\n')        stream.write('<p>\n')        stream.write('<ol>\n')                for node in self.collections.preorder():            coll = node.get()                        idx = 0            links = [self.get_url(index) for index in coll.getAllURLs()]                        children = []            for link in links:                if not link: continue                # Get base link, only for first                # child url, since base url will                # be same for all child urls.                if idx==0:                    children = []                                       base_url = link.get_parent_url().get_full_url()                    stream.write('<li>')                                        stream.write("".join(("<a href=\"",base_url,"\"/>",base_url,"</a>")))                    stream.write('</li>\n')                    stream.write('<p>\n')                    stream.write('<ul>\n')                                                 childurl = link.get_full_url()                if childurl and childurl not in children:                    stream.write('<li>')                    stream.write("".join(("<a href=\"",childurl,"\"/>",childurl,"</a>")))                    stream.write('</li>\n')                                        children.append(childurl)                                    idx += 1                            # Close the child list            stream.write('</ul>\n')            stream.write('</p>\n')                    # Close top level list        stream.write('</ol>\n')                stream.write('</p>\n')        stream.write('</body>\n')        stream.write('</html>\n')    def get_url_threadpool(self):        """ Return the URL thread-pool object """        return self._urlThreadPoolclass HarvestManController(threading.Thread):    """ A controller class for managing exceptional    conditions such as file limits. Right now this    is written with the sole aim of managing file    & time limits, but could get extended in future    releases. """    def __init__(self):        self._dmgr = objects.datamgr        self._tq =  objects.queuemgr        self._cfg = objects.config        self._exitflag = False        self._starttime = 0        threading.Thread.__init__(self, None, None, 'HarvestMan Control Class')    def run(self):        """ Run in a loop looking for        exceptional conditions """        while not self._exitflag:            # Wake up every half second and look            # for exceptional conditions            time.sleep(1.0)            if self._cfg.timelimit != -1:                if self._manage_time_limits()==CONTROLLER_EXIT:                    break            if self._cfg.maxfiles:                if self._manage_file_limits()==CONTROLLER_EXIT:                    break            if self._cfg.maxbytes:                if self._manage_maxbytes_limits()==CONTROLLER_EXIT:                    break                def stop(self):        """ Stop this thread """        self._exitflag = True    def terminator(self):        """ The function which terminates the program        in case of an exceptional condition """        # This somehow got deleted in HarvestMan 1.4.5        self._tq.endloop(True)    def _manage_time_limits(self):        """ Manage limits on time for the project """        t2=time.time()        timediff = float((math.modf((t2-self._cfg.starttime)*100.0)[1])/100.0)        timemax = self._cfg.timelimit                if timediff >= timemax -1:            info('Specified time limit of',timemax ,'seconds reached!')                        self.terminator()            return CONTROLLER_EXIT                return HARVESTMAN_OK    def _manage_file_limits(self):        """ Manage limits on maximum file count """        lsaved = self._dmgr.savedfiles        lmax = self._cfg.maxfiles        if lsaved >= lmax:            info('Specified file limit of',lmax ,'reached!')            self.terminator()            return CONTROLLER_EXIT                return HARVESTMAN_OK    def _manage_maxbytes_limits(self):        """ Manage limits on maximum bytes a crawler should download in total per job. """        lsaved = self._dmgr.savedbytes        lmax = self._cfg.maxbytes        # Let us check for a closer hit of 90%...        if (lsaved >=0.90*lmax):            info('Specified maxbytes limit of',lmax ,'reached!')            self.terminator()               return CONTROLLER_EXIT                return HARVESTMAN_OK
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -