
📄 datamgr.py

📁 Harvestman (latest version)
💻 PY
📖 Page 1 of 4
        content = self.cache._url[url]
        if content:
            return content[0].get('last_modified', '')
        else:
            return ''

    def get_etag(self, urlobj):
        """ Return the etag of the given URL if it was found in the cache """

        # This will be called from connector to avoid downloading
        # URL data using HTTP 304.
        if (not self._cfg.pagecache):
            return ''

        url = urlobj.get_full_url()
        content = self.cache._url[url]
        if content:
            return content[0].get('etag', '')
        else:
            return ''

    def is_url_cache_uptodate(self, urlobj, filename, urldata, contentlen=0, last_modified=0, etag=''):
        """ Check with project cache and find out if the
        content needs update """

        # If page caching is not enabled, return False
        # straightaway!
        # print 'Inside is_url_cache_uptodate...'

        if not self._cfg.pagecache:
            return (False, False)

        # Return True if cache is uptodate (no update needed)
        # and False if cache is out-of-date (update needed)
        # NOTE: We are using a comparison of the sha checksum of
        # the file's data with the sha checksum of the cache file.

        # Assume that cache is not uptodate apriori
        uptodate, fileverified = False, False

        url = urlobj.get_full_url()
        content = self.cache._url[url]
        if content:
            cachekey = content[0]
            cachekey['updated'] = False

            fileloc = cachekey['location']
            if os.path.exists(fileloc) and os.path.abspath(fileloc) == os.path.abspath(filename):
                fileverified = True

            # Use a cascading logic - if last_modified is available use it first
            if last_modified:
                if cachekey['last_modified']:
                    # Get current modified time
                    cmt = cachekey['last_modified']
                    # print cmt,'=>',lmt
                    # If the latest page has a modified time greater than this,
                    # the page is out of date, otherwise it is uptodate
                    if last_modified <= cmt:
                        uptodate = True
            # Else if etag is available use it...
            elif etag:
                if cachekey['etag']:
                    tag = cachekey['etag']
                    if etag == tag:
                        uptodate = True
            # Finally use a checksum of actual data if everything else fails
            elif urldata:
                if cachekey['checksum']:
                    cachesha = cachekey['checksum']
                    digest = sha.new(urldata).hexdigest()

                    if cachesha == digest:
                        uptodate = True

        if not uptodate:
            # Modified this logic - Anand Jan 10 06
            self.update_cache_for_url(urlobj, filename, urldata, contentlen, last_modified, etag)

        return (uptodate, fileverified)
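The method above checks freshness in a cascade: the Last-Modified time first, then the ETag, and only then a SHA digest of the downloaded data. A minimal standalone sketch of the same cascade, with hypothetical names and hashlib in place of the older sha module used in this file:

import hashlib

def is_fresh(cache_entry, last_modified=0, etag='', data=None):
    """ Sketch only: report whether a cached copy looks fresh, using the
    same Last-Modified -> ETag -> checksum cascade as is_url_cache_uptodate. """

    if last_modified:
        cached = cache_entry.get('last_modified')
        # Fresh if the server copy is not newer than the cached one
        return bool(cached) and last_modified <= cached
    elif etag:
        return etag == cache_entry.get('etag')
    elif data is not None:
        cached = cache_entry.get('checksum')
        return bool(cached) and cached == hashlib.sha1(data).hexdigest()
    return False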
    def conditional_cache_set(self):
        """ A utility function to conditionally enable/disable
        the cache mechanism """

        # If already page cache is disabled, do not do anything
        if not self._cfg.pagecache:
            return

        # If the cache file exists for this project, disable
        # cache, else enable it.
        cachefilename = self.get_proj_cache_filename()
        if os.path.exists(cachefilename) and os.path.getsize(cachefilename):
            self._cfg.writecache = False
        else:
            self._cfg.writecache = True
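The cache accessors get_last_modified_time() and get_etag() exist so the connector can issue conditional requests and skip pages answered with HTTP 304. A rough sketch of such a conditional GET (hypothetical helper in urllib2 style; the real connector code is not part of this listing):

import urllib2

def conditional_fetch(url, last_modified='', etag=''):
    """ Sketch only: fetch url unless the server reports it unchanged.
    Returns None when the response is 304 Not Modified. """

    request = urllib2.Request(url)
    if last_modified:
        request.add_header('If-Modified-Since', last_modified)
    if etag:
        request.add_header('If-None-Match', etag)
    try:
        return urllib2.urlopen(request).read()
    except urllib2.HTTPError, e:
        if e.code == 304:
            # Cached copy is still valid, nothing to download
            return None
        raise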
    def post_download_setup(self):
        """ Actions to perform after project is complete """

        # Loop through URL db, one by one and then for those
        # URLs which were downloaded but did not succeed, try again.
        # But make sure we don't download links which were not-modified
        # on server-side (HTTP 304) and hence were skipped.
        failed = []
        # Broken links (404)
        nbroken = 0

        for node in self._urldb.preorder():
            urlobj = node.get()
            # print 'URL=>',urlobj.get_full_url()

            if urlobj.status == 404:
                # print 'BROKEN', urlobj.get_full_url()
                nbroken += 1
            elif urlobj.qstatus == urlparser.URL_DONE_DOWNLOAD and \
                     urlobj.status != 0 and urlobj.status != 304:
                failed.append(urlobj)

        self._numfailed = len(failed)
        # print 'BROKEN=>', nbroken

        if self._cfg.retryfailed:
            info(' ')
            # try downloading again
            if self._numfailed:
                info('Redownloading failed links...',)
                self._redownload = True

                for urlobj in failed:
                    if urlobj.fatal or urlobj.starturl: continue
                    extrainfo('Re-downloading', urlobj.get_full_url())
                    self._numretried += 1
                    self.thread_download(urlobj)

                # Wait for the downloads to complete...
                if self._numretried:
                    extrainfo("Waiting for the re-downloads to complete...")
                    self._urlThreadPool.wait(10.0, self._cfg.timeout)

                worked = 0
                # Let us calculate the failed rate again...
                for urlobj in failed:
                    if urlobj.status == 0:
                        # Download was done
                        worked += 1

                self._numfailed2 = self._numfailed - worked

        # Stop the url thread pool
        # Stop worker threads
        self._urlThreadPool.stop_all_threads()

        # bugfix: Moved the time calculation code here.
        t2 = time.time()
        self._cfg.endtime = t2

        # Write cache file
        if self._cfg.pagecache and self._cfg.writecache:
            cachewriter = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
            self.add_headers_to_cache()
            cachewriter.write_project_cache(self.cache)

        # If url header dump is enabled, dump it
        if self._cfg.urlheaders:
            self.dump_headers()

        if self._cfg.localise:
            self.localise_links()

        # Write archive file...
        if self._cfg.archive:
            self.archive_project()

        # dump url tree (dependency tree) to a file
        if self._cfg.urltreefile:
            self.dump_urltree()

        if not self._cfg.project: return

        nlinks = self._urldb.size

        # print stats of the project
        nservers, ndirs, nfiltered = objects.rulesmgr.get_stats()
        nfailed = self._numfailed
        numstillfailed = self._numfailed2

        numfiles = self.savedfiles
        numfilesinrepos = self.reposfiles
        numfilesincache = self.cachefiles

        numretried = self._numretried

        fetchtime = self._cfg.endtime - self._cfg.starttime

        statsd = { 'links' : nlinks,
                   'filtered': nfiltered,
                   'processed': nlinks - nfiltered,
                   'broken': nbroken,
                   'extservers' : nservers,
                   'extdirs' : ndirs,
                   'failed' : nfailed,
                   'fatal' : numstillfailed,
                   'files' : numfiles,
                   'filesinrepos' : numfilesinrepos,
                   'filesincache' : numfilesincache,
                   'retries' : numretried,
                   'bytes': self.bytes,
                   'fetchtime' : fetchtime,
                   }

        self.print_project_info(statsd)

        objects.eventmgr.raise_event('postdownload', None)

    def check_exists(self, urlobj):

        # Check if this URL object exists (is a duplicate)
        return self._urldb.lookup(urlobj.index)

    def update_bytes(self, count):
        """ Update the global byte count """

        self.bytes += count

    def update_saved_bytes(self, count):
        """ Update the saved byte count """

        self.savedbytes += count

    def update_file_stats(self, urlObject, status):
        """ Add the passed information to the saved file list """

        if not urlObject: return NULL_URLOBJECT_ERROR

        filename = urlObject.get_full_filename()

        if status == DOWNLOAD_YES_OK:
            self.savedfiles += 1
        elif status == DOWNLOAD_NO_UPTODATE:
            self.reposfiles += 1
        elif status == DOWNLOAD_NO_CACHE_SYNCED:
            self.cachefiles += 1
        elif status == DOWNLOAD_NO_WRITE_FILTERED:
            self.filteredfiles += 1

        return HARVESTMAN_OK

    def update_links(self, source, collection):
        """ Update the links dictionary for this collection """

        self.collections.insert(source.index, collection)

    def thread_download(self, url):
        """ Schedule download of this web document in a separate thread """

        # Add this task to the url thread pool
        if self._urlThreadPool:
            url.qstatus = urlparser.URL_QUEUED
            self._urlThreadPool.push(url)

    def has_download_threads(self):
        """ Return true if there are any download sub-threads
        running, else return false """

        if self._urlThreadPool:
            num_threads = self._urlThreadPool.has_busy_threads()
            if num_threads:
                return True

        return False

    def last_download_thread_report_time(self):
        """ Get the time stamp of the last completed
        download (sub) thread """

        if self._urlThreadPool:
            return self._urlThreadPool.last_thread_report_time()
        else:
            return 0

    def kill_download_threads(self):
        """ Terminate all the download threads """

        if self._urlThreadPool:
            self._urlThreadPool.end_all_threads()

    def create_local_directory(self, directory):
        """ Create the directories on the disk named 'directory' """

        # new in 1.4.5 b1 - No need to create the
        # directory for raw saves using the nocrawl
        # option.
        if self._cfg.rawsave:
            return CREATE_DIRECTORY_OK

        try:
            # Fix for EIAO bug #491
            # Sometimes, however hard we try, certain links
            # will be saved as files, whereas they might be
            # in fact directories. In such cases, check if this
            # is a file, then create a folder of the same name
            # and move the file as index.html to it.
            path = directory
            while path:
                if os.path.isfile(path):
                    # Rename file to file.tmp
                    fname = path
                    os.rename(fname, fname + '.tmp')
                    # Now make the directory
                    os.makedirs(path)
                    # If successful, move the renamed file as index.html to it
                    if os.path.isdir(path):
                        fname = fname + '.tmp'
                        shutil.move(fname, os.path.join(path, 'index.html'))

                path2 = os.path.dirname(path)
                # If we hit the root, break
                if path2 == path: break
                path = path2

            if not os.path.isdir(directory):
                os.makedirs(directory)
                extrainfo("Created => ", directory)

            return CREATE_DIRECTORY_OK
        except OSError, e:
            error("Error in creating directory", directory)
            error(str(e))
            return CREATE_DIRECTORY_NOT_OK

        return CREATE_DIRECTORY_OK

    def download_multipart_url(self, urlobj, clength):
        """ Download a URL using HTTP/1.1 multipart download
        using range headers """

        # First add entry of this domain in
        # dictionary, if not there
        domain = urlobj.get_full_domain()
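The listing breaks off here; the rest of download_multipart_url continues on the following pages. As a reference for the technique its docstring names, a minimal sketch of a byte-range download that splits a known content length into Range-header requests (hypothetical helper, shown sequentially for brevity):

import urllib2

def fetch_in_ranges(url, clength, nparts=4):
    """ Sketch only: download url in nparts pieces via HTTP/1.1
    Range headers and join the pieces in order. """

    chunk = clength // nparts
    pieces = []
    for i in range(nparts):
        start = i * chunk
        # The last piece runs to the end of the resource
        end = (clength - 1) if i == nparts - 1 else (start + chunk - 1)
        request = urllib2.Request(url)
        request.add_header('Range', 'bytes=%d-%d' % (start, end))
        pieces.append(urllib2.urlopen(request).read())

    return ''.join(pieces)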
