📄 datamgr.py
        content = self.cache._url[url]
        if content:
            return content[0].get('last_modified', '')
        else:
            return ''

    def get_etag(self, urlobj):
        """ Return the etag of the given URL if it was found in the cache """

        # This will be called from connector to avoid downloading
        # URL data using HTTP 304.
        if (not self._cfg.pagecache):
            return ''

        url = urlobj.get_full_url()
        content = self.cache._url[url]
        if content:
            return content[0].get('etag', '')
        else:
            return ''

    def is_url_cache_uptodate(self, urlobj, filename, urldata, contentlen=0, last_modified=0, etag=''):
        """ Check with project cache and find out if the content needs update """

        # If page caching is not enabled, return False
        # straightaway!

        # print 'Inside is_url_cache_uptodate...'

        if not self._cfg.pagecache:
            return (False, False)

        # Return True if cache is uptodate (no update needed)
        # and False if cache is out-of-date (update needed)
        # NOTE: We are using a comparison of the sha checksum of
        # the file's data with the sha checksum of the cache file.

        # Assume that cache is not uptodate apriori
        uptodate, fileverified = False, False

        url = urlobj.get_full_url()
        content = self.cache._url[url]

        if content:
            cachekey = content[0]
            cachekey['updated'] = False

            fileloc = cachekey['location']
            if os.path.exists(fileloc) and os.path.abspath(fileloc) == os.path.abspath(filename):
                fileverified = True

            # Use a cascading logic - if last_modified is available use it first
            if last_modified:
                if cachekey['last_modified']:
                    # Get current modified time
                    cmt = cachekey['last_modified']
                    # print cmt,'=>',lmt

                    # If the latest page has a modified time greater than the
                    # cached one, the page is out of date, otherwise it is uptodate
                    if last_modified <= cmt:
                        uptodate = True

            # Else if etag is available use it...
            elif etag:
                if cachekey['etag']:
                    tag = cachekey['etag']
                    if etag == tag:
                        uptodate = True

            # Finally use a checksum of actual data if everything else fails
            elif urldata:
                if cachekey['checksum']:
                    cachesha = cachekey['checksum']
                    digest = sha.new(urldata).hexdigest()

                    if cachesha == digest:
                        uptodate = True

        if not uptodate:
            # Modified this logic - Anand Jan 10 06
            self.update_cache_for_url(urlobj, filename, urldata, contentlen, last_modified, etag)

        return (uptodate, fileverified)

    def conditional_cache_set(self):
        """ A utility function to conditionally enable/disable the cache mechanism """

        # If already page cache is disabled, do not do anything
        if not self._cfg.pagecache:
            return

        # If the cache file exists for this project, disable
        # cache, else enable it.
        cachefilename = self.get_proj_cache_filename()

        if os.path.exists(cachefilename) and os.path.getsize(cachefilename):
            self._cfg.writecache = False
        else:
            self._cfg.writecache = True

    def post_download_setup(self):
        """ Actions to perform after project is complete """

        # Loop through URL db, one by one and then for those
        # URLs which were downloaded but did not succeed, try again.
        # But make sure we don't download links which were not-modified
        # on server-side (HTTP 304) and hence were skipped.
        failed = []
        # Broken links (404)
        nbroken = 0

        for node in self._urldb.preorder():
            urlobj = node.get()

            # print 'URL=>',urlobj.get_full_url()

            if urlobj.status == 404:
                # print 'BROKEN', urlobj.get_full_url()
                nbroken += 1
            elif urlobj.qstatus == urlparser.URL_DONE_DOWNLOAD and \
                     urlobj.status != 0 and urlobj.status != 304:
                failed.append(urlobj)

        self._numfailed = len(failed)
        # print 'BROKEN=>', nbroken

        if self._cfg.retryfailed:
            info(' ')

            # try downloading again
            if self._numfailed:
                info('Redownloading failed links...',)
                self._redownload = True

                for urlobj in failed:
                    if urlobj.fatal or urlobj.starturl:
                        continue
                    extrainfo('Re-downloading', urlobj.get_full_url())
                    self._numretried += 1
                    self.thread_download(urlobj)

                # Wait for the downloads to complete...
                if self._numretried:
                    extrainfo("Waiting for the re-downloads to complete...")
                    self._urlThreadPool.wait(10.0, self._cfg.timeout)

                worked = 0
                # Let us calculate the failed rate again...
                for urlobj in failed:
                    if urlobj.status == 0:
                        # Download was done
                        worked += 1

                self._numfailed2 = self._numfailed - worked

        # Stop the url thread pool
        # Stop worker threads
        self._urlThreadPool.stop_all_threads()

        # bugfix: Moved the time calculation code here.
        t2 = time.time()
        self._cfg.endtime = t2

        # Write cache file
        if self._cfg.pagecache and self._cfg.writecache:
            cachewriter = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
            self.add_headers_to_cache()
            cachewriter.write_project_cache(self.cache)

        # If url header dump is enabled, dump it
        if self._cfg.urlheaders:
            self.dump_headers()

        if self._cfg.localise:
            self.localise_links()

        # Write archive file...
        if self._cfg.archive:
            self.archive_project()

        # dump url tree (dependency tree) to a file
        if self._cfg.urltreefile:
            self.dump_urltree()

        if not self._cfg.project:
            return

        nlinks = self._urldb.size

        # print stats of the project
        nservers, ndirs, nfiltered = objects.rulesmgr.get_stats()
        nfailed = self._numfailed
        numstillfailed = self._numfailed2

        numfiles = self.savedfiles
        numfilesinrepos = self.reposfiles
        numfilesincache = self.cachefiles

        numretried = self._numretried

        fetchtime = self._cfg.endtime - self._cfg.starttime

        statsd = { 'links' : nlinks,
                   'filtered' : nfiltered,
                   'processed' : nlinks - nfiltered,
                   'broken' : nbroken,
                   'extservers' : nservers,
                   'extdirs' : ndirs,
                   'failed' : nfailed,
                   'fatal' : numstillfailed,
                   'files' : numfiles,
                   'filesinrepos' : numfilesinrepos,
                   'filesincache' : numfilesincache,
                   'retries' : numretried,
                   'bytes' : self.bytes,
                   'fetchtime' : fetchtime,
                   }

        self.print_project_info(statsd)

        objects.eventmgr.raise_event('postdownload', None)

    def check_exists(self, urlobj):

        # Check if this URL object exists (is a duplicate)
        return self._urldb.lookup(urlobj.index)

    def update_bytes(self, count):
        """ Update the global byte count """

        self.bytes += count

    def update_saved_bytes(self, count):
        """ Update the saved byte count """

        self.savedbytes += count

    def update_file_stats(self, urlObject, status):
        """ Add the passed information to the saved file list """

        if not urlObject:
            return NULL_URLOBJECT_ERROR

        filename = urlObject.get_full_filename()

        if status == DOWNLOAD_YES_OK:
            self.savedfiles += 1
        elif status == DOWNLOAD_NO_UPTODATE:
            self.reposfiles += 1
        elif status == DOWNLOAD_NO_CACHE_SYNCED:
            self.cachefiles += 1
        elif status == DOWNLOAD_NO_WRITE_FILTERED:
            self.filteredfiles += 1

        return HARVESTMAN_OK

    def update_links(self, source, collection):
        """ Update the links dictionary for this collection """

        self.collections.insert(source.index, collection)

    def thread_download(self, url):
        """ Schedule download of this web document in a separate
thread """ # Add this task to the url thread pool if self._urlThreadPool: url.qstatus = urlparser.URL_QUEUED self._urlThreadPool.push(url) def has_download_threads(self): """ Return true if there are any download sub-threads running, else return false """ if self._urlThreadPool: num_threads = self._urlThreadPool.has_busy_threads() if num_threads: return True return False def last_download_thread_report_time(self): """ Get the time stamp of the last completed download (sub) thread """ if self._urlThreadPool: return self._urlThreadPool.last_thread_report_time() else: return 0 def kill_download_threads(self): """ Terminate all the download threads """ if self._urlThreadPool: self._urlThreadPool.end_all_threads() def create_local_directory(self, directory): """ Create the directories on the disk named 'directory' """ # new in 1.4.5 b1 - No need to create the # directory for raw saves using the nocrawl # option. if self._cfg.rawsave: return CREATE_DIRECTORY_OK try: # Fix for EIAO bug #491 # Sometimes, however had we try, certain links # will be saved as files, whereas they might be # in fact directories. In such cases, check if this # is a file, then create a folder of the same name # and move the file as index.html to it. path = directory while path: if os.path.isfile(path): # Rename file to file.tmp fname = path os.rename(fname, fname + '.tmp') # Now make the directory os.makedirs(path) # If successful, move the renamed file as index.html to it if os.path.isdir(path): fname = fname + '.tmp' shutil.move(fname, os.path.join(path, 'index.html')) path2 = os.path.dirname(path) # If we hit the root, break if path2 == path: break path = path2 if not os.path.isdir(directory): os.makedirs( directory ) extrainfo("Created => ", directory) return CREATE_DIRECTORY_OK except OSError, e: error("Error in creating directory", directory) error(str(e)) return CREATE_DIRECTORY_NOT_OK return CREATE_DIRECTORY_OK def download_multipart_url(self, urlobj, clength): """ Download a URL using HTTP/1.1 multipart download using range headers """ # First add entry of this domain in # dictionary, if not there domain = urlobj.get_full_domain()