📄 datamgr.py
        orig_url = urlobj.get_full_url()
        old_urlobj = urlobj.get_original_state()
        domain_changed_a_lot = False

        # If this was a re-directed URL, check if there is a
        # considerable change in the domains. If there is,
        # there is a very good chance that the original URL
        # is redirecting to mirrors, so we can split on
        # the original URL and this would automatically
        # produce a split-mirror download without us having
        # to do any extra work!
        if urlobj.redirected and old_urlobj != None:
            old_domain = old_urlobj.get_domain()
            if old_domain != domain:
                # Check if it is somewhat similar
                # if domain.find(old_domain) == -1:
                domain_changed_a_lot = True

        try:
            self._serversdict[domain]
        except KeyError:
            self._serversdict[domain] = {'accept-ranges': True}

        if self.mirrormgr.mirrors_available(urlobj):
            return self.mirrormgr.download_multipart_url(urlobj, clength,
                                                         self._cfg.numparts,
                                                         self._urlThreadPool)
        else:
            if domain_changed_a_lot:
                urlobj = old_urlobj
                # Set a flag to indicate this
                urlobj.redirected_old = True

        parts = self._cfg.numparts
        # Calculate size of each piece
        piecesz = clength/parts

        # Build the list of piece sizes
        pcsizes = [piecesz]*parts
        # For the last URL, add the remainder
        pcsizes[-1] += clength % parts

        # Create a URL object for each piece and set its range
        urlobjects = []
        for x in range(parts):
            urlobjects.append(copy.copy(urlobj))

        prev = 0
        for x in range(parts):
            curr = pcsizes[x]
            next = curr + prev
            urlobject = urlobjects[x]
            # Set mirror_url attribute
            urlobject.mirror_url = urlobj
            urlobject.trymultipart = True
            urlobject.clength = clength
            urlobject.range = (prev, next-1)
            urlobject.mindex = x
            prev = next
            # Push this URL object to the pool
            self._urlThreadPool.push(urlobject)

        return URL_PUSHED_TO_POOL

    def download_url(self, caller, url):

        no_threads = (not self._cfg.usethreads) or \
                     url.is_webpage() or \
                     url.is_stylesheet()

        data = ""
        if no_threads:
            # Set status to queued
            url.qstatus = urlparser.URL_QUEUED
            # This call will block if we exceed the number of connections
            conn = objects.connfactory.create_connector()
            url.qstatus = urlparser.URL_IN_QUEUE
            res = conn.save_url( url )

            objects.connfactory.remove_connector(conn)

            filename = url.get_full_filename()

            if res != CONNECT_NO_ERROR:
                self.update_file_stats( url, res )

                if res==DOWNLOAD_YES_OK:
                    info("Saved", filename)

                if url.is_webpage():
                    if self._cfg.datamode==CONNECTOR_DATA_MODE_INMEM:
                        data = conn.get_data()
                    elif os.path.isfile(filename):
                        # Need to read data from the file...
                        data = open(filename, 'rb').read()
            else:
                fetchurl = url.get_full_url()
                extrainfo("Failed to download url", fetchurl)

            self._urldb.update(url.index, url)
        else:
            # Hand the URL over to the download threads
            self.thread_download( url )

        return data

    def clean_up(self):
        """ Purge data for a project by cleaning up lists,
        dictionaries and resetting other member items """

        # Drop the URL database and collections, then reset state
        if self._urldb and self._urldb.size:
            del self._urldb
        if self.collections and self.collections.size:
            del self.collections

        self.reset()

    def archive_project(self):
        """ Archive project files into a tar archive file.
        The archive will be further compressed in gz or bz2 format """

        extrainfo("Archiving project files...")
        # Get project directory
        projdir = self._cfg.projdir
        # Get archive format
        if self._cfg.archformat=='bzip':
            format = 'bz2'
        elif self._cfg.archformat=='gzip':
            format = 'gz'
        else:
            error("Archive Error: Archive format not recognized")
            return INVALID_ARCHIVE_FORMAT

        # Create tarfile name
        ptarf = os.path.join(self._cfg.basedir,
                             "".join((self._cfg.project, '.tar.', format)))

        cwd = os.getcwd()
        os.chdir(self._cfg.basedir)

        # Create tarfile object
        tf = tarfile.open(ptarf, 'w:' + format)
        # Projdir base name
        pbname = os.path.basename(projdir)

        # Add directories
        for item in os.listdir(projdir):
            # Skip cache directory, if any
            if item=='hm-cache':
                continue
            # Add directory
            fullpath = os.path.join(projdir, item)
            if os.path.isdir(fullpath):
                tf.add(os.path.join(pbname, item))

        # Dump the tarfile
        tf.close()
        os.chdir(cwd)

        # Check whether writing was done
        if os.path.isfile(ptarf):
            extrainfo("Wrote archive file", ptarf)
            return FILE_WRITE_OK
        else:
            error("Error in writing archive file", ptarf)
            return FILE_WRITE_ERROR

    def add_headers_to_cache(self):
        """ Add original URL headers of urls downloaded
        as an entry to the cache file """

        # Navigate in pre-order, i.e in the order of insertion...
        for node in self.collections.preorder():
            coll = node.get()

            # Get list of links for this collection
            for urlobjidx in coll.getAllURLs():
                urlobj = self.get_url(urlobjidx)
                if urlobj==None:
                    continue

                url = urlobj.get_full_url()
                # Get headers
                headers = urlobj.get_url_content_info()
                if headers:
                    content = self.cache._url[url]
                    if content:
                        urldict = content[0]
                        urldict['headers'] = headers

    def dump_headers(self):
        """ Dump the headers of the web pages
        downloaded, into a DBM file """

        extrainfo("Writing url headers database")

        headersdict = {}
        for node in self.collections.preorder():
            coll = node.get()

            for urlobjidx in coll.getAllURLs():
                urlobj = self.get_url(urlobjidx)
                if urlobj:
                    url = urlobj.get_full_url()
                    # Get headers
                    headers = urlobj.get_url_content_info()
                    if headers:
                        headersdict[url] = str(headers)

        cache = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        return cache.write_url_headers(headersdict)

    def localise_links(self):
        """ Localise all links (urls) of the downloaded html pages """

        # Don't confuse 'localising' with language localization.
        # This means just converting the outward (Internet) pointing
        # URLs in files to local files.
        info('Localising links of downloaded web pages...',)

        count = 0
        localized = []

        for node in self.collections.preorder():
            coll = node.get()

            sourceurl = self.get_url(coll.getSourceURL())
            childurls = [self.get_url(index) for index in coll.getAllURLs()]
            filename = sourceurl.get_full_filename()

            if (not filename in localized) and os.path.exists(filename):
                extrainfo('Localizing links for', filename)
                if SUCCESS(self.localise_file_links(filename, childurls)):
                    count += 1
                    localized.append(filename)

        info('Localised links of', count, 'web pages.')

    def localise_file_links(self, filename, links):
        """ Localise links for this file """

        data = ''

        try:
            fw = open(filename, 'r+')
            data = fw.read()
            fw.seek(0)
            fw.truncate(0)
        except (OSError, IOError), e:
            return FILE_TRUNCATE_ERROR

        # Regexes to strip unbalanced parentheses at the end of URLs
        r1 = re.compile(r'\)+$')
        r2 = re.compile(r'\(+$')

        # MOD: Replace any <base href="..."> line
        basehrefre = re.compile(r'<base href=.*>', re.IGNORECASE)
        if basehrefre.search(data):
            data = re.sub(basehrefre, '', data)

        for u in links:
            if not u:
                continue

            url_object = u
            typ = url_object.get_type()

            if url_object.is_image():
                http_str = "src"
            else:
                http_str = "href"

            v = url_object.get_original_url()
            if v == '/':
                continue

            # Somehow, some urls seem to have an
            # unbalanced parenthesis at the end.
            # Remove it. Otherwise it will crash
            # the regular expressions below.
            v = r1.sub('', v)
            v2 = r2.sub('', v)

            # Bug fix, don't localize cgi links
            if typ != 'base':
                if url_object.is_cgi():
                    continue

                fullfilename = os.path.abspath( url_object.get_full_filename() )
                urlfilename = ''

                # Modification: localisation w.r.t relative pathnames
                if self._cfg.localise==2:
                    urlfilename = url_object.get_relative_filename()
                elif self._cfg.localise==1:
                    urlfilename = fullfilename

                # replace '\\' with '/'
                urlfilename = urlfilename.replace('\\', '/')

                newurl = ''
                oldurl = ''

                # If we cannot get the filenames, replace
                # relative url paths with full url paths so that
                # the user can connect to them.
                if not os.path.exists(fullfilename):
                    # for relative links, replace it with the
                    # full url path
                    fullurlpath = url_object.get_full_url_sans_port()
                    newurl = "href=\"" + fullurlpath + "\""
                else:
                    # replace url with urlfilename
                    if typ == 'anchor':
                        anchor_part = url_object.get_anchor()
                        urlfilename = "".join((urlfilename, anchor_part))
                        # v = "".join((v, anchor_part))

                    if self._cfg.localise == 1:
                        newurl = "".join((http_str, "=\"", "file://", urlfilename, "\""))
                    else:
                        newurl = "".join((http_str, "=\"", urlfilename, "\""))
            else:
                newurl = "".join((http_str, "=\"", "\""))

            if typ != 'img':
                oldurl = "".join((http_str, "=\"", v, "\""))
                try:
                    oldurlre = re.compile("".join((http_str, '=', '\\"?', v, '\\"?')))
                except Exception, e:
                    debug("Error:", str(e))
                    continue

                # Get the location of the link in the file
                try:
                    if oldurl != newurl:
                        data = re.sub(oldurlre, newurl, data, 1)
                except Exception, e:
                    debug("Error:", str(e))
                    continue
            else:
                try:
                    oldurlre1 = "".join((http_str, '=', '\\"?', v, '\\"?'))
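
The listing above breaks off inside localise_file_links. The multipart logic near the top splits clength bytes into numparts contiguous byte ranges, giving the remainder to the last piece. A minimal sketch of just that arithmetic (the split_ranges helper is illustrative and not part of datamgr.py):

def split_ranges(clength, parts):
    """Return a list of (start, end) byte ranges covering clength bytes."""
    piecesz = clength // parts
    pcsizes = [piecesz] * parts
    # The last piece absorbs the remainder
    pcsizes[-1] += clength % parts

    ranges, prev = [], 0
    for size in pcsizes:
        ranges.append((prev, prev + size - 1))
        prev += size
    return ranges

# Example: 10 bytes in 3 parts -> [(0, 2), (3, 5), (6, 9)]
print(split_ranges(10, 3))

Each range then becomes the range attribute of one copied URL object pushed to the thread pool, so the pieces download in parallel and are stitched back together by index.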
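archive_project drives the standard library's tarfile module. A condensed, self-contained sketch of the same approach (archive_dir and its parameters are our names; it assumes, as the method above does, that projdir sits directly under basedir):

import os
import tarfile

def archive_dir(projdir, basedir, project, fmt='gz'):
    """Pack the sub-directories of projdir into <project>.tar.<fmt> under basedir."""
    ptarf = os.path.join(basedir, '%s.tar.%s' % (project, fmt))
    cwd = os.getcwd()
    os.chdir(basedir)
    try:
        tf = tarfile.open(ptarf, 'w:' + fmt)      # fmt is 'gz' or 'bz2'
        pbname = os.path.basename(projdir)
        for item in os.listdir(projdir):
            if item == 'hm-cache':                # skip the cache directory
                continue
            fullpath = os.path.join(projdir, item)
            if os.path.isdir(fullpath):
                # add paths relative to basedir so the archive stays portable
                tf.add(os.path.join(pbname, item))
        tf.close()
    finally:
        os.chdir(cwd)
    return os.path.isfile(ptarf)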
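localise_file_links, truncated above, rewrites href/src attributes in the saved HTML so they point at local files, using re.sub on patterns of the form href="<url>". A hedged sketch of that substitution (localise_link is a hypothetical helper, not the project's API; using re.escape here sidesteps the unbalanced-parenthesis problem the original strips out by hand):

import re

def localise_link(data, url, local_name, attr='href'):
    """Rewrite attr="url" (quoted or unquoted) so it points at local_name."""
    pattern = re.compile(attr + r'\s*=\s*"?' + re.escape(url) + r'"?')
    return pattern.sub('%s="%s"' % (attr, local_name), data, 1)

html = '<a href="http://example.com/page.html">link</a>'
print(localise_link(html, 'http://example.com/page.html', 'page.html'))
# prints: <a href="page.html">link</a>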