📄 connector.py
            return False

        # Create progress object
        prog = self._cfg.progressobj
        prog.setTopic(topic)

        #if n==1:
        #    prog.set(100, 100)
        #else:
        #    prog.set(n, 100)

        if nolengthmode:
            prog.setNoLengthMode(True)

        if n>0:
            prog.setHasSub(True)
            if not nolengthmode:
                for x in range(1,n+1):
                    prog.setSubTopic(x, subtopics[x-1])
                    prog.setSub(x, 0.0, 100)
        else:
            pass

    def make_tmp_fname(self, filename, directory='.'):
        """ Creates a temporary filename for download """

        random.seed()

        while True:
            fint = int(random.random()*random.random()*10000000)
            fname = ''.join(('.',filename,'#',str(fint)))
            fpath = os.path.join(directory, fname)
            if not os.path.isfile(fpath):
                return fpath

    def create_request(self, urltofetch, lmt='', etag='', useragent=True):
        """ Creates a request object for the URL 'urltofetch' and returns it """

        # This function takes care of adding any additional headers
        # etc in addition to creating the request object.

        # Create a request object
        if lmt or etag:
            # print 'Making a head request', lmt, etag
            # Create a head request...
            request = HeadRequest(urltofetch)
            if lmt != '':
                ts = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.localtime(lmt))
                request.add_header('If-Modified-Since', ts)
            if etag != '':
                request.add_header('If-None-Match', etag)
        else:
            request = urllib2.Request(urltofetch)

        # Some sites do not like User-Agent strings and raise a Bad Request
        # (HTTP 400) error. Egs: http://www.bad-ischl.ooe.gv.at/. In such
        # cases, the connect method sets the useragent flag to False and
        # calls this method again.
        # print 'User agent', self._cfg.USER_AGENT
        if useragent:
            request.add_header('User-Agent', self._cfg.USER_AGENT)

        # Check if any HTTP username/password are required
        username, password = self._cfg.username, self._cfg.passwd
        if username and password:
            # Add basic HTTP auth headers. encodestring() appends a
            # trailing newline, so strip it to keep the header value clean.
            authstring = base64.encodestring('%s:%s' % (username, password)).strip()
            request.add_header('Authorization','Basic %s' % authstring)

        return request

    def get_url_data(self, url):
        """ Downloads data for the given URL and returns it """

        try:
            urlobj = urlparser.HarvestManUrl(url)
            res = self.connect(urlobj)
            return self._data
        except urlparser.HarvestManUrlError, e:
            error("URL Error: ", e)

    def connect2(self, urlobj, resuming=False):
        """ Connects to the Internet and fetches data for the URL
        encapsulated in the object 'urlobj'. This is the method used
        by Hget """

        data = ''

        # Reset the http headers
        self._headers.clear()
        retries = self._cfg.retryfailed
        self._numtries = 0

        urltofetch = urlobj.get_full_url()
        filename = urlobj.get_filename()

        dmgr = objects.datamgr
        rulesmgr = objects.rulesmgr

        showprogress = self._cfg.showprogress

        # Flag indicating we are reusing a previously redirected URL
        # to produce an automatic mirror split
        automirror = urlobj.redirected and urlobj.redirected_old
        # print self, urltofetch

        while self._numtries <= retries:
            # If automirror, we don't exit even on fatal errors since
            # a new request on the old URL can lead to a new mirror
            # which can work. If automirror, we exit only after the
            # number of retries is exhausted.
            if not automirror and self._error.fatal:
                break

            # Reset status
            self._status = 0

            errnum = 0
            try:
                # Reset error
                self._error.reset()

                self._numtries += 1

                request = self.create_request(urltofetch)
                byterange = urlobj.range

                if byterange:
                    range1 = byterange[0]
                    range2 = byterange[-1]
                    # For a repeat connection, don't redownload already downloaded data.
                    if self._fo:
                        datasofar = self._fo.get_datalen()
                        if datasofar:
                            range1 += datasofar
                    # If this was a redirected old URL which we are re-using for
                    # producing further redirections for auto-mirror downloads,
                    # don't add this header now, but add it later, after the
                    # redirection has happened.
                    if not automirror:
                        request.add_header('Range','bytes=%d-%d' % (range1,range2))

                self._freq = urllib2.urlopen(request)

                # Set status to 1
                self._status = 1

                actual_url = self._freq.geturl()
                if actual_url != urltofetch:
                    # Don't do this for mirrors...
                    if not urlobj.trymultipart:
                        logconsole('Redirected to %s...' % actual_url)
                    else:
                        extrainfo('Redirected to %s...' % actual_url)

                    if actual_url.replace(urltofetch, '') != '/':
                        no_change = False
                    else:
                        no_change = True

                    if no_change:
                        if (actual_url[-1] == '/' and urltofetch[-1] != '/'):
                            # Setting directory URL
                            urlobj.set_directory_url()
                    else:
                        # Considerable change
                        urlobj.redirected = True
                        urlobj.url = actual_url
                        debug("Re-resolving URL: Current is %s..." % urlobj.get_full_url())
                        urlobj.wrapper_resolveurl()
                        debug("Re-resolving URL: New is %s..." % urlobj.get_full_url())
                        # Get filename again
                        filename = urlobj.get_filename()

                        # Get URL again
                        if byterange and automirror:
                            # print 'Automirror URL...!'
                            range1, range2 = byterange
                            # For a repeat connection, don't redownload already downloaded data.
                            if self._fo:
                                datasofar = self._fo.get_datalen()
                                if datasofar:
                                    range1 += datasofar

                            request = self.create_request(urlobj.get_full_url())
                            request.add_header('Range','bytes=%d-%d' % (range1,range2))

                            self._freq.close()
                            self._freq = urllib2.urlopen(request)

                # Set http headers
                self.set_http_headers()

                encoding = self.get_content_encoding()
                ctype = self.get_content_type()
                clength = int(self.get_content_length())

                if clength==0:
                    clength_str = 'Unknown'
                elif clength>=1024*1024:
                    clength_str = '%dM' % (clength/(1024*1024))
                elif clength>=1024:
                    clength_str = '%dK' % (clength/1024)
                else:
                    clength_str = '%d bytes' % clength

                if showprogress and (resuming or (not urlobj.range)):
                    if clength:
                        logconsole('Length: %d (%s) Type: %s' % (clength, clength_str, ctype))
                        nolengthmode = False
                    else:
                        logconsole('Length: (%s) Type: %s' % (clength_str, ctype))
                        nolengthmode = True

                    logconsole('Content Encoding: %s\n' % encoding)

                # FTP servers do not support HTTP-like byte-range
                # requests. The way to do multipart for FTP is to use
                # the FTP restart (REST) command, but that requires writing
                # new wrappers on top of ftplib instead of the current simpler
                # way of routing everything through urllib2. This is planned
                # for later. However, if mirror search is enabled, we try it.
                if urlobj.protocol == 'ftp://' and not self._cfg.mirrorsearch:
                    trynormal = True
                    if self._cfg.forcesplit:
                        logconsole('FTP request, not trying multipart download, defaulting to single thread')
                else:
                    trynormal = False

                # Check constraint on file size
                if (not byterange) and (not trynormal) and (not self._cfg.nomultipart) and self._cfg.forcesplit:
                    # I hate local imports, but there is no other way to do this
                    # except moving the function to this module.
                    from harvestman.lib import mirrors

                    if self._cfg.forcesplit and not self._cfg.mirrorsearch:
                        logconsole('Forcing download into %d parts' % self._cfg.numparts)

                    if (not self._headers.get('accept-ranges', '').lower() == 'bytes') and \
                       (not self._cfg.mirrorfile) and \
                       not mirrors.is_multipart_download_supported(urlobj) and \
                       not (self._cfg.mirrorsearch):
                        logconsole('Checking whether server supports multipart downloads...')
                        # See if the server supports the 'Range' header
                        # by requesting half the length
                        self._headers.clear()
                        # Need to re-create request in case URL has changed
                        request = self.create_request(urlobj.get_full_url())
                        request.add_header('Range','bytes=%d-%d' % (0,clength/2))
                        self._freq.close()
                        self._freq = urllib2.urlopen(request)

                        # Set http headers
                        self.set_http_headers()
                        range_result = self._headers.get('accept-ranges', '')
                        if range_result.lower()=='bytes':
                            logconsole('Server supports multipart downloads')
                            self._freq.close()
                        else:
                            logconsole('Server does not support multipart downloads')
                            resp = raw_input('Do you still want to download this URL [y/n] ?')
                            if resp.lower() != 'y':
                                logconsole('Aborting download.')
                                return CONNECT_DOWNLOAD_ABORTED
                            else:
                                # Create a fresh request object
                                self._freq.close()
                                request = self.create_request(urlobj.get_full_url())
                                self._freq = urllib2.urlopen(request)
                                logconsole('Downloading %s' % urlobj.get_full_url())
                                trynormal = True
                    else:
                        logconsole('Server supports multipart downloads')

                    if not trynormal:
                        logconsole('Trying multipart download...')
                        urlobj.trymultipart = True

                        ret = dmgr.download_multipart_url(urlobj, clength)
                        if ret == URL_PUSHED_TO_POOL:
                            # Set flag which indicates a multipart
                            # download is in progress
                            self._cfg.multipart = True
                            # Set progress object
                            if showprogress:
                                self.set_progress_object(filename,1,[filename],nolengthmode)
                            return CONNECT_MULTIPART_DOWNLOAD
                        elif ret == MIRRORS_NOT_FOUND:
                            return ret

                # If this is not the first attempt, print a success msg
                if self._numtries>1:
                    extrainfo("Reconnect succeeded => ", urlobj.get_full_url())

                try:
                    # Don't set progress object if multipart download - it
                    # would have been done before.
                    if showprogress and (resuming or (not urlobj.range)):
                        self.set_progress_object(filename,1,[filename],nolengthmode)

                    prog = self._cfg.progressobj

                    mypercent = 0.0

                    # Report fname to calling thread
                    ct = threading.currentThread()

                    # Only set tmpfname if this is a fresh download.
                    if self._tmpfname=='':
                        if not self._cfg.hgetnotemp:
                            if urlobj.trymultipart:
                                localurl = urlobj.mirror_url.get_original_url()
                            else:
                                localurl = urlobj.get_original_url()
                            tmpd = os.path.join(GetMyTempDir(), str(abs(hash(localurl))))
                        else:
                            tmpd = '.'

                        self._tmpfname = self.make_tmp_fname(filename, tmpd)

                    if ct.__class__.__name__ == 'HarvestManUrlThread':
                        ct.set_tmpfname(self._tmpfname)

                    if self._fo==None:
                        self._fo = HarvestManFileObject(self._freq, self._tmpfname, cle
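
The conditional-request branch of create_request() relies on a HeadRequest class defined elsewhere in HarvestMan. A minimal sketch, assuming HeadRequest is the standard urllib2 idiom of a Request subclass that overrides get_method():

import time
import urllib2

class HeadRequest(urllib2.Request):
    # urllib2 has no native HEAD support; overriding get_method()
    # is the usual workaround (an assumption about how HarvestMan's
    # own HeadRequest is defined).
    def get_method(self):
        return 'HEAD'

def make_conditional_request(url, lmt):
    # Build a HEAD request that asks the server to answer
    # 304 Not Modified if the resource is unchanged since 'lmt'
    # (a local timestamp, as in create_request above).
    ts = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.localtime(lmt))
    request = HeadRequest(url)
    request.add_header('If-Modified-Since', ts)
    return request

Note that urllib2 surfaces a 304 response by raising urllib2.HTTPError with code 304, so callers typically wrap urlopen() in a try/except.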
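
connect2() probes for byte-range support by re-issuing the request with a Range header and inspecting Accept-Ranges. The same probe in standalone form; a sketch only, and the 100-byte window is an arbitrary choice rather than what the connector uses:

import urllib2

def supports_byte_ranges(url):
    request = urllib2.Request(url)
    # Ask for only the first 100 bytes
    request.add_header('Range', 'bytes=0-99')
    freq = urllib2.urlopen(request)
    try:
        # Servers that honour ranges advertise 'Accept-Ranges: bytes';
        # this is the same header connect2() inspects.
        return freq.headers.get('accept-ranges', '').lower() == 'bytes'
    finally:
        freq.close()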
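
The actual splitting of a multipart download is delegated to dmgr.download_multipart_url(), whose internals are not shown in this file. Purely as an illustration of the arithmetic involved, a hypothetical helper (not part of HarvestMan) that divides a content length into the (start, end) byte ranges that per-part Range headers would carry:

def split_byte_ranges(clength, numparts):
    # Divide clength bytes into numparts contiguous ranges.
    # Integer division under Python 2; the last part absorbs the
    # remainder so every byte is covered exactly once.
    piece = clength / numparts
    ranges = []
    for i in range(numparts):
        start = i * piece
        if i == numparts - 1:
            end = clength - 1
        else:
            end = start + piece - 1
        ranges.append((start, end))
    return ranges

# Each pair maps to a header of the form
# 'Range: bytes=%d-%d' % (start, end), as used in connect2() above.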