# connector.py
                                      urllib2.FTPHandler,
                                      #urllib2.GopherHandler,
                                      urllib2.FileHandler,
                                      urllib2.HTTPDefaultErrorHandler,
                                      cookiehandler)

        opener.addheaders = [] # Need to clear default headers so we can apply our own
        urllib2.install_opener(opener)

        return CONFIGURE_PROTOCOL_OK

    # Get methods
    def get_useproxy(self):
        """ Returns whether we are going through a proxy server """

        return self._useproxy

    def get_proxy_info(self):
        """ Return proxy information as a tuple. The first member
        of the tuple is the proxy server dictionary and the second
        member the proxy authentication information """

        return (self._proxydict, self._proxyauth)

    def increment_socket_errors(self, val=1):
        """ Increment socket error count """

        self._sockerrs += val

    def decrement_socket_errors(self, val=1):
        """ Decrement socket error count """

        self._sockerrs -= val

    def get_socket_errors(self):
        """ Get socket error count """

        return self._sockerrs


class HarvestManUrlError(object):
    """ Class encapsulating errors raised by HarvestManUrlConnector
    objects while connecting and downloading data from the Internet """

    def __init__(self):
        """ Overloaded __init__ method """

        self.initialize()

    def initialize(self):
        """ Initializes an instance of this class """

        self.reset()

    def __str__(self):
        """ Returns string representation of an instance of the class """

        return ''.join((str(self.errclass), ' ', str(self.number), ': ', self.msg))

    def reset(self):
        """ Resets attributes """

        self.number = 0
        self.msg = ''
        self.fatal = False
        self.errclass = ''
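
# A quick illustration of how the error object above renders once a
# failed fetch has populated its attributes (the values here are
# hypothetical, not produced by HarvestMan itself):
#
#   err = HarvestManUrlError()
#   err.errclass, err.number, err.msg = 'HTTPError', 404, 'Not Found'
#   print err
#   # -> HTTPError 404: Not Found
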
class HarvestManUrlConnector(object):
    """ Class which performs the work of fetching data for URLs
    from the Internet and save data to the disk """

    __metaclass__ = MethodWrapperMetaClass

    def __str__(self):
        """ Return a string representation of an instance of this class """

        return `self`

    def __init__(self):
        """ Overloaded __init__ method """

        # file like object returned by
        # urllib2.urlopen(...)
        self._freq = None
        # data downloaded
        self._data = ''
        # length of data downloaded
        self._datalen = 0
        # error object
        self._error = HarvestManUrlError()
        # time to wait before reconnect
        # in case of failed connections
        self._sleeptime = 0.5
        # global network configurator
        self.network_conn = objects.connmgr
        # Config object
        self._cfg = objects.config
        # Http header for current connection
        self._headers = CaselessDict()
        # HarvestMan file object
        self._fo = None
        # Elapsed time for reading data
        self._elapsed = 0.0
        # Mode for data download
        self._mode = self._cfg.datamode
        # Temporary filename if any
        self._tmpfname = ''
        # Status of connection
        # 0 => no connection
        # 1 => connected, download in progress
        self._status = 0
        # Number of tries
        self._numtries = 0
        # Acquired flag
        self._acquired = True
        # Block write flag - to be used
        # to indicate to connector to
        # not save the data to disk
        self.blockwrite = False
        # Throttle sleeping time to be
        # set on the file object
        self.throttle_time = 0

    def __del__(self):
        del self._data
        self._data = None
        del self._freq
        self._freq = None
        del self._error
        self._error = None
        del self.network_conn
        self.network_conn = None
        del self._cfg
        self._cfg = None

    def _proxy_query(self, queryauth=1, queryserver=0):
        """ Query the user for proxy related information """

        self.network_conn.set_useproxy(True)

        if queryserver or queryauth:
            # There is an error in the config file/project file/user input
            SetUserDebug("Error in proxy server settings (Regenerate the config/project file)")

        # Get proxy info from user
        try:
            if queryserver:
                server = bin_crypt(raw_input('Enter the name/ip of your proxy server: '))
                port = int(raw_input('Enter the proxy port: '))
                self.network_conn.set_proxy(server, port)

            if queryauth:
                user = bin_crypt(raw_input('Enter username for your proxy server: '))
                # Ask for password only if a valid user is given.
                if user:
                    passwd = bin_crypt(getpass.getpass('Enter password for your proxy server: '))
                    # Set it on myself and re-configure
                    self.network_conn.set_authinfo(user, passwd)
        except EOFError, e:
            error("Proxy Setting Error:", e)

        info('Re-configuring protocol handlers...')
        self.network_conn.configure_protocols()
        extrainfo('Done.')

    def release(self):
        """ Marks the connector object as released """

        self._acquired = False

    def is_released(self):
        """ Returns whether the connector was released or not """

        return (not self._acquired)

    def urlopen(self, url):
        """ Opens the URL and returns the url file stream """

        try:
            urlobj = urlparser.HarvestManUrl(url)
            self.connect(urlobj, True, self._cfg.retryfailed)
            # return the file like object
            if self._error.fatal:
                return None
            else:
                return self._freq
        except urlparser.HarvestManUrlError, e:
            error("URL Error:", e)

    def robot_urlopen(self, url):
        """ Opens a robots.txt URL and returns the request object """

        try:
            urlobj = urlparser.HarvestManUrl(url)
            self.connect(urlobj, False, 0)
            # return the file like object
            if self._error.fatal:
                return None
            else:
                return self._freq
        except urlparser.HarvestManUrlError, e:
            error("URL Error:", e)
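
    # Usage sketch for the two fetch entry points above. This assumes the
    # HarvestMan globals (objects.config, objects.connmgr etc.) have been
    # initialised by the framework; the URLs are examples only:
    #
    #   conn = HarvestManUrlConnector()
    #   stream = conn.urlopen('http://www.example.com/index.html')
    #   if stream is not None:
    #       data = stream.read()      # file-like object from urllib2
    #
    #   rstream = conn.robot_urlopen('http://www.example.com/robots.txt')
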
    def connect(self, urlobj, fetchdata=True, retries=1, lastmodified='', etag=''):
        """ Connects to the Internet and fetches data for the URL
        encapsulated in the object 'urlobj' """

        # This is the work-horse method of this class...
        data = ''

        dmgr = objects.datamgr
        rulesmgr = objects.rulesmgr

        self._numtries = 0
        three_oh_four = False

        # Reset the http headers
        self._headers.clear()
        urltofetch = urlobj.get_full_url()
        lmt, tag = lastmodified, etag

        # Raise an event...
        if objects.eventmgr.raise_event('beforeconnect', urlobj, None,
                                        last_modified=lastmodified, etag=etag)==False:
            return CONNECT_NO_FILTERED

        add_ua = self._cfg._connaddua

        while self._numtries <= retries and not self._error.fatal:

            # Reset status
            self._status = 0

            errnum = 0
            try:
                # Reset error
                self._error.reset()

                self._numtries += 1

                # create a request object
                # If we are passed either the lastmodified time or
                # the etag value or both, we will be creating a
                # head request. Now if either the etag or lastmodified
                # time match, the server should produce a 304 error
                # and we break the loop automatically. If not, we have
                # to set lmt and tag values to null strings so that
                # we make an actual request.

                # Set lmt, tag to null strings if try count is greater
                # than 1...
                if self._numtries>1:
                    lmt, tag = '', ''

                request = self.create_request(urltofetch, lmt, tag, useragent=add_ua)

                # Check for urlobject which is trying to do
                # multipart download.
                #byterange = urlobj.range
                #if byterange:
                #    range1 = byterange[0]
                #    range2 = byterange[-1]
                #    request.add_header('Range','bytes=%d-%d' % (range1, range2))

                # If we accept http-compression, add the required header.
                if self._cfg.httpcompress:
                    request.add_header('Accept-Encoding', 'gzip')

                self._freq = urllib2.urlopen(request)

                # Set status to 1
                self._status = 1

                # Set http headers
                self.set_http_headers()

                clength = int(self.get_content_length())
                if urlobj:
                    urlobj.clength = clength

                trynormal = False
                # Check constraint on file size, don't do this on
                # objects which are already downloading pieces of
                # a multipart download.
                if not self.check_content_length():  # and not byterange
                    maxsz = self._cfg.maxfilesize
                    extrainfo("Url", urltofetch, "does not match size constraints")
                    # Raise an event...
                    objects.eventmgr.raise_event('afterconnect', urlobj, None)
                    return CONNECT_NO_RULES_VIOLATION

##                supports_multipart = dmgr.supports_range_requests(urlobj)
##
##                # Don't do range checking on FTP servers since they
##                # typically support it by default.
##                if urlobj.protocol != 'ftp' and supports_multipart==0:
##                    # See if the server supports 'Range' header
##                    # by requesting half the length
##                    self._headers.clear()
##                    request.add_header('Range','bytes=%d-%d' % (0,clength/2))
##                    self._freq = urllib2.urlopen(request)
##
##                    # Set http headers
##                    self.set_http_headers()
##                    range_result = self._headers.get('accept-ranges')
##                    if range_result.lower()=='bytes':
##                        supports_multipart = 1
##                    else:
##                        extrainfo('Server %s does not support multipart downloads' % urlobj.domain)
##                        extrainfo('Aborting download of URL %s.' % urltofetch)
##                        return CONNECT_NO_RULES_VIOLATION
##
##                if supports_multipart==1:
##                    extrainfo('Server %s supports multipart downloads' % urlobj.domain)
##                    dmgr.download_multipart_url(urlobj, clength)
##                    return CONNECT_MULTIPART_DOWNLOAD

                # The actual url information is used to
                # differentiate between directory like urls
                # and file like urls.
                actual_url = self._freq.geturl()

                # Replace the urltofetch in actual_url with null
                if actual_url:
                    no_change = (actual_url == urltofetch)

                    if not no_change:
                        replacedurl = actual_url.replace(urltofetch, '')
                        # If the difference is only as a directory url
                        if replacedurl=='/':
                            no_change = True
                        else:
                            no_change = False

                    # Sometimes, there could be HTTP re-directions which
                    # means the actual url may not be same as original one.
                    if no_change:
                        if (actual_url[-1] == '/' and urltofetch[-1] != '/'):
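
    # Conditional-fetch sketch for connect() above: passing a cached
    # last-modified date (and/or etag) makes the first try a conditional
    # request, so an unchanged resource answers 304 and the retry loop
    # exits without re-downloading. The URL and header value below are
    # illustrative only:
    #
    #   conn = HarvestManUrlConnector()
    #   urlobj = urlparser.HarvestManUrl('http://www.example.com/page.html')
    #   conn.connect(urlobj, fetchdata=True, retries=1,
    #                lastmodified='Mon, 01 Jan 2007 00:00:00 GMT')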