connector.py
                    extrainfo('Setting directory url=>', urltofetch)
                    urlobj.set_directory_url()
                else:
                    # There is considerable change in the URL, so we
                    # need to re-resolve it, since some child URLs
                    # which derive from this one could otherwise be
                    # invalid and would result in 404 errors.
                    urlobj.redirected = True
                    urlobj.url = actual_url
                    debug('Actual URL=>', actual_url)
                    debug("Re-resolving URL: Current is %s..." % urlobj.get_full_url())
                    urlobj.wrapper_resolveurl()
                    debug("Re-resolving URL: New is %s..." % urlobj.get_full_url())
                    urltofetch = urlobj.get_full_url()

                # Find the actual type... if the type was assumed
                # wrongly, correct it.
                content_type = self.get_content_type()
                urlobj.manage_content_type(content_type)

                # Update byte count.
                # If this is not the first attempt, print a success msg.
                if self._numtries > 1:
                    extrainfo("Reconnect succeeded => ", urltofetch)

                # Update content info on the url object
                self.set_content_info(urlobj)

                if fetchdata:
                    try:
                        # If gzip-encoded, we need to deflate the data
                        encoding = self.get_content_encoding()
                        clength = self.get_content_length()

                        t1 = time.time()

                        if self._fo is None:
                            if self._mode == CONNECTOR_DATA_MODE_FLUSH:
                                if self._cfg.projtmpdir:
                                    self._tmpfname = self.make_tmp_fname(urlobj.get_filename(),
                                                                         self._cfg.projtmpdir)
                                else:
                                    # For stand-alone use outside crawls
                                    self._tmpfname = self.make_tmp_fname(urlobj.get_filename(),
                                                                         GetMyTempDir())

                            self._fo = HarvestManFileObject(self._freq,
                                                            self._tmpfname,
                                                            clength,
                                                            self._mode,
                                                            float(self._cfg.bandwidthlimit))
                            self._fo.initialize()
                        else:
                            self._fo.set_fileobject(self._freq)
                            self._fo.read()

                        self._elapsed = time.time() - t1

                        self._freq.close()

                        if self._mode == CONNECTOR_DATA_MODE_INMEM:
                            data = self._fo.get_data()
                            self._datalen = len(data)
                            # Save a reference
                            data0 = data
                            self._freq.close()
                            dmgr.update_bytes(len(data))

                            debug('Encoding', encoding)

                            if encoding.strip().find('gzip') != -1:
                                try:
                                    gzfile = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
                                    data = gzfile.read()
                                    gzfile.close()
                                except (IOError, EOFError), e:
                                    # Bad or truncated gzip stream - keep raw data
                                    data = data0
                        else:
                            self._datalen = self._fo.get_datalen()
                            dmgr.update_bytes(self._datalen)

                    except MemoryError, e:
                        # Catch memory error for sockets
                        error("Memory Error:", str(e))

                # Explicitly set the status of urlobj to zero since
                # the download was completed...
                urlobj.status = 0
                break

            #except Exception, e:
            #    raise

            except urllib2.HTTPError, e:

                try:
                    errbasic, errdescn = (str(e)).split(':', 1)
                    parts = errbasic.strip().split()
                    self._error.number = int(parts[-1])
                    self._error.msg = errdescn.strip()
                    self._error.errclass = "HTTPError"
                except:
                    pass

                if self._error.msg:
                    error(self._error.msg, '=> ', urltofetch)
                else:
                    error('HTTPError:', urltofetch)

                # Default, so errnum is always bound even if the
                # conversion below fails
                errnum = 0
                try:
                    errnum = int(self._error.number)
                except:
                    pass

                if errnum == 304:
                    # Page not modified
                    three_oh_four = True
                    self._error.fatal = False
                    # Need to do this to ensure that the crawler
                    # proceeds further!
                    content_type = self.get_content_type()
                    urlobj.manage_content_type(content_type)
                    break

                if errnum in range(400, 407):
                    # 400 => Bad request
                    # 401 => Unauthorized
                    # 402 => Payment required (not used)
                    # 403 => Forbidden
                    # 404 => Not found
                    # 405 => Method not allowed
                    # 406 => Not acceptable
                    # If the error is 400, 405 or 406, we retry
                    # with the user-agent string not set.
                    if errnum in (400, 405, 406):
                        self._cfg._badrequests += 1
                        # If we get many bad requests in a row,
                        # we disable UA addition for this crawl.
                        if self._cfg._badrequests >= 5:
                            self._cfg._connaddua = False

                        if self._numtries <= retries:
                            add_ua = False
                        else:
                            self._error.fatal = True
                    else:
                        self._error.fatal = True

                elif errnum == 407:
                    # Proxy authentication required
                    self._proxy_query(1, 1)
                elif errnum == 408:
                    # Request timeout, try again
                    pass
                elif errnum == 412:
                    # Precondition failed; this has been seen due
                    # to our user-agent on some websites
                    # (sample URL: http://guyh.textdriven.com/)
                    self._error.fatal = True
                elif errnum in range(409, 418):
                    # Error codes in 409-417 contain a mix of
                    # fatal and non-fatal states. For example,
                    # 410 indicates the requested resource is no
                    # longer available, but we could try later.
                    # However, for all practical purposes we
                    # are marking these codes as fatal errors
                    # for the time being.
                    self._error.fatal = True
                elif errnum == 500:
                    # Internal server error, can try again
                    pass
                elif errnum == 501:
                    # Server does not implement the functionality
                    # to fulfill the request - fatal
                    self._error.fatal = True
                elif errnum == 502:
                    # Bad gateway, can try again ?
                    pass
                elif errnum in range(503, 506):
                    # 503 - Service unavailable
                    # 504 - Gateway timeout
                    # 505 - HTTP version not supported
                    self._error.fatal = True

                if self._error.fatal:
                    rulesmgr.add_to_filter(urltofetch)

            except urllib2.URLError, e:
                # print 'urlerror', urltofetch
                errdescn = ''
                self._error.errclass = "URLError"

                try:
                    errbasic, errdescn = (str(e)).split(':', 1)
                    parts = errbasic.split()
                except:
                    try:
                        errbasic, errdescn = (str(e)).split(',')
                        parts = errbasic.split('(')
                        errdescn = (errdescn.split("'"))[1]
                    except:
                        pass

                try:
                    self._error.number = int(parts[-1])
                except:
                    pass

                if errdescn:
                    self._error.msg = errdescn

                if self._error.msg:
                    error(self._error.msg, '=> ', urltofetch)
                else:
                    error('URLError:', urltofetch)

                errnum = self._error.number

                if errnum == 10049 or errnum == 10061:
                    # Proxy server error
                    self._proxy_query(1, 1)
                elif errnum == 10055:
                    # No buffer space available
                    self.network_conn.increment_socket_errors()
                    # If the number of socket errors is >= 4,
                    # we decrease max connections by 1
                    sockerrs = self.network_conn.get_socket_errors()
                    if sockerrs >= 4:
                        self._cfg.connections -= 1
                        self.network_conn.decrement_socket_errors(4)

            except IOError, e:
                self._error.number = URL_IO_ERROR
                self._error.fatal = True
                self._error.msg = str(e)
                self._error.errclass = "IOError"
                # Generated by invalid ftp hosts and
                # other reasons
                # bug(url: http://www.gnu.org/software/emacs/emacs-paper.html)
                error(e, '=> ', urltofetch)

            except BadStatusLine, e:
                self._error.number = URL_BADSTATUSLINE
                self._error.msg = str(e)
                self._error.errclass = "BadStatusLine"
                error(e, '=> ', urltofetch)

            except TypeError, e:
                self._error.number = URL_TYPE_ERROR
                self._error.msg = str(e)
                self._error.errclass = "TypeError"
                error(e, '=> ', urltofetch)

            except ValueError, e:
                self._error.number = URL_VALUE_ERROR
                self._error.msg = str(e)
                self._error.errclass = "ValueError"
                error(e, '=> ', urltofetch)

            except AssertionError, e:
                self._error.number = URL_ASSERTION_ERROR
                self._error.msg = str(e)
                self._error.errclass = "AssertionError"
                error(e, '=> ', urltofetch)

            except socket.error, e:
                self._error.msg = str(e)
                self._error.number = URL_SOCKET_ERROR
                self._error.errclass = "SocketError"
                errmsg = self._error.msg
                error('Socket Error: ', errmsg, '=> ', urltofetch)

                if errmsg.lower().find('connection reset by peer') != -1:
                    # Connection reset by peer (socket error)
                    self.network_conn.increment_socket_errors()
                    # If the number of socket errors is >= 4,
                    # we decrease max connections by 1
                    sockerrs = self.network_conn.get_socket_errors()
                    if sockerrs >= 4:
                        self._cfg.connections -= 1
                        self.network_conn.decrement_socket_errors(4)

            except socket.timeout, e:
                self._error.msg = 'socket timed out'
                self._error.number = URL_SOCKET_TIMEOUT
                errmsg = self._error.msg
                error('Socket Error: ', errmsg, '=> ', urltofetch)

            except Exception, e:
                self._error.msg = str(e)
                self._error.number = URL_GENERAL_ERROR
                self._error.errclass = "GeneralError"
                errmsg = self._error.msg
                error('General Error: ', errmsg, '=> ', urltofetch)

            # Attempt a reconnect after some time
            # self.evnt.sleep()
            time.sleep(self._sleeptime)

        if data:
            self._data = data
            # Set hash on URL object
            urlobj.pagehash = sha.new(data).hexdigest()

        # print 'URLOBJ STATUS=>', urlobj.status

        if urlobj and urlobj.status != 0:
            urlobj.status = self._error.number
            urlobj.fatal = self._error.fatal
            debug('Setting %s status to %s' % (urlobj.get_full_url(), str(urlobj.status)))

        # Raise an event...
        objects.eventmgr.raise_event('afterconnect', urlobj, None)

        if three_oh_four:
            return CONNECT_NO_UPTODATE

        if self._data or self._datalen:
            return CONNECT_YES_DOWNLOADED
        else:
            return CONNECT_NO_ERROR

    def set_progress_object(self, topic, n=0, subtopics=[], nolengthmode=False):
        """ Set the progress bar with the given topic and sub-topics """

        # n => number of subtopics
        # topic => Topic
        # subtopics => List of subtopics
        # n should be == len(subtopics)
        if n != len(subtopics):
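For reference, the gzip handling buried inside connect() above reduces to a small stand-alone pattern. The sketch below is illustrative only (gunzip_if_needed is a hypothetical helper, not part of HarvestMan): it decompresses an in-memory response body when the Content-Encoding value mentions gzip, and falls back to the raw bytes if the stream is truncated or corrupt, just as the connector does.

import gzip
import cStringIO

def gunzip_if_needed(data, encoding):
    # Hypothetical helper, not part of HarvestMan.
    # Decompress only when the server declared gzip encoding.
    if encoding.strip().find('gzip') != -1:
        try:
            gzfile = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
            data = gzfile.read()
            gzfile.close()
        except (IOError, EOFError):
            # Bad or truncated gzip stream: keep the raw bytes,
            # mirroring the fallback in connect()
            pass
    return data

For an identity-encoded response the function simply returns the data unchanged, so it is safe to call on every downloaded body.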