
📄 connector.py

📁 HarvestMan (latest version)
📖 Page 1 of 5
                                extrainfo('Setting directory url=>', urltofetch)
                                urlobj.set_directory_url()
                        else:
                            # There is considerable change in the URL, so we
                            # need to re-resolve it, since otherwise some
                            # child URLs which derive from this could be
                            # invalid and would result in 404 errors.
                            urlobj.redirected = True

                            urlobj.url = actual_url
                            debug('Actual URL=>', actual_url)
                            debug("Re-resolving URL: Current is %s..." % urlobj.get_full_url())
                            urlobj.wrapper_resolveurl()
                            debug("Re-resolving URL: New is %s..." % urlobj.get_full_url())
                            urltofetch = urlobj.get_full_url()

                # Find the actual type... if the type was assumed
                # wrongly, correct it.
                content_type = self.get_content_type()
                urlobj.manage_content_type(content_type)

                # Update byte count.
                # If this is not the first attempt, print a success msg.
                if self._numtries > 1:
                    extrainfo("Reconnect succeeded => ", urltofetch)

                # Update content info on the url object
                self.set_content_info(urlobj)

                if fetchdata:
                    try:
                        # If gzip-encoded, we need to deflate the data
                        encoding = self.get_content_encoding()
                        clength = self.get_content_length()

                        t1 = time.time()

                        if self._fo == None:
                            if self._mode == CONNECTOR_DATA_MODE_FLUSH:
                                if self._cfg.projtmpdir:
                                    self._tmpfname = self.make_tmp_fname(urlobj.get_filename(),
                                                                         self._cfg.projtmpdir)
                                else:
                                    # For stand-alone use outside crawls
                                    self._tmpfname = self.make_tmp_fname(urlobj.get_filename(),
                                                                         GetMyTempDir())

                            self._fo = HarvestManFileObject(self._freq,
                                                            self._tmpfname,
                                                            clength,
                                                            self._mode,
                                                            float(self._cfg.bandwidthlimit))
                            self._fo.initialize()
                        else:
                            self._fo.set_fileobject(self._freq)

                        self._fo.read()
                        self._elapsed = time.time() - t1

                        self._freq.close()
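                        # A minimal sketch of how the two data modes checked
                        # below differ (hypothetical names `mode` and `fo`;
                        # not part of this module): INMEM keeps the whole page
                        # in memory, FLUSH leaves it in a temp file on disk.
                        #
                        #   if mode == CONNECTOR_DATA_MODE_INMEM:
                        #       data = fo.get_data()        # page held in memory
                        #   else:
                        #       datalen = fo.get_datalen()  # size only; the bytes
                        #                                   # are already on disk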
                        if self._mode == CONNECTOR_DATA_MODE_INMEM:
                            data = self._fo.get_data()
                            self._datalen = len(data)
                            # Save a reference
                            data0 = data
                            self._freq.close()
                            dmgr.update_bytes(len(data))
                            debug('Encoding', encoding)

                            if encoding.strip().find('gzip') != -1:
                                try:
                                    gzfile = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
                                    data = gzfile.read()
                                    gzfile.close()
                                except (IOError, EOFError), e:
                                    data = data0
                        else:
                            self._datalen = self._fo.get_datalen()
                            dmgr.update_bytes(self._datalen)

                    except MemoryError, e:
                        # Catch memory error for sockets
                        error("Memory Error:", str(e))

                # Explicitly set the status of urlobj to zero since
                # the download was completed...
                urlobj.status = 0

                break

            #except Exception, e:
            #     raise

            except urllib2.HTTPError, e:

                try:
                    errbasic, errdescn = (str(e)).split(':', 1)
                    parts = errbasic.strip().split()
                    self._error.number = int(parts[-1])
                    self._error.msg = errdescn.strip()
                    self._error.errclass = "HTTPError"
                except:
                    pass

                if self._error.msg:
                    error(self._error.msg, '=> ', urltofetch)
                else:
                    error('HTTPError:', urltofetch)

                try:
                    errnum = int(self._error.number)
                except:
                    pass

                if errnum == 304:
                    # Page not modified
                    three_oh_four = True
                    self._error.fatal = False
                    # Need to do this to ensure that the crawler
                    # proceeds further!
                    content_type = self.get_content_type()
                    urlobj.manage_content_type(content_type)
                    break
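                # The 304 branch above is the conditional-GET path: when the
                # request carried an If-Modified-Since or ETag header and the
                # page is unchanged, the server replies 304 and connect()
                # later returns CONNECT_NO_UPTODATE. A standalone sketch of
                # the same idea with plain urllib2 (hypothetical variables
                # `url` and `timestamp`; not part of this module):
                #
                #   request = urllib2.Request(url)
                #   request.add_header('If-Modified-Since', timestamp)
                #   try:
                #       urllib2.urlopen(request)
                #   except urllib2.HTTPError, e:
                #       if e.code == 304:
                #           pass  # cached copy is still valid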
                if errnum in range(400, 407):
                    # 400 => Bad request
                    # 401 => Unauthorized
                    # 402 => Payment required (not used)
                    # 403 => Forbidden
                    # 404 => Not found
                    # 405 => Method not allowed
                    # 406 => Not acceptable

                    # If the error is 400, 405 or 406, then we
                    # retry with the user-agent string not set.
                    if errnum in (400, 405, 406):
                        self._cfg._badrequests += 1

                        # If we get many bad requests in a row,
                        # we disable UA addition for this crawl.
                        if self._cfg._badrequests >= 5:
                            self._cfg._connaddua = False

                        if self._numtries <= retries:
                            add_ua = False
                        else:
                            self._error.fatal = True
                    else:
                        self._error.fatal = True

                elif errnum == 407:
                    # Proxy authentication required
                    self._proxy_query(1, 1)
                elif errnum == 408:
                    # Request timeout, try again
                    pass
                elif errnum == 412:
                    # Pre-condition failed; this has been seen
                    # triggered by our user-agent on some
                    # websites (sample URL: http://guyh.textdriven.com/)
                    self._error.fatal = True
                elif errnum in range(409, 418):
                    # Error codes in 409-417 contain a mix of
                    # fatal and non-fatal states. For example,
                    # 410 indicates the requested resource is no
                    # longer available, but we could try later.
                    # However, for all practical purposes we
                    # are marking these codes as fatal errors
                    # for the time being.
                    self._error.fatal = True
                elif errnum == 500:
                    # Internal server error, can try again
                    pass
                elif errnum == 501:
                    # Server does not implement the functionality
                    # to fulfill the request - fatal
                    self._error.fatal = True
                elif errnum == 502:
                    # Bad gateway, can try again?
                    pass
                elif errnum in range(503, 506):
                    # 503 - Service unavailable
                    # 504 - Gateway timeout
                    # 505 - HTTP version not supported
                    self._error.fatal = True

                if self._error.fatal:
                    rulesmgr.add_to_filter(urltofetch)
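            # The handler above boils each HTTP status down to a retry-or-
            # abort decision. A condensed sketch of that policy as a helper
            # (hypothetical function, not part of this module; 400/405/406
            # first get one more attempt without the user-agent header):
            #
            #   def is_fatal(code):
            #       if code in (401, 402, 403, 404, 412, 501):
            #           return True              # retrying will not help
            #       if code in range(409, 418) or code in range(503, 506):
            #           return True              # treated as fatal for now
            #       return False                 # e.g. 408, 500, 502: retry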
            except urllib2.URLError, e:
                # print 'urlerror', urltofetch

                errdescn = ''
                self._error.errclass = "URLError"

                try:
                    errbasic, errdescn = (str(e)).split(':', 1)
                    parts = errbasic.split()
                except:
                    try:
                        errbasic, errdescn = (str(e)).split(',')
                        parts = errbasic.split('(')
                        errdescn = (errdescn.split("'"))[1]
                    except:
                        pass

                try:
                    self._error.number = int(parts[-1])
                except:
                    pass

                if errdescn:
                    self._error.msg = errdescn

                if self._error.msg:
                    error(self._error.msg, '=> ', urltofetch)
                else:
                    error('URLError:', urltofetch)

                errnum = self._error.number

                if errnum == 10049 or errnum == 10061: # Proxy server error
                    self._proxy_query(1, 1)
                elif errnum == 10055:
                    # No buffer space available
                    self.network_conn.increment_socket_errors()
                    # If the number of socket errors is >= 4,
                    # we decrease max connections by 1
                    sockerrs = self.network_conn.get_socket_errors()
                    if sockerrs >= 4:
                        self._cfg.connections -= 1
                        self.network_conn.decrement_socket_errors(4)

            except IOError, e:
                self._error.number = URL_IO_ERROR
                self._error.fatal = True
                self._error.msg = str(e)
                self._error.errclass = "IOError"
                # Generated by invalid ftp hosts and other reasons
                # (bug url: http://www.gnu.org/software/emacs/emacs-paper.html)
                error(e, '=> ', urltofetch)

            except BadStatusLine, e:
                self._error.number = URL_BADSTATUSLINE
                self._error.msg = str(e)
                self._error.errclass = "BadStatusLine"
                error(e, '=> ', urltofetch)

            except TypeError, e:
                self._error.number = URL_TYPE_ERROR
                self._error.msg = str(e)
                self._error.errclass = "TypeError"
                error(e, '=> ', urltofetch)

            except ValueError, e:
                self._error.number = URL_VALUE_ERROR
                self._error.msg = str(e)
                self._error.errclass = "ValueError"
                error(e, '=> ', urltofetch)

            except AssertionError, e:
                self._error.number = URL_ASSERTION_ERROR
                self._error.msg = str(e)
                self._error.errclass = "AssertionError"
                error(e, '=> ', urltofetch)

            except socket.error, e:
                self._error.msg = str(e)
                self._error.number = URL_SOCKET_ERROR
                self._error.errclass = "SocketError"

                errmsg = self._error.msg
                error('Socket Error: ', errmsg, '=> ', urltofetch)

                if errmsg.lower().find('connection reset by peer') != -1:
                    # Connection reset by peer (socket error)
                    self.network_conn.increment_socket_errors()
                    # If the number of socket errors is >= 4,
                    # we decrease max connections by 1
                    sockerrs = self.network_conn.get_socket_errors()
                    if sockerrs >= 4:
                        self._cfg.connections -= 1
                        self.network_conn.decrement_socket_errors(4)
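            # Both the 10055 branch and the connection-reset branch above
            # apply the same backpressure rule: after four socket errors,
            # shed one concurrent connection. A sketch of that policy in
            # isolation (hypothetical counter `errs` and limit `conns`;
            # not part of this module):
            #
            #   errs += 1
            #   if errs >= 4:
            #       conns -= 1   # crawl with one less parallel connection
            #       errs -= 4    # reset the window and keep counting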
            except socket.timeout, e:
                self._error.msg = 'socket timed out'
                self._error.number = URL_SOCKET_TIMEOUT

                errmsg = self._error.msg
                error('Socket Error: ', errmsg, '=> ', urltofetch)

            except Exception, e:
                self._error.msg = str(e)
                self._error.number = URL_GENERAL_ERROR
                self._error.errclass = "GeneralError"

                errmsg = self._error.msg
                error('General Error: ', errmsg, '=> ', urltofetch)

            # Attempt reconnect after some time
            # self.evnt.sleep()
            time.sleep(self._sleeptime)

        if data:
            self._data = data
            # Set hash on URL object
            urlobj.pagehash = sha.new(data).hexdigest()

        # print 'URLOBJ STATUS=>', urlobj.status
        if urlobj and urlobj.status != 0:
            urlobj.status = self._error.number
            urlobj.fatal = self._error.fatal
            debug('Setting %s status to %s' % (urlobj.get_full_url(), str(urlobj.status)))

        # Raise an event...
        objects.eventmgr.raise_event('afterconnect', urlobj, None)

        if three_oh_four:
            return CONNECT_NO_UPTODATE

        if self._data or self._datalen:
            return CONNECT_YES_DOWNLOADED
        else:
            return CONNECT_NO_ERROR

    def set_progress_object(self, topic, n=0, subtopics=[], nolengthmode=False):
        """ Set the progress bar with the given topic and sub-topics """

        # n => number of subtopics
        # topic => Topic
        # subtopics => List of subtopics
        # n should be == len(subtopics)
        if n != len(subtopics):
