connector.py | HarvestMan (latest version) | Python | page 1 of 5
                                      urllib2.FTPHandler,
                                      #urllib2.GopherHandler,
                                      urllib2.FileHandler,
                                      urllib2.HTTPDefaultErrorHandler,
                                      cookiehandler)

        opener.addheaders = [] # Need to clear default headers so we can apply our own
        urllib2.install_opener(opener)

        return CONFIGURE_PROTOCOL_OK

    # Get methods
    def get_useproxy(self):
        """ Returns whether we are going through a proxy server """

        return self._useproxy

    def get_proxy_info(self):
        """ Return proxy information as a tuple. The first member
        of the tuple is the proxy server dictionary and the second
        member the proxy authentication information """

        return (self._proxydict, self._proxyauth)

    def increment_socket_errors(self, val=1):
        """ Increment socket error count """

        self._sockerrs += val

    def decrement_socket_errors(self, val=1):
        """ Decrement socket error count """

        self._sockerrs -= val

    def get_socket_errors(self):
        """ Get socket error count """

        return self._sockerrs
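
## A minimal, self-contained sketch (not part of the original file) of the
## opener-configuration pattern that configure_protocols() above uses:
## build_opener() chains the handlers, addheaders is emptied so that
## per-request headers fully replace urllib2's defaults, and install_opener()
## makes the opener global for every urlopen() call. The handler list below
## is illustrative, not HarvestMan's exact one.
##
## import urllib2, cookielib
##
## cookiehandler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
## opener = urllib2.build_opener(urllib2.HTTPHandler,
##                               urllib2.FTPHandler,
##                               urllib2.FileHandler,
##                               urllib2.HTTPDefaultErrorHandler,
##                               cookiehandler)
## opener.addheaders = []          # drop the default ('User-agent', ...) header
## urllib2.install_opener(opener)  # urllib2.urlopen() now routes through it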

class HarvestManUrlError(object):
    """ Class encapsulating errors raised by HarvestManUrlConnector objects
    while connecting and downloading data from the Internet """

    def __init__(self):
        """ Overloaded __init__ method """

        self.initialize()

    def initialize(self):
        """ Initializes an instance of this class """

        self.reset()

    def __str__(self):
        """ Returns string representation of an instance of the class """

        return ''.join((str(self.errclass), ' ', str(self.number), ': ', self.msg))

    def reset(self):
        """ Resets attributes """

        self.number = 0
        self.msg = ''
        self.fatal = False
        self.errclass = ''
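
## A small usage sketch (assumed, not from the original file) of how a
## HarvestManUrlError instance might be filled in after a failed fetch and
## then printed via the __str__ format above ("<errclass> <number>: <msg>"):
##
## err = HarvestManUrlError()
## try:
##     urllib2.urlopen('http://www.example.com/missing.html')
## except urllib2.HTTPError, e:
##     err.errclass = e.__class__.__name__   # 'HTTPError'
##     err.number = e.code
##     err.msg = str(e)
##     err.fatal = False
## print str(err)   # e.g. "HTTPError 404: HTTP Error 404: Not Found"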

class HarvestManUrlConnector(object):
    """ Class which performs the work of fetching data for URLs
    from the Internet and saving the data to disk """

    __metaclass__ = MethodWrapperMetaClass

    def __str__(self):
        """ Return a string representation of an instance of this class """

        return repr(self)

    def __init__(self):
        """ Overloaded __init__ method """

        # file like object returned by
        # urllib2.urlopen(...)
        self._freq = None
        # data downloaded
        self._data = ''
        # length of data downloaded
        self._datalen = 0
        # error object
        self._error = HarvestManUrlError()
        # time to wait before reconnect
        # in case of failed connections
        self._sleeptime = 0.5
        # global network configurator
        self.network_conn = objects.connmgr
        # Config object
        self._cfg = objects.config
        # Http header for current connection
        self._headers = CaselessDict()
        # HarvestMan file object
        self._fo = None
        # Elapsed time for reading data
        self._elapsed = 0.0
        # Mode for data download
        self._mode = self._cfg.datamode
        # Temporary filename if any
        self._tmpfname = ''
        # Status of connection
        # 0 => no connection
        # 1 => connected, download in progress
        self._status = 0
        # Number of tries
        self._numtries = 0
        # Acquired flag
        self._acquired = True
        # Block write flag - used to tell
        # the connector not to save the
        # data to disk
        self.blockwrite = False
        # Throttle sleeping time to be
        # set on the file object
        self.throttle_time = 0

    def __del__(self):
        del self._data
        self._data = None
        del self._freq
        self._freq = None
        del self._error
        self._error = None
        del self.network_conn
        self.network_conn = None
        del self._cfg
        self._cfg = None

    def _proxy_query(self, queryauth=1, queryserver=0):
        """ Query the user for proxy related information """

        self.network_conn.set_useproxy(True)

        if queryserver or queryauth:
            # There is an error in the config file/project file/user input
            SetUserDebug("Error in proxy server settings (Regenerate the config/project file)")

        # Get proxy info from user
        try:
            if queryserver:
                server = bin_crypt(raw_input('Enter the name/ip of your proxy server: '))
                port = int(raw_input('Enter the proxy port: '))

                self.network_conn.set_proxy(server, port)

            if queryauth:
                user = bin_crypt(raw_input('Enter username for your proxy server: '))

                # Ask for a password only if a valid user is given.
                if user:
                    passwd = bin_crypt(getpass.getpass('Enter password for your proxy server: '))
                    # Set it on myself and re-configure
                    self.network_conn.set_authinfo(user, passwd)
        except EOFError, e:
            error("Proxy Setting Error:", e)

        info('Re-configuring protocol handlers...')
        self.network_conn.configure_protocols()

        extrainfo('Done.')

    def release(self):
        """ Marks the connector object as released """

        self._acquired = False

    def is_released(self):
        """ Returns whether the connector was released or not """

        return (not self._acquired)
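
##     A brief sketch (assumed usage, not from the original file) of the
##     acquire/release cycle the two methods above imply: a pool hands out a
##     connector and the caller releases it when done, so it can be reused.
##     The pool object here is hypothetical.
##
##     conn = pool.get_connector()      # pool sets conn._acquired = True
##     try:
##         stream = conn.urlopen('http://www.example.com/')
##     finally:
##         conn.release()               # now conn.is_released() == True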

    def urlopen(self, url):
        """ Opens the URL and returns the url file stream """

        try:
            urlobj = urlparser.HarvestManUrl(url)
            self.connect(urlobj, True, self._cfg.retryfailed)
            # return the file like object
            if self._error.fatal:
                return None
            else:
                return self._freq
        except urlparser.HarvestManUrlError, e:
            error("URL Error:", e)

    def robot_urlopen(self, url):
        """ Opens a robots.txt URL and returns the request object """

        try:
            urlobj = urlparser.HarvestManUrl(url)
            self.connect(urlobj, False, 0)
            # return the file like object
            if self._error.fatal:
                return None
            else:
                return self._freq
        except urlparser.HarvestManUrlError, e:
            error("URL Error:", e)

    def connect(self, urlobj, fetchdata=True, retries=1, lastmodified='', etag=''):
        """ Connects to the Internet and fetches data for the URL encapsulated
        in the object 'urlobj' """

        # This is the work-horse method of this class...

        data = ''
        dmgr = objects.datamgr
        rulesmgr = objects.rulesmgr

        self._numtries = 0
        three_oh_four = False

        # Reset the http headers
        self._headers.clear()
        urltofetch = urlobj.get_full_url()
        lmt, tag = lastmodified, etag

        # Raise an event...
        if objects.eventmgr.raise_event('beforeconnect', urlobj, None,
                                        last_modified=lastmodified, etag=etag) == False:
            return CONNECT_NO_FILTERED

        add_ua = self._cfg._connaddua

        while self._numtries <= retries and not self._error.fatal:

            # Reset status
            self._status = 0

            errnum = 0
            try:
                # Reset error
                self._error.reset()

                self._numtries += 1

                # Create a request object.
                # If we are passed either the lastmodified time or
                # the etag value or both, we will be creating a
                # HEAD request. If either the etag or the lastmodified
                # time matches, the server should produce a 304
                # response and we break the loop automatically. If
                # not, we have to set the lmt and tag values to null
                # strings so that we make an actual request.

                # Set lmt, tag to null strings if try count is greater
                # than 1...
                if self._numtries > 1:
                    lmt, tag = '', ''

                request = self.create_request(urltofetch, lmt, tag, useragent=add_ua)

                # Check for a url object which is trying to do
                # a multipart download.
                #byterange = urlobj.range
                #if byterange:
                #    range1 = byterange[0]
                #    range2 = byterange[-1]
                #    request.add_header('Range', 'bytes=%d-%d' % (range1, range2))

                # If we accept http-compression, add the required header.
                if self._cfg.httpcompress:
                    request.add_header('Accept-Encoding', 'gzip')

                self._freq = urllib2.urlopen(request)
                # Set status to 1
                self._status = 1

                # Set http headers
                self.set_http_headers()

                clength = int(self.get_content_length())
                if urlobj: urlobj.clength = clength

                trynormal = False
                # Check the constraint on file size; don't do this on
                # objects which are already downloading pieces of
                # a multipart download.
                if not self.check_content_length(): # and not byterange
                    maxsz = self._cfg.maxfilesize
                    extrainfo("Url", urltofetch, "does not match size constraints")
                    # Raise an event...
                    objects.eventmgr.raise_event('afterconnect', urlobj, None)

                    return CONNECT_NO_RULES_VIOLATION

##                     supports_multipart = dmgr.supports_range_requests(urlobj)

##                     # Dont do range checking on FTP servers since they
##                     # typically support it by default.
##                     if urlobj.protocol != 'ftp' and supports_multipart==0:
##                         # See if the server supports the 'Range' header
##                         # by requesting half the length
##                         self._headers.clear()
##                         request.add_header('Range','bytes=%d-%d' % (0,clength/2))
##                         self._freq = urllib2.urlopen(request)

##                         # Set http headers
##                         self.set_http_headers()
##                         range_result = self._headers.get('accept-ranges')
##                         if range_result.lower()=='bytes':
##                             supports_multipart = 1
##                         else:
##                             extrainfo('Server %s does not support multipart downloads' % urlobj.domain)
##                             extrainfo('Aborting download of URL %s.' % urltofetch)
##                             return CONNECT_NO_RULES_VIOLATION

##                     if supports_multipart==1:
##                         extrainfo('Server %s supports multipart downloads' % urlobj.domain)
##                         dmgr.download_multipart_url(urlobj, clength)
##                         return CONNECT_MULTIPART_DOWNLOAD
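
##                 A sketch (assumed, not from the original file) of decoding a
##                 response fetched with the 'Accept-Encoding: gzip' header
##                 added earlier in this method; urllib2 does not decompress
##                 gzip bodies itself, so the caller has to. This assumes
##                 CaselessDict behaves like a plain dict for get().
##
##                 import gzip, cStringIO
##
##                 data = self._freq.read()
##                 if self._headers.get('content-encoding', '') == 'gzip':
##                     data = gzip.GzipFile(fileobj=cStringIO.StringIO(data)).read()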

                # The actual url information is used to
                # differentiate between directory like urls
                # and file like urls.
                actual_url = self._freq.geturl()

                # Replace the urltofetch in actual_url with null
                if actual_url:
                    no_change = (actual_url == urltofetch)

                    if not no_change:
                        replacedurl = actual_url.replace(urltofetch, '')
                        # If the difference is only a trailing directory slash
                        if replacedurl == '/':
                            no_change = True
                        else:
                            no_change = False

                    # Sometimes there could be HTTP re-directions, which
                    # means the actual url may not be the same as the
                    # original one.
                    if no_change:
                        if (actual_url[-1] == '/' and urltofetch[-1] != '/'):
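
## A standalone sketch (assumed, not from the original file) of the
## conditional-GET behaviour the comments in connect() describe: send
## If-Modified-Since / If-None-Match headers built from an earlier response,
## and treat a 304 reply as "cached copy still valid". The URL is a
## placeholder, and lastmodified/etag stand for values saved from a previous
## response.
##
## import urllib2
##
## request = urllib2.Request('http://www.example.com/index.html')
## if lastmodified:
##     request.add_header('If-Modified-Since', lastmodified)
## if etag:
##     request.add_header('If-None-Match', etag)
## try:
##     freq = urllib2.urlopen(request)
##     data = freq.read()                 # entity changed, fresh copy fetched
## except urllib2.HTTPError, e:
##     if e.code == 304:
##         pass                           # not modified, keep the cached copy
##     else:
##         raise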
