# connector.py -- url connector module of a web spider project
        """ Query the user for proxy related information """

        self.network_conn.set_useproxy(1)
        
        if queryserver or queryauth:
            # There is an error in the config file/project file/user input
            SetUserDebug("Error in proxy server settings (Regenerate the config/project file)")

        # Get proxy info from user
        try:
            if queryserver:
                server=bin_crypt(raw_input('Enter the name/ip of your proxy server: '))
                port=int(raw_input('Enter the proxy port: '))         
                self.network_conn.set_proxy(server, port)

            if queryauth:
                user=bin_crypt(raw_input('Enter username for your proxy server: '))
                import getpass
                passwd=bin_crypt(getpass.getpass('Enter password for your proxy server: '))
                # Set it on myself and re-configure
                if user and passwd:
                    self.network_conn.set_authinfo(user,passwd)
        except EOFError, e:
            debug(str(e))

        moreinfo('Re-configuring protocol handlers...')
        self.network_conn.configure_protocols()
        
        moreinfo('Done.')

    def urlopen(self, url):
        """ Open the url and return the url file stream """

        cfg = GetObject('config')
        self.connect(url, None, True, cfg.cookies, cfg.retryfailed )
        # return the file like object
        if self.__error['fatal']:
            return None
        else:
            return self.__freq

    def robot_urlopen(self, url):
        """ Open a robots.txt url """

        self.connect(url, None, False, False, 0)
        # return the file like object
        if self.__error['fatal']:
            return None
        else:
            return self.__freq
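    # A minimal usage sketch for the two fetch entry points above
    # (hypothetical driver code; the connector class name and its
    # construction are assumptions, since this excerpt does not show
    # them):
    #
    #   conn = UrlConnector()
    #   stream = conn.urlopen('http://www.example.com/index.html')
    #   if stream is not None:
    #       print stream.geturl()
    #   else:
    #       print 'Fetch failed:', conn.get_error()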
    
    def connect(self, urltofetch, url_obj = None, fetchdata = True, getcookies = True, retries=1):
        """ Connect to the Internet/Intranet and fetch the data of the passed url """

        data = ''

        _cfg = GetObject('config')
        dmgr = GetObject('datamanager')
        rulesmgr = GetObject('ruleschecker')
        factory = GetObject('connectorfactory')

        # Find out if this is an intranet url by
        # using socket's methods

        if url_obj:
            hu = url_obj
        else:
            try:
                hu = HarvestManUrlParser(urltofetch, 0)
            except HarvestManUrlParserError, e:
                debug(e)
                # Cannot proceed without a parsed url object
                return data

        domain = hu.get_domain()
        intranet = False

        # We need to perform this check only if
        # proxies/firewalls are being used. If it
        # is a direct connection to internet, then
        # the crawler makes no distinction about
        # intranet/internet servers since hostname
        # resolution will happen transparently. In
        # such a case we could as well use urllib2
        # methods since it has more methods than
        # urllib.
        
        if self.network_conn.get_useproxy():
            try:
                socket.gethostbyname(domain)
                intranet = True
            except socket.error:
                pass
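        # The heuristic above, in isolation (a sketch assuming a
        # proxied setup where only intranet hosts resolve via the
        # local resolver, while internet names fail locally and must
        # go through the proxy):
        #
        #   import socket
        #
        #   def is_intranet(domain):
        #       try:
        #           socket.gethostbyname(domain)
        #           return True    # resolved locally => intranet host
        #       except socket.error:
        #           return False   # not resolvable => internet host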

        numtries = 0
        
        while numtries <= retries and not self.__error['fatal']:

            try:
                # Reset error
                self.__error = { 'number' : 0,
                                 'msg' : '',
                                 'fatal' : False }

                numtries += 1

                if not intranet:
                    # create a request object
                    request = urllib2.Request(urltofetch)
                    # request.add_header('Connection', 'keep-alive')
                    request.add_header('keep-alive', '300')
                    # add cookie headers for this request
                    if getcookies:
                        self.fill_cookie_headers(request)
                        
                # Increment request count
                if domain: factory.add_request(domain)

                # For intranet use urllib
                if intranet:
                    self.__freq = urllib.urlopen(urltofetch)
                else:
                    self.__freq = urllib2.urlopen(request)
                    
                # The actual url information is used to
                # differentiate between directory like urls
                # and file like urls.
                actual_url = self.__freq.geturl()
                moredebug('URL, ACTUAL URL=>', urltofetch, actual_url)

                if actual_url and actual_url[-1] == '/' and urltofetch[-1] != '/':
                    # directory url
                    self.__urlobject.set_directory_url(True)

                # Decrement request count
                if domain: factory.remove_request(domain)

                # write cookies for this request
                if getcookies:
                    self.write_cookies()

                # If this is not the first attempt, print a success msg
                if numtries>1:
                    moreinfo("Reconnect succeeded => ", urltofetch)

                # Update content info on urlobject
                self.set_content_info()

                if fetchdata:
                    try:
                        data = self.__freq.read()
                        self.__freq.close()
                    # New, catch memory error for sockets
                    except MemoryError:
                        print 'Error: Memory low...!'
                        return data

                    # Update byte count
                    self.__bytes += len(data)
                    dmgr.update_bytes(self.__bytes)
                break
            except urllib2.HTTPError, e:

                try:
                    self.__error['number'], self.__error['msg'] = e
                except:
                    # Parse the string form, e.g. 'HTTP Error 404: Not Found'
                    errbasic, errdescn = (str(e)).split(':', 1)
                    parts = errbasic.strip().split()
                    self.__error['number'] = int(parts[-1])
                    self.__error['msg'] = errdescn.strip()

                if self.__error['msg']:
                    extrainfo(self.__error['msg'], '=> ',urltofetch)
                else:
                    extrainfo('HTTPError: => ',urltofetch)

                errnum = int(self.__error['number'])

                if errnum == 407: # Proxy authentication required
                    self.__proxy_query(1, 1)
                elif errnum == 503: # Service unavailable
                    rulesmgr.add_to_filter(urltofetch)
                    self.__error['fatal']=True                        
                elif errnum == 504: # Gateway timeout
                    rulesmgr.add_to_filter(urltofetch)
                    self.__error['fatal']=True                        
                elif errnum in range(500, 505): # Server error
                    self.__error['fatal']=True                                              
                elif errnum == 404:
                    # Link not found, this might
                    # be a file wrongly fetched as directory
                    # Add to filter
                    rulesmgr.add_to_filter(urltofetch)
                    self.__error['fatal']=True
                elif errnum == 401: # Site authentication required
                    self.__error['fatal']=True                                              
                    q=GetObject('trackerqueue')
                    # right now we terminate project for this error
                    # q.stopTrackers()
                    break

            except urllib2.URLError, e:

                errdescn = ''
                parts = []

                try:
                    self.__error['number'], self.__error['msg'] = e
                except:
                    # Fall back to parsing the string form of the
                    # error, which looks either like 'something: description'
                    # or like "(errno, 'description')"
                    try:
                        errbasic, errdescn = (str(e)).split(':', 1)
                        parts = errbasic.split()
                    except:
                        try:
                            errbasic, errdescn = (str(e)).split(',')
                            parts = errbasic.split('(')
                            errdescn = (errdescn.split("'"))[1]
                        except:
                            pass

                    try:
                        self.__error['number'] = int(parts[-1])
                    except:
                        pass

                    if errdescn:
                        self.__error['msg'] = errdescn

                if self.__error['msg']:
                    extrainfo(self.__error['msg'], '=> ',urltofetch)
                else:
                    extrainfo('URLError: => ',urltofetch)

                errnum = self.__error['number']
                if errnum == 10049 or errnum == 10061: # Winsock: address not available/connection refused (proxy error)
                    self.__proxy_query(1, 1)
                elif errnum == 10055:
                    # no buffer space available
                    self.network_conn.increment_socket_errors()
                    # If the number of socket errors is >= 4
                    # we decrease max connections by 1
                    sockerrs = self.network_conn.get_socket_errors()
                    if sockerrs>=4:
                        _cfg.connections -= 1
                        self.network_conn.decrement_socket_errors(4)

            except IOError, e:
                self.__error['number'] = 31
                self.__error['fatal']=True
                self.__error['msg'] = str(e)                    
                # Generated by invalid ftp hosts and
                # other reasons,
                # bug(url: http://www.gnu.org/software/emacs/emacs-paper.html)
                extrainfo(e ,'=> ',urltofetch)

            except ValueError, e:
                self.__error['number'] = 41
                self.__error['msg'] = str(e)                    
                extrainfo(e, '=> ',urltofetch)

            except AssertionError, e:
                self.__error['number'] = 51
                self.__error['msg'] = str(e)
                extrainfo(e ,'=> ',urltofetch)

            except socket.error, e:
                self.__error['msg'] = str(e)
                errmsg = self.__error['msg']

                extrainfo('Socket Error: ', errmsg,'=> ',urltofetch)

                if errmsg.lower().find('connection reset by peer') != -1:
                    # Connection reset by peer (socket error)
                    self.network_conn.increment_socket_errors()
                    # If the number of socket errors is >= 4
                    # we decrease max connections by 1
                    sockerrs = self.network_conn.get_socket_errors()

                    if sockerrs>=4:
                        _cfg.connections -= 1
                        self.network_conn.decrement_socket_errors(4)                        
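                # The throttling policy above, restated as a sketch
                # (names are illustrative; the real counters live on
                # the network connection object):
                #
                #   SOCK_ERR_LIMIT = 4
                #   if sock_errors >= SOCK_ERR_LIMIT:
                #       max_connections -= 1            # back off one connection
                #       sock_errors -= SOCK_ERR_LIMIT   # restart the error window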

            # attempt reconnect after some time
            time.sleep(self.__sleeptime)

        if url_obj:     
            url_obj.set_download_status(self.__error['number'])

        if data:
            self.__data = copy.copy(data)
        else:
            self.__data = ""

        return data


    def get__data(self):
        return self.__data

    def get_error(self):
        return self.__error
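    # Example of inspecting the error state after a fetch (a sketch;
    # 'number', 'msg' and 'fatal' are the keys the connect() method
    # above maintains, and `conn` is an assumed connector instance):
    #
    #   err = conn.get_error()
    #   if err['fatal']:
    #       print 'Fatal error %s: %s' % (err['number'], err['msg'])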

    # New functions added on Aug 23 2003 for the cache/update
    # feature

    # Begin new functions ...
    def set_content_info(self):
        """ Set the content information on the current
        url object """

        if self.__urlobject is None: return -1

        # get content length
        contentlen = self.get_content_length()
        # get content type
        contenttype = self.get_content_type()
        # set this on the url object
        self.__urlobject.set_url_content_info(contentlen, contenttype)

    def get_http_headers(self):
        """ Return all the http headers """
        
        return self.__freq.headers

    def get_cookies(self):
        """ Return the cookie related headers """

        # NOTE: This function returns the cookies
        # as a list.

        cookies=[]
        headers=self.get_http_headers()

        for k in headers.keys():
            # The cookie header key is of the form
            # 'set-cookie', case insensitive.
            if k.lower() == 'set-cookie':
                # found cookie header
                cookies.append({k : headers[k]})

        return cookies
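    # The Set-Cookie scan above, in isolation (a sketch over a plain
    # dictionary standing in for the real headers object):
    #
    #   headers = {'Content-Type': 'text/html',
    #              'Set-Cookie': 'sid=abc123; path=/'}
    #   cookies = [{k: headers[k]} for k in headers.keys()
    #              if k.lower() == 'set-cookie']
    #   # => [{'Set-Cookie': 'sid=abc123; path=/'}]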

    def fill_cookie_headers(self, request):
        """ This function looks up our cookie manager
        to find cookies which match the url of this
        connector, adding it to the request object headers """

        cookie_manager = GetObject('cookiestore')

        if cookie_manager is None or self.__urlobject is None: return -1            
        cookie_manager.add_cookie_header(request, self.__urlobject.get_full_domain())

    def write_cookies(self):
        """ Function to write cookies for urls. This
        function writes the cookie headers to a database
        using our own cookie manager object """

        # Check for a valid url object
        if self.__urlobject is None: return -1

        # Get cookie headers
        cookies = self.get_cookies()
        # if the list is empty, there are no cookies to set
        if len(cookies) == 0:
            return -1

        # Write the cookies to the CookieManager
        cookie_manager = GetObject('cookiestore')
        if cookie_manager is None: return -1

        url = self.__urlobject.get_full_url()

        for cookie in cookies:
            cookie_manager.set_cookie(cookie, url)

    def print_http_headers(self):
        """ Print the HTTP headers for this connection """

        print 'HTTP Headers '
        for k,v in self.get_http_headers().items():
            print k,'=> ', v
