""" HarvestManUrlConnector.py - Module to manage and retrieve data
    from an internet connection using urllib2. This software is
    part of the HARVESTMan(R) program.

    Author: Anand B Pillai (anandpillai at letterboxes dot org).

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Modification History
    ====================

    Jan  2 2004 Anand   1.4 bug fix version development started.
    Feb 10 2004 Anand   1.3.1 bug fix version released.
                        Added FileHandler handler for local files.
    April 20 2004 Anand 1.3.4 bug fix release. Fixed a bug with
                        configuring proxies.

    June 5-9    Anand   1.4   *Rewrote intranet handling routines.
                              There is no longer a flag specifying
                              intranet setup in the configuration file.
                              Instead the program determines whether the
                              url is in intranet/internet by trying to
                              resolve the hostname, depending on proxy.

                              *Reduced dependency of urlparser on
                              this class. We decide on directory/file like
                              urls by fetching the actual url here, which
                              is then set in the original url object.

                              *Rewrote arguments for connect(...) call.
                              Added an extra argument for url object.

                              *Merged connect routines for intranet/extranet
                              connections.

                              *Set download status in the url objects. This
                              is the error number, if any. 0 denotes successful
                              download.

                              *Prefixed urllib2 methods with the module
                              namespace instead of importing all of its
                              methods separately.

                              *Close the file-like object returned by
                              urlopen after extracting data.
                              
"""

import sys
import md5
import socket
import time
import copy
from threading import Lock, Condition, Event
from Queue import Queue, Empty, Full

import urllib2 
import urllib

from common import *

# HarvestManUrlParser module
from urlparser import HarvestManUrlParser, HarvestManUrlParserError
from cookiemgr import CookieManager

__protocols__=["http", "ftp"]

class HarvestManNetworkConnector:
    """ This class keeps the internet settings and configures
    the network. """
    
    def __init__(self):
        # use proxies flag
        self.__useproxy=0
        # check for ssl support in python
        self.__initssl=False
        # Number of socket errors
        self.__sockerrs = 0
        # Config object
        self.__cfg = GetObject('config')
        
        if hasattr(socket, 'ssl'):
            self.__initssl=True
            __protocols__.append("https")
            
        self.initialize_proxy()
        self.__configure()

    def initialize_proxy(self):

        proxystring = 'proxy:80'

        # proxy variables
        # dictionary of protocol:proxy values
        self.__proxydict = {"http" : proxystring,
                            "https" : proxystring,
                            "ftp"  : proxystring}
        # dictionary of protocol:proxy auth values
        self.__proxyauth = {"http" : '',
                            "https" : '',
                            "ftp" : '' }


    def set_useproxy(self, val=1):
        """ Set the value of use-proxy flag """
        
        self.__useproxy=val

    def set_ftp_proxy(self, proxyserver, proxyport, authinfo=(), encrypted=True):
        """ Set ftp proxy """

        if encrypted:
            self.__proxydict["ftp"] = bin_decrypt(proxyserver) + ':' + str(proxyport)
        else:
            self.__proxydict["ftp"] = proxyserver + ':' + str(proxyport)

        if authinfo:
            try:
                username, passwd = authinfo
            except ValueError:
                username, passwd = '', ''

            if encrypted:
                passwdstring= bin_decrypt(username) + ':' + bin_decrypt(passwd)
            else:
                passwdstring = username + ':' + passwd

            self.__proxyauth["ftp"] = passwdstring

    def set_https_proxy(self, proxyserver, proxyport, authinfo=(), encrypted=True):
        """ Set https(ssl) proxy  """

        if encrypted:
            self.__proxydict["https"] = bin_decrypt(proxyserver) + ':' + str(proxyport)
        else:
            self.__proxydict["https"] = proxyserver + ':' + str(proxyport)

        if authinfo:
            try:
                username, passwd = authinfo
            except ValueError:
                username, passwd = '', ''

            if encrypted:
                passwdstring= bin_decrypt(username) + ':' + bin_decrypt(passwd)
            else:
                passwdstring = username + ':' + passwd

            self.__proxyauth["https"] = passwdstring

    def set_http_proxy(self, proxyserver, proxyport, authinfo=(), encrypted=True):
        """ Set http proxy """

        if encrypted:
            self.__proxydict["http"] = bin_decrypt(proxyserver) + ':' + str(proxyport)
        else:
            self.__proxydict["http"] = proxyserver + ':' + str(proxyport)

        if authinfo:
            try:
                username, passwd = authinfo
            except ValueError:
                username, passwd = '', ''

            if encrypted:
                passwdstring= bin_decrypt(username) + ':' + bin_decrypt(passwd)
            else:
                passwdstring=username + ':' + passwd

            self.__proxyauth["http"] = passwdstring

    def set_proxy(self, server, port, authinfo=(), encrypted=True):
        """ Set generic (all protocols) proxy values.
        For most users, only this method will be called,
        rather than the specific method for each protocol,
        as proxies are normally shared for all tcp/ip protocols """

        # Modified Sep 02 2003 Anand
        # Modified method to call the specific protocol methods
        if encrypted:
            proxystring = bin_decrypt(server) + ':' + str(port)
        else:
            proxystring = str(server) + ':' + str(port)

        for p in __protocols__:
            # Look up the protocol-specific setter dynamically
            # (set_http_proxy, set_https_proxy or set_ftp_proxy)
            # and call it once with the same arguments.
            func = getattr(self, 'set_' + p + '_proxy')
            func(server, port, authinfo, encrypted)
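    # Illustrative usage (the server name and credentials below are
    # placeholders, not values shipped with HarvestMan):
    #
    #   connector.set_proxy('proxy.example.com', 3128,
    #                       authinfo=('user', 'secret'),
    #                       encrypted=False)
    #
    # With encrypted=False the strings are used verbatim; with the
    # default encrypted=True they are first passed through bin_decrypt().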

    def set_authinfo(self, username, passwd, encrypted=True):
        """ Set authentication information for proxy.
        Note: If this function is used all protocol specific
        authentication will be replaced by this authentication. """

        if encrypted:
            passwdstring = bin_decrypt(username) + ':' + bin_decrypt(passwd)
        else:
            passwdstring = username + ':' + passwd

        self.__proxyauth = {"http" : passwdstring,
                            "https" : passwdstring,
                            "ftp" : passwdstring }

    def configure_protocols(self):
        """ Just a wrapper """
        
        self.__configure_protocols()

    def configure_network(self):
        """ Just a wrapper """

        self.__configure_network()

    def __configure(self):
        """ Wrapping up wrappers """
        
        self.__configure_network()
        self.__configure_protocols()
        
    def __configure_network(self):
        """ Initialise network for the user """

        # First: Configuration of network (proxies/intranet etc)
        
        # Check for proxies in the config object
        if self.__cfg.proxy and not self.__cfg.intranet:
            self.set_useproxy()
            proxy = self.__cfg.proxy
            
            index = proxy.rfind(':')
            if index != -1:
                port = proxy[(index+1):].strip()
                server = proxy[:index]
                # strip off any leading 'http://' from the server string
                index = server.find('http://')
                if index != -1:
                    server = server[(index+7):]

                self.set_proxy(server, int(port))

            else:
                port = self.__cfg.proxyport
                server = self.__cfg.proxy
                self.set_proxy(server, int(port))

            # Set proxy username and password, if specified
            puser, ppasswd = self.__cfg.puser, self.__cfg.ppasswd
            if puser and ppasswd: self.set_authinfo(puser, ppasswd)
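        # Illustrative example of the parsing above (hypothetical config
        # values): a proxy entry of 'http://proxy.example.com:8080' yields
        # server='proxy.example.com' and port=8080, while a bare
        # 'proxy.example.com' falls back to the separate proxyport setting
        # from the config object.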


    def __configure_protocols(self):
        """ Configure protocol handlers """
        
        # Second: Configuration of protocol handlers.

        # TODO: Verify gopher protocol
        # TODO: Add CacheFTPHandler instead of FTPHandler.
        authhandler = urllib2.HTTPBasicAuthHandler()

        # Set the default socket timeout to the configured thread timeout
        # (socket.setdefaulttimeout is available from Python 2.3 onwards).
        if sys.version_info[:2] >= (2, 3):
            socket.setdefaulttimeout( self.__cfg.timeout )
            
        # If we are behind proxies/firewalls
        if self.__useproxy:
            # __proxyauth always contains its three keys, so check whether
            # any of the authentication strings is actually non-empty.
            if filter(None, self.__proxyauth.values()):
                httpproxystring = 'http://' + self.__proxyauth['http'] + '@' + self.__proxydict['http']
                ftpproxystring = 'http://' + self.__proxyauth['ftp'] + '@' + self.__proxydict['ftp']
                httpsproxystring = 'http://' + self.__proxyauth['https'] + '@' + self.__proxydict['https']
            else:
                httpproxystring = 'http://' + self.__proxydict['http']
                ftpproxystring = 'http://' + self.__proxydict['ftp']
                httpsproxystring = 'http://' + self.__proxydict['https']

            proxy_support = urllib2.ProxyHandler({"http"  : httpproxystring,
                                                  "https" : httpsproxystring,
                                                  "ftp"   : ftpproxystring})

            # build opener and install it
            if self.__initssl:
                opener = urllib2.build_opener(authhandler,
                                            proxy_support,
                                            urllib2.HTTPHandler,
                                            urllib2.CacheFTPHandler,
                                            urllib2.GopherHandler,
                                            urllib2.HTTPSHandler,
                                            urllib2.HTTPRedirectHandler,
                                            urllib2.FileHandler )
            else:
                opener = urllib2.build_opener(authhandler,
                                            proxy_support,
                                            urllib2.HTTPHandler,
                                            urllib2.CacheFTPHandler,
                                            urllib2.GopherHandler,
                                            urllib2.HTTPRedirectHandler,
                                            urllib2.FileHandler )

        else:
            # Direct connection to internet
            if self.__initssl:
                opener = urllib2.build_opener(authhandler,
                                            urllib2.HTTPHandler,
                                            urllib2.CacheFTPHandler,
                                            urllib2.HTTPSHandler,
                                            urllib2.HTTPRedirectHandler,
                                            urllib2.GopherHandler,
                                            urllib2.FileHandler )
            else:
                opener = urllib2.build_opener( authhandler,
                                             urllib2.HTTPHandler,
                                             urllib2.CacheFTPHandler,
                                             urllib2.HTTPRedirectHandler,
                                             urllib2.GopherHandler,
                                             urllib2.FileHandler )

        opener.addheaders = [ ('User-agent', GetObject('USER_AGENT')) ]
        urllib2.install_opener(opener)

        return 0
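        # Note: after install_opener() above, every subsequent call to
        # urllib2.urlopen() in this process is routed through the opener
        # built here, so the proxy settings and the custom User-agent
        # header apply to all downloads automatically.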

    # Get methods
    def get_useproxy(self):
        """ Find out if we are using proxies """

        return self.__useproxy
    
    def get_proxy_info(self):
        return (self.__proxydict, self.__proxyauth)

    def is_intranet(self):
        return self.__cfg.intranet

    def increment_socket_errors(self, val=1):
        self.__sockerrs += val

    def decrement_socket_errors(self, val=1):
        self.__sockerrs -= val
        
    def get_socket_errors(self):
        return self.__sockerrs
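# Minimal usage sketch (assumes the shared 'config' object has already been
# registered through the common module; that bootstrap is outside this file):
#
#   connector = HarvestManNetworkConnector()   # reads proxy settings
#   if connector.get_useproxy():
#       proxydict, proxyauth = connector.get_proxy_info()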
        
class HarvestManUrlConnector:
    """ Class which helps to connect to the internet """

    def __str__(self):
        return repr(self) # + str(self.__dict__)
        
    def __init__(self):
        """ Constructor for this class """

        # placeholder for the file-like object
        # returned by urllib2.urlopen(...)
        self.__freq = urllib2.Request('file://')
        # data downloaded
        self.__data = ''
        # error dictionary
        self.__error={ 'msg' : '',
                       'number': 0,
                       'fatal' : False
                       }
        # for keeping track of bytes downloaded
        self.__bytes = 0L
        # time to wait before reconnect
        # in case of failed connections
        self.__sleeptime = 0.5
        # local url object
        self.__urlobject = None
        # global network configurator
        self.network_conn = GetObject('connector')

    def __proxy_query(self, queryauth=1, queryserver=0):
