connector.py (web spider module, Python)

        print '\n'

    def get_content_length(self):

        for k in self.__freq.headers.keys():
            if k.lower() == 'content-length':
                # normalize to int to match the len() fallback below
                return int(self.__freq.headers[k])

        return len(self.__data)

    def get_last_modified_time(self):

        s=""
        for k in self.__freq.headers.keys():
            if k.lower() == 'last-modified':
                s=self.__freq.headers[k]
                break

        return s
    
    def get_content_type(self):

        contenttype='text/html'
        for k in self.__freq.headers.keys():
            if k.lower() == 'content-type':
                contenttype = self.__freq.headers[k]

        return contenttype
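
    # The three getters above share the same case-insensitive scan of the
    # response headers. A hypothetical helper (not part of the original
    # class) that factors out the pattern could look like this:
    def _get_header(self, name, default=''):
        """ Case-insensitive lookup of a response header (sketch) """

        for k in self.__freq.headers.keys():
            if k.lower() == name.lower():
                return self.__freq.headers[k]

        return default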

    # End New functions ...

    def __write_url(self, filename):
        """ Write downloaded data to the passed file """

        if self.__data=='': return 0

        try:
            f=open(filename, 'wb')
            f.write(self.__data)
            f.close()
        except IOError,e:
            debug('IO Exception' , e)
            return 0

        cfg=GetObject('config')

        if cfg.checkfiles:
            verify=self.verify_checksum(filename)
        else:
            verify=1

        # Find out if we need to rename the file. This can be
        # useful for dynamically created images, which normally
        # have a misleading extension (e.g. image.php => image.gif)

        if cfg.renamefiles:
            # try renaming; assume rename() returns the new
            # filename on success (empty/false otherwise)
            modfilename = rename(filename)
            if modfilename:
                moreinfo('Renamed ', filename, 'to ', modfilename)
                # the caller needs to check the return
                # value to find if the filename has changed
                if verify: return 2
                else: return verify
            
        return verify

    def save_url(self, urlObj, connect=True):
        """ Download data for the url object <urlObj> and
        write its file """

        self.__urlobject = urlObj
        res = self.__save_url_file(connect)

        return res

    def check_url_exists(self, url):
        """ Check a url existence (Check for 404 errors) """

        dmgr = GetObject('datamanager')

        if dmgr.is_a_dead_link(url):
            return False
            
        self.connect(url, None, True, True, 0)
        if self.__error['number'] == 404:
            dmgr.update_dead_links(url)
            return False

        dmgr.update_valid_links(url)
        return True
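
    # A hypothetical standalone variant (not in the original class): the
    # same 404 probe done directly with urllib2 from the Python 2 stdlib,
    # without the dead-link cache used by check_url_exists() above.
    def _url_exists_urllib2(self, url):
        import urllib2
        try:
            urllib2.urlopen(url)
            return True
        except urllib2.HTTPError, e:
            # any HTTP error other than 404 means the server answered
            return e.code != 404
        except urllib2.URLError:
            # could not connect at all
            return False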
        
    def __save_url_file(self, connect=True):
        """ Download data for the current url object and
        write it to its local file """

        cfg = GetObject('config')
        
        url = self.__urlobject.get_full_url()

        self.connect(url, self.__urlobject, True, cfg.cookies, cfg.retryfailed)
            
        dmgr=GetObject('datamanager')

        # Find out if we need to update this file
        # by checking with the cache.
        filename = self.__urlobject.get_full_filename()

        # Apply word filter
        if not self.__urlobject.is_start_url():
            if self.__urlobject.is_webpage() and not GetObject('ruleschecker').apply_word_filter(self.__data):
                extrainfo("Word filter prevents download of url =>", url)
                return 4

        # Get last modified time
        timestr = self.get_last_modified_time()

        # When checking the project cache, prefer the
        # last-modified time if one is available.
        
        lmt = -1
        if timestr:
            try:
                lmt = time.mktime( time.strptime(timestr, "%a, %d %b %Y %H:%M:%S GMT"))
            except ValueError, e:
                debug(e)

        if lmt != -1:
            if dmgr.is_url_uptodate(url, filename, lmt):
                # No need to download
                extrainfo("Project cache is uptodate =>", url)
                return 3
        if dmgr.is_url_cache_uptodate(url, filename, self.get_content_length(), self.__data):
            # No need to download
            extrainfo("Project cache is uptodate =>", url)
            return 3

        res=0
        if self.__data:
            if dmgr.create_local_directory(self.__urlobject) == 0:
                extrainfo('Writing file ', filename)
                res=self.__write_url( filename )
        else:
            extrainfo("Error in getting data for", url)

        return res
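
    # Sketch of the freshness test used above (a hypothetical helper, not
    # in the original class): parse the Last-Modified header and compare
    # it against the local copy's mtime to decide whether to re-download.
    def _is_local_copy_stale(self, filename, timestr):
        import os, time
        try:
            lmt = time.mktime(time.strptime(timestr, "%a, %d %b %Y %H:%M:%S GMT"))
        except ValueError:
            # unparsable header => assume stale
            return True
        try:
            return os.path.getmtime(filename) < lmt
        except OSError:
            # no local copy yet
            return True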

    def url_to_file(self, url, filename):
        """ Save the contents of this url <url> to the file <filename>.
        This is a function used by the test code only """

        self.connect( url )
        dmgr=GetObject('datamanager')

        if self.__data:
            print '*------------------------------------------------------------*\n'
            print 'Data fetched from ',url
            res=self.__write_url( filename )
            if res:
                print 'Data written to file ', filename, '\n'
                return res
        else:
            print 'Error in fetching data from ',url ,'\n'

        return 0

    def verify_checksum(self, filename):
        """ Verify data written to file using md5 checksum """

        m1=md5.new()
        m1.update(self.__data)
        mdigest1=m1.digest()
        mdigest2=''

        m2=md5.new()
        try:
            m2.update(open(filename, 'rb').read())
        except IOError:
            # file missing or unreadable
            return 0

        mdigest2=m2.digest()
        # compare the 2 digests
        if mdigest1 == mdigest2:
            # file was written correctly
            return 1
        else:
            # there was an error in writing the file
            return 0
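
    # Note: the md5 module used above is deprecated since Python 2.5.
    # A hashlib-based equivalent (a sketch, assuming Python >= 2.5):
    def _verify_checksum_hashlib(self, filename):
        import hashlib
        try:
            ondisk = hashlib.md5(open(filename, 'rb').read()).digest()
        except IOError:
            return 0
        # 1 if the file on disk matches the downloaded data, else 0
        return int(hashlib.md5(self.__data).digest() == ondisk)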

    def get_data(self):
        return self.__data
    
    def get__error(self):
        """ Return last network error code """

        return self.__error

    def get__bytes(self):
        return self.__bytes

class HarvestManUrlConnectorFactory:
    """ This is the control object + factory for connectors """

    def __init__(self, maxsize):
        self.__connectors = []
        self.__requests = {}
        self.__bytes = {}
        self.q = Queue(maxsize + 1)
        self.__maxsz = maxsize
        self.lock1 = Condition(Lock())
        self.lock2 = Condition(Lock())
        self._evt = Event()
        self._evt.set()
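
        # Synchronization layout (summary comments, added for clarity):
        #  - self.q     : bounded queue; put/get in create_connector()
        #                 and remove_connector() cap the total number of
        #                 live connectors at maxsize
        #  - self.lock1 : guards the __connectors list
        #  - self.lock2 : guards the per-server __requests counters
        #  - self._evt  : cleared when a server hits its request limit,
        #                 making add_request() callers wait until set again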
        
    def get_number_of_connectors(self):
        return len(self.__connectors)
    
    def create_connector(self, server):

        # The bounded queue below caps the total number of
        # active connectors; the per-server request limit is
        # enforced separately in add_request().

        # This call will block if the queue size limit
        # is reached
        self.q.put(1)

        try:
            self.lock1.acquire()
            try:
                connector = HarvestManUrlConnector()
                self.__connectors.append(connector)
                return connector
            except Exception, e:
                print e
        finally:
            self.lock1.release()
        
    def remove_connector(self, connector, server):

        # We don't want this call to block,
        # since the queue can be empty
        try:
            self.q.get(0)
        except Empty:
            pass

        try:
            self.lock1.acquire()
            try:
                self.__connectors.remove(connector)
            except Exception, e:
                print e
        finally:
            self.lock1.release()

    def add_request(self, server):

        cfg = GetObject('config')

        try:
            # Wait on the event object
            # This will block the thread calling this
            # method if the request limit has been reached
            self._evt.wait()
            # Acquire lock
            self.lock2.acquire()
            # If request limit reached, clear the event object
            currval = self.__requests.get(server, 0)
            if currval >= cfg.requests:
                self._evt.clear()
            else:
                # Otherwise, set the event object's flag
                # if it is cleared, (This will wake up
                # all threads waiting on this event)
                if not self._evt.isSet():
                    self._evt.set()
                    
                self.__requests[server] = currval + 1
                # print 'Adding requests to ', server, currval + 1
        finally:
            # Release lock
            self.lock2.release()

    def remove_request(self, server):

        cfg = GetObject('config')

        currval=0
        try:
            # Acquire lock
            self.lock2.acquire()

            try:
                currval = self.__requests.get(server, 0)
                if currval:
                    self.__requests[server] = currval - 1
            except KeyError, e:
                print e
                return None

            # Set the event object's flag
            # if it is cleared, if the request
            # count falls below critical
            if currval < cfg.requests:
                if not self._evt.isSet():
                    self._evt.set()
                
            # print 'Removing requests from ', server, self.__requests.get(server, 0)
        finally:
            # Release lock          
            self.lock2.release()
            
    def get_requests(self, server):

        try:
            self.lock2.acquire()
            return self.__requests.get(server, 0)
        finally:
            self.lock2.release()
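
# A minimal usage sketch for the factory (hypothetical driver code, not
# part of HarvestMan itself): each caller registers a request, takes a
# connector, uses it, and hands both back, so the bounded queue inside
# create_connector() caps how many connectors are alive at once.
def _example_factory_usage(urls, maxconn=5):
    factory = HarvestManUrlConnectorFactory(maxconn)
    for url in urls:
        server = url.split('/')[2]          # crude host extraction
        factory.add_request(server)
        conn = factory.create_connector(server)
        try:
            conn.url_to_file(url, server + '.html')
        finally:
            factory.remove_connector(conn, server)
            factory.remove_request(server)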
        
# test code
if __name__=="__main__":

    from HarvestManGlobals import *

    Initialize()

    conn = HarvestManUrlConnector()
    # Note: this test works only for a client
    # directly connected to the internet. If you
    # are behind a proxy, add the proxy setup for
    # the HarvestManUrlConnector class here.

    # FIXME: I need to move this initialize to somewhere else!
    conn.initialize()
    conn.configure()

    # Check for http connections
    print 'Testing HTTP connections...'
    conn.url_to_file('http://www.python.org/index.html', 'python.org-index.html')
    # print the HTTP headers
    conn.print_http_headers()
    
    conn.url_to_file('http://www.rediff.com', 'rediff.com-index.html')
    # print the HTTP headers
    conn.print_http_headers()
    
    conn.url_to_file('http://www.j3d.org', 'j3d-org.index.html')
    # print the HTTP headers
    conn.print_http_headers()
    
    conn.url_to_file('http://www.yahoo.com', 'yahoo-com.index.html')
    # print the HTTP headers
    conn.print_http_headers()   

    # Check for ftp connections
    # print 'Testing FTP connections...'  
    # conn.url_to_file('ftp://ftp.gnu.org', 'ftp.gnu.org-index.html')

    # check for Cookies support
    print 'Testing cookies...'
    conn.url_to_file('http://www.playboy.com', 'pb-com.index.html')
    # print the HTTP headers
    conn.print_http_headers()   
    for cookie in conn.get_cookies():
        print cookie
