⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 connector.py

📁 Harvestman-最新版本
💻 PY
📖 第 1 页 / 共 5 页
字号:
# -- coding: utf-8
""" connector.py - Module to manage and retrieve data
    from an internet connection using urllib2. This module is
    part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Modification History
    ====================

    Aug 16 06         Restarted dev cycle. Fixed 2 bugs with 404
                      errors, one with setting directory URLs
                      and another with re-resolving URLs.
    Feb 8 2007        Added hooks support.
    Mar 5 2007        Modified cache check logic slightly to add
                      support for HTTP 304 errors. HarvestMan will
                      now use HTTP 304 if caching is enabled and
                      we have data cache for the URL being checked.
                      This adds true server-side cache check.
                      Older caching logic retained as fallback.
    Mar 7 2007        Added HTTP compression (gzip) support.
    Mar 8 2007        Added connect2 method for grabbing URLs.
                      Added interactive progress bar for connect2 method.
                      Improved interactive progress bar to resize
                      with changing size of terminal.
    Mar 9 2007        Made progress bar use Progress class borrowed
                      from SMART package manager (Thanks to Vaibhav
                      for pointing this out!)
    Mar 14 2007       Completed implementation of multipart with
                      range checks and all.
    Mar 26 2007       Finished implementation of multipart, integrating
                      with the crawler pieces. Resuming of URLs and
                      caching changes are pending.
    April 20 2007  Anand Added force-splitting option for hget.
    April 30 2007  Anand Using datetime module to convert seconds to
                         hh:mm:ss display.
                         HarvestManFileObject objects not recreated when a lost
                         connection is resumed, instead new data is
                         added to existing data, by adjusting byte range
                         if necessary.
    Aug 14 2007    Anand Fixed a bug with download after querying a server
                         for multipart download abilities. Also split
                         _write_url function and rewrote it.
    Aug 22 2007    Anand MyRedirectHandler is buggy - replaced with
                         urllib2.HTTPRedirectHandler.
    Mar 07 2008    Anand Made connect to create HEAD request (instead of 'GET')
                         when either last modified time or etag is given. Added etag
                         support to connect and HarvestMan cache.

    Copyright (C) 2004 Anand B Pillai.

"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import sys
import socket
import time
import datetime
import threading
import urllib2
import urlparse
import gzip
import cStringIO
import os
import shutil
import glob
import random
import base64
import sha
import weakref
import getpass
import cookielib

from httplib import BadStatusLine

from harvestman.lib import document
from harvestman.lib import urlparser
from harvestman.lib.methodwrapper import MethodWrapperMetaClass
from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.spincursor import InfiniteSpinCursor
from harvestman.lib.common import keepalive

# Defining pluggable functions
__plugins__ = { 'save_url_plugin': 'HarvestManUrlConnector:save_url' }

# Defining functions with callbacks
__callbacks__ = { 'connect_callback' : 'HarvestManUrlConnector:connect' }

__protocols__=["http", "ftp"]

# Error Macros with arbitrary error numbers
URL_IO_ERROR = 31
URL_BADSTATUSLINE = 41
URL_TYPE_ERROR = 51
URL_VALUE_ERROR = 61
URL_ASSERTION_ERROR = 71
URL_SOCKET_ERROR = 81
URL_SOCKET_TIMEOUT = 91
URL_GENERAL_ERROR = 101
FILEOBJECT_EXCEPTION = 111

TEST = False


class HeadRequest(urllib2.Request):
    """ A request class which performs a HEAD request """

    def get_method(self):
        """ Return the HTTP method name used for this request """

        return 'HEAD'


class HarvestManFileObjectException(Exception):
    """ Exception class for HarvestManFileObject class """
    pass


class HarvestManFileObject(threading.Thread):
    """ A class which imitates a file object. This wraps
    around the file object returned by urllib2 and provides
    features such as block reading, throttling and a progress
    bar """

    # Class level attributes used for multipart
    ORIGLENGTH = 0
    START_TIME = 0.0
    CONTENTLEN = []
    MULTIPART = False
    NETDATALEN = 0

    def __init__(self, fobj, filename, clength, mode = 0, bwlimit = 0):
        """ Overloaded __init__ method

        fobj     - The file-like object to read data from.
        filename - Path of the temporary file; it is opened for
                   binary writing only when mode is
                   CONNECTOR_DATA_MODE_FLUSH.
        clength  - Expected content length (converted with int()).
        mode     - Data mode, see the comment on self._mode below.
        bwlimit  - Bandwidth limit in bytes/sec; a false value
                   disables throttling.
        """

        self._fobj = fobj
        self._data = ''
        self._clength = int(clength)
        self._start = 0.0
        # Set to True when reading is finished (end of data or error)
        self._flag = False
        # Mode: 0 => flush data to file (default)
        #     : 1 => keep data in memory
        self._mode = mode
        self._filename = filename
        if self._mode == CONNECTOR_DATA_MODE_FLUSH:
            self._tmpf = open(filename, 'wb')
        else:
            self._tmpf = None
        # Content-length so far
        self._contentlen = 0
        self._index = 0
        # Initialized flag
        self._init = False
        # Last error
        self._lasterror = None
        # Bandwidth limit as bytes/sec
        self._bwlimit = bwlimit
        # Number of bytes requested per read; adjusted by throttle()
        self._bs = 4096
        threading.Thread.__init__(self, None, None, 'data reader')

    def initialize(self):
        """ Initialize before using an instance of this class.
        This methods sets the start time and the init flag """

        self._start = time.time()
        self._init = True

    def is_initialized(self):
        """ Returns the init flag """

        return self._init

    def set_fileobject(self, fileobj):
        """ Setter method for the encapsulated file object """

        self._fobj = fileobj

    def throttle(self, bytecount, start_time, factor):
        """ Throttle to fall within limits of specified download speed

        bytecount  - Number of bytes downloaded so far.
        start_time - Time (seconds since epoch) the download started.
        factor     - Multiplier applied to the computed time difference.

        Sleeps when the download is ahead of the bandwidth limit and
        enlarges the read block size when it is behind. Callers are
        expected to call this only when self._bwlimit is non-zero,
        since it is used as a divisor.
        """

        # Time the data "should" have taken at the limit, minus the
        # time actually elapsed
        diff = float(bytecount)/self._bwlimit - (time.time() - start_time)
        diff = factor*diff/HarvestManUrlConnectorFactory.connector_count
        # We need to sleep. But a time.sleep does waste raw CPU
        # cycles. Still there does not seem to be an option here
        # since we cannot use SleepEvent class here as there could
        # be many connectors at any given time and hence the threading
        # library may not be able to create so many distinct Event
        # objects...
        if diff>0:
            # We are 'ahead' of the required bandwidth, so sleep
            # the time difference off.
            if self._bs>=256:
                self._bs -= 128
            time.sleep(diff)
        elif diff<0:
            # We are behind the required bandwidth, so read the
            # additional data
            self._bs += int(self._bwlimit*abs(diff))

    def run(self):
        """ Overloaded run method """

        self.initialize()
        self.read()

    def _read_block(self, dmgr, start_time, tfactor):
        """ Read one block from the wrapped file object and account for it.

        Returns False when the end of data is reached (the finished flag
        is set and, in flush mode, the temporary file is closed) and
        True when a non-empty block was read. Exceptions raised by the
        underlying read are propagated to the caller.
        """

        block = self._fobj.read(self._bs)
        if block=='':
            self._flag = True
            # Close the file
            if self._mode==CONNECTOR_DATA_MODE_FLUSH:
                self.close()
            return False

        self._data = self._data + block
        self._contentlen += len(block)
        if self._bwlimit:
            self.throttle(dmgr.bytes, start_time, tfactor)

        # Flush data to disk
        if self._mode==CONNECTOR_DATA_MODE_FLUSH:
            self.flush()

        return True

    def read(self):
        """ Read blocks until end of data or until an error occurs.

        Errors are not raised; the exception object is stored and can
        be fetched with get_lasterror(). The wrapped file object is
        closed when the loop ends.
        """

        # NOTE(review): 'objects' is not defined in this listing;
        # presumably it comes from one of the star imports - confirm.
        dmgr = objects.datamgr
        config = objects.config
        start_time = config.starttime
        tfactor = config.throttlefactor

        while not self._flag:
            try:
                if not self._read_block(dmgr, start_time, tfactor):
                    break
            except Exception:
                # sys.exc_info() is used instead of 'except X, e' so that
                # the syntax is accepted by every Python version.
                # socket.error is covered by this handler as well.
                e = sys.exc_info()[1]
                self._flag = True
                self._lasterror = e
                break

        self._fobj.close()

    def readNext(self):
        """ Method which reads the next block of data from the URL

        Returns False when the end of data is reached; returns None
        after a block has been read successfully. On any error the
        wrapped file object is closed and HarvestManFileObjectException
        is raised with the error text.
        """

        dmgr = objects.datamgr
        config = objects.config
        start_time = config.starttime
        tfactor = config.throttlefactor

        try:
            if not self._read_block(dmgr, start_time, tfactor):
                return False
        except Exception:
            e = sys.exc_info()[1]
            self._fobj.close()
            raise HarvestManFileObjectException(str(e))

    def flush(self):
        """ Flushes data to the temporary file on disk """

        self._tmpf.write(self._data)
        self._data = ''

    def close(self):
        """ Closes the temporary file object """

        # In memory mode no temporary file was opened
        if self._tmpf is not None:
            self._tmpf.close()

    def get_tmpfile(self):
        """ Returns the temporary file object (None in memory mode) """

        return self._tmpf

    def get_lasterror(self):
        """ Returns the last error object """

        return self._lasterror

    def get_info(self):
        """ Returns percentage, data downloaded, bandwidth and estimated time to
        complete as a tuple """

        curr = time.time()
        per, pertotal, bandwidth, l, eta = -1, -1, 0, 0, -1

        if not self.__class__.MULTIPART:
            if self._clength:
                pertotal = float(100.0*self._contentlen)/float(self._clength)

            l = self._contentlen
            self.__class__.NETDATALEN = self._contentlen

            per = pertotal

            if curr>self._start:
                bandwidth = float(l)/float(curr - self._start)

            if bandwidth and self._clength:
                eta = int((self._clength - l)/float(bandwidth))
        else:
            kls = self.__class__
            # NOTE(review): the listing this file was taken from ends here
            # (page 1 of 5). The rest of this method, including its return
            # statement, is not available and must be restored from the
            # original source.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -