# connector.py
# -- coding: utf-8""" connector.py - Module to manage and retrieve data from an internet connection using urllib2. This module is part of the HarvestMan program. Author: Anand B Pillai <abpillai at gmail dot com> For licensing information see the file LICENSE.txt that is included in this distribution. Modification History ==================== Aug 16 06 Restarted dev cycle. Fixed 2 bugs with 404 errors, one with setting directory URLs and another with re-resolving URLs. Feb 8 2007 Added hooks support. Mar 5 2007 Modified cache check logic slightly to add support for HTTP 304 errors. HarvestMan will now use HTTP 304 if caching is enabled and we have data cache for the URL being checked. This adds true server-side cache check. Older caching logic retained as fallback. Mar 7 2007 Added HTTP compression (gzip) support. Mar 8 2007 Added connect2 method for grabbing URLs. Added interactive progress bar for connect2 method. Improved interactive progress bar to resize with changing size of terminal. Mar 9 2007 Made progress bar use Progress class borrowed from SMART package manager (Thanks to Vaibhav for pointing this out!) Mar 14 2007 Completed implementation of multipart with range checks and all. Mar 26 2007 Finished implementation of multipart, integrating with the crawler pieces. Resuming of URLs and caching changes are pending. April 20 2007 Anand Added force-splitting option for hget. April 30 2007 Anand Using datetime module to convert seconds to hh:mm:ss display. HarvestManFileObject obejcts not recreated when a lost connection is resumed, instead new data is added to existing data, by adjusting byte range if necessary. Aug 14 2007 Anand Fixed a bug with download after querying a server for multipart download abilities. Also split _write_url function and rewrote it. Aug 22 2007 Anand MyRedirectHandler is buggy - replaced with urllib2.HTTPRedirectHandler. 
Mar 07 2008 Anand Made connect to create HEAD request (instead of 'GET') when either last modified time or etag is given. Added etag support to connect and HarvestMan cache. Copyright (C) 2004 Anand B Pillai. """__version__ = '2.0 b1'__author__ = 'Anand B Pillai'import sysimport socketimport timeimport datetimeimport threadingimport urllib2 import urlparseimport gzipimport cStringIOimport osimport shutilimport globimport randomimport base64import shaimport weakrefimport getpassimport cookielibfrom httplib import BadStatusLinefrom harvestman.lib import documentfrom harvestman.lib import urlparserfrom harvestman.lib.methodwrapper import MethodWrapperMetaClassfrom harvestman.lib.common.common import *from harvestman.lib.common.macros import *from harvestman.lib.common.spincursor import InfiniteSpinCursorfrom harvestman.lib.common import keepalive# Defining pluggable functions__plugins__ = { 'save_url_plugin': 'HarvestManUrlConnector:save_url' }# Defining functions with callbacks__callbacks__ = { 'connect_callback' : 'HarvestManUrlConnector:connect' }__protocols__=["http", "ftp"]# Error Macros with arbitrary error numbersURL_IO_ERROR = 31URL_BADSTATUSLINE = 41URL_TYPE_ERROR = 51URL_VALUE_ERROR = 61URL_ASSERTION_ERROR = 71URL_SOCKET_ERROR = 81URL_SOCKET_TIMEOUT = 91URL_GENERAL_ERROR = 101FILEOBJECT_EXCEPTION = 111TEST = Falseclass HeadRequest(urllib2.Request): """ A request class which performs a HEAD request """ def get_method(self): return 'HEAD' class HarvestManFileObjectException(Exception): """ Exception class for HarvestManFileObject class """ passclass HarvestManFileObject(threading.Thread): """ A class which imitates a file object. 
This wraps around the file object returned by urllib2 and provides features such as block reading, throttling and a progress bar """ # Class level attributes used for multipart ORIGLENGTH = 0 START_TIME = 0.0 CONTENTLEN = [] MULTIPART = False NETDATALEN = 0 def __init__(self, fobj, filename, clength, mode = 0, bwlimit = 0): """ Overloaded __init__ method """ self._fobj = fobj self._data = '' self._clength = int(clength) self._start = 0.0 self._flag = False # Mode: 0 => flush data to file (default) # : 1 => keep data in memory self._mode = mode self._filename = filename if self._mode == CONNECTOR_DATA_MODE_FLUSH: self._tmpf = open(filename, 'wb') else: self._tmpf = None # Content-length so far self._contentlen = 0 self._index = 0 # Initialized flag self._init = False # Last error self._lasterror = None # Bandwidth limit as bytes/sec self._bwlimit = bwlimit self._bs = 4096 threading.Thread.__init__(self, None, None, 'data reader') def initialize(self): """ Initialize before using an instance of this class. This methods sets the start time and the init flag """ self._start = time.time() self._init = True def is_initialized(self): """ Returns the init flag """ return self._init def set_fileobject(self, fileobj): """ Setter method for the encapsulated file object """ self._fobj = fileobj def throttle(self, bytecount, start_time, factor): """ Throttle to fall within limits of specified download speed """ diff = float(bytecount)/self._bwlimit - (time.time() - start_time) diff = factor*diff/HarvestManUrlConnectorFactory.connector_count # We need to sleep. But a time.sleep does waste raw CPU # cycles. Still there does not seem to be an option here # since we cannot use SleepEvent class here as there could # be many connectors at any given time and hence the threading # library may not be able to create so many distinct Event # objects... # print 'Diff=>',diff if diff>0: # We are 'ahead' of the required bandwidth, so sleep # the time difference off. 
if self._bs>=256: self._bs -= 128 time.sleep(diff) elif diff<0: # We are behind the required bandwidth, so read the # additional data self._bs += int(self._bwlimit*abs(diff)) def run(self): """ Overloaded run method """ self.initialize() self.read() def read(self): reads = 0 dmgr = objects.datamgr config = objects.config start_time = config.starttime tfactor = config.throttlefactor while not self._flag: try: block = self._fobj.read(self._bs) if block=='': self._flag = True # Close the file if self._mode==CONNECTOR_DATA_MODE_FLUSH: self.close() break else: reads += 1 self._data = self._data + block self._contentlen += len(block) if self._bwlimit: self.throttle(dmgr.bytes, start_time, tfactor) # Flush data to disk if self._mode==CONNECTOR_DATA_MODE_FLUSH: self.flush() except socket.error, e: self._flag = True self._lasterror = e break except Exception, e: self._flag = True self._lasterror = e break self._fobj.close() def readNext(self): """ Method which reads the next block of data from the URL """ dmgr = objects.datamgr config = objects.config start_time = config.starttime tfactor = config.throttlefactor try: block = self._fobj.read(self._bs) if block=='': self._flag = True # Close the file if self._mode==CONNECTOR_DATA_MODE_FLUSH: self.close() return False else: self._data = self._data + block self._contentlen += len(block) if self._bwlimit: self.throttle(dmgr.bytes, start_time, tfactor) # Flush data to disk if self._mode==CONNECTOR_DATA_MODE_FLUSH: self.flush() except socket.error, e: self._fobj.close() raise HarvestManFileObjectException, str(e) except Exception, e: self._fobj.close() raise HarvestManFileObjectException, str(e) def flush(self): """ Flushes data to the temporary file on disk """ self._tmpf.write(self._data) self._data = '' def close(self): """ Closes the temporary file object """ self._tmpf.close() def get_tmpfile(self): return self._tmpf def get_lasterror(self): """ Returns the last error object """ return self._lasterror def get_info(self): """ 
Returns percentage, data downloaded, bandwidth and estimated time to complete as a tuple """ curr = time.time() per, pertotal, bandwidth, l, eta = -1, -1, 0, 0, -1 if not self.__class__.MULTIPART: if self._clength: pertotal = float(100.0*self._contentlen)/float(self._clength) l = self._contentlen self.__class__.NETDATALEN = self._contentlen per = pertotal if curr>self._start: bandwidth = float(l)/float(curr - self._start) if bandwidth and self._clength: eta = int((self._clength - l)/float(bandwidth)) else: kls = self.__class__
# NOTE: capture truncated here - the remainder of HarvestManFileObject.get_info
# and the rest of connector.py were not included in this extraction.