📄 zurllib.py
## zurllib.py
#
# This is (hopefully) a drop-in for urllib which will request gzip/deflate
# compression and then decompress the output if a compressed response is
# received while maintaining the API.
#
# by Robert Stone 2/22/2003
# extended by Matt Chisholm
# tracker announce --bind support added by Jeremy Evans 11/2005

import sys
import socket   # used by BindingHTTPConnection; also re-exported by the star imports below
import threading
import thread

from BitTorrent import PeerID
user_agent = PeerID.make_id()
del PeerID

import urllib2
OldOpenerDirector = urllib2.OpenerDirector

class MyOpenerDirector(OldOpenerDirector):
    def __init__(self):
        OldOpenerDirector.__init__(self)
        self.addheaders = [('User-agent', user_agent)]

urllib2.OpenerDirector = MyOpenerDirector
del urllib2

from httplib import HTTPConnection, HTTP
from urllib import *
from urllib2 import *
from gzip import GzipFile
from StringIO import StringIO
import pprint

DEBUG = False

url_socket_timeout = 30

http_bindaddr = None

# ow ow ow.
# this is here so we can track open http connections in our pending
# connection count. we have to buffer because maybe urllib connections
# start before rawserver does - hopefully not more than 10 of them!
#
# this can all go away when we use a reasonable http client library
# and the connections are managed inside rawserver
class PreRawServerBuffer(object):
    def __init__(self):
        self.pending_sockets = {}
        self.pending_sockets_lock = threading.RLock()

    def add_pending_connection(self, addr):
        # the XP connection rate limiting is unique at the IP level
        assert isinstance(addr, str)
        self.pending_sockets_lock.acquire()
        self.pending_sockets.setdefault(addr, 0)
        self.pending_sockets[addr] += 1
        self.pending_sockets_lock.release()

    def remove_pending_connection(self, addr):
        self.pending_sockets_lock.acquire()
        self.pending_sockets[addr] -= 1
        if self.pending_sockets[addr] <= 0:
            del self.pending_sockets[addr]
        self.pending_sockets_lock.release()

rawserver = PreRawServerBuffer()

def bind_tracker_connection(bindaddr):
    global http_bindaddr
    http_bindaddr = bindaddr

def set_zurllib_rawserver(new_rawserver):
    global rawserver
    old_rawserver = rawserver
    rawserver = new_rawserver
    # hand any connections opened before the real rawserver existed over to it
    while old_rawserver.pending_sockets:
        addr = old_rawserver.pending_sockets.keys()[0]
        new_rawserver.add_pending_connection(addr)
        old_rawserver.remove_pending_connection(addr)
    assert len(old_rawserver.pending_sockets) == 0

unsafe_threads = []

def add_unsafe_thread():
    global unsafe_threads
    unsafe_threads.append(thread.get_ident())

class BindingHTTPConnection(HTTPConnection):
    def connect(self):
        """Connect to the host and port specified in __init__."""
        ident = thread.get_ident()
        # never, ever, ever call urlopen from any of these threads
        assert ident not in unsafe_threads, \
            "You may not use urllib from this thread!"

        msg = "getaddrinfo returns an empty list"
        for res in socket.getaddrinfo(self.host, self.port, 0,
                                      socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            addr = sa[0]
            # the obvious multithreading problem is avoided by using locks.
            # the lock is only acquired during the function call, so there's
            # no danger of urllib blocking rawserver.
            rawserver.add_pending_connection(addr)
            try:
                self.sock = socket.socket(af, socktype, proto)
                self.sock.settimeout(url_socket_timeout)
                if http_bindaddr:
                    self.sock.bind((http_bindaddr, 0))
                if self.debuglevel > 0:
                    print "connect: (%s, %s)" % (self.host, self.port)
                self.sock.connect(sa)
            except socket.error, msg:
                if self.debuglevel > 0:
                    print 'connect fail:', (self.host, self.port)
                if self.sock:
                    self.sock.close()
                self.sock = None
            rawserver.remove_pending_connection(addr)

            if self.sock:
                break

        if not self.sock:
            raise socket.error, msg

class BindingHTTP(HTTP):
    _connection_class = BindingHTTPConnection

if sys.version_info >= (2, 4):
    BindingHTTP = BindingHTTPConnection

class HTTPContentEncodingHandler(HTTPHandler):
    """Inherit and add gzip/deflate/etc support to HTTP gets."""
    def http_open(self, req):
        # add the Accept-Encoding header to the request;
        # support gzip encoding (identity is assumed)
        req.add_header("Accept-Encoding", "gzip")
        if DEBUG:
            print "Sending:"
            print req.headers
            print "\n"
        fp = self.do_open(BindingHTTP, req)

        headers = fp.headers
        if DEBUG:
            pprint.pprint(headers.dict)
        url = fp.url

        resp = addinfourldecompress(fp, headers, url)
        if hasattr(fp, 'code'):
            resp.code = fp.code
        if hasattr(fp, 'msg'):
            resp.msg = fp.msg
        return resp

class addinfourldecompress(addinfourl):
    """Do gzip decompression if necessary. Do addinfourl stuff too."""
    def __init__(self, fp, headers, url):
        # we need to do something more sophisticated here to deal with
        # multiple values? What about other weird crap like q-values?
        # basically this only works for the most simplistic case and will
        # break in some other cases, but for now we only care about making
        # this work with the BT tracker so....
        if headers.has_key('content-encoding') and \
           headers['content-encoding'] == 'gzip':
            if DEBUG:
                print "Contents of Content-encoding: " + \
                      headers['Content-encoding'] + "\n"
            self.gzip = 1
            self.rawfp = fp
            fp = GzipStream(fp)
        else:
            self.gzip = 0
        return addinfourl.__init__(self, fp, headers, url)

    def close(self):
        self.fp.close()
        if self.gzip:
            self.rawfp.close()

    def iscompressed(self):
        return self.gzip

class GzipStream(StringIO):
    """Magically decompress a file object.

       This is not the most efficient way to do this, but GzipFile() wants
       to seek, etc, which won't work for a stream such as that from a
       socket. So we copy the whole shebang into a StringIO object,
       decompress that, then let people access the decompressed output as
       a StringIO object.

       The disadvantage is memory use and the advantage is random access.

       Will mess with fixing this later.
    """
    def __init__(self, fp):
        self.fp = fp

        # this is nasty and needs to be fixed at some point:
        # copy everything into a StringIO (compressed)
        compressed = StringIO()
        r = fp.read()
        while r:
            compressed.write(r)
            r = fp.read()

        # now, unzip (gz) the StringIO to a string
        compressed.seek(0, 0)
        gz = GzipFile(fileobj=compressed)
        str = ''
        r = gz.read()
        while r:
            str += r
            r = gz.read()

        # close our utility files
        compressed.close()
        gz.close()

        # init our StringIO selves with the string
        StringIO.__init__(self, str)
        del str

    def close(self):
        self.fp.close()
        return StringIO.close(self)

def test():
    """Test this module.

       At the moment this is lame.
    """
    print "Running unit tests.\n"

    def printcomp(fp):
        try:
            if fp.iscompressed():
                print "GET was compressed.\n"
            else:
                print "GET was uncompressed.\n"
        except:
            print "no iscompressed function! this shouldn't happen"

    print "Trying to GET a compressed document...\n"
    #fp = urlopen('http://a.scarywater.net/hng/index.shtml')
    fp = urlopen('http://hotornot.com')
    print len(fp.read())
    printcomp(fp)
    fp.close()

    print "Trying to GET another compressed document...\n"
    fp = urlopen('http://bittorrent.com')
    print len(fp.read())
    printcomp(fp)
    fp.close()

    print "Trying to GET an unknown document...\n"
    fp = urlopen('http://www.otaku.org/')
    print len(fp.read())
    printcomp(fp)
    fp.close()

#
# Install the HTTPContentEncodingHandler that we've defined above.
#
install_opener(build_opener(HTTPContentEncodingHandler, ProxyHandler({})))

if __name__ == '__main__':
    test()