# connector.py
# Imports needed by the code in this excerpt (the original module
# header is not shown; GetObject, debug, extrainfo, moreinfo and
# rename come from HarvestMan's own support modules):
import time
import md5
from threading import Condition, Lock, Event
from Queue import Queue, Empty

# The listing resumes inside the HarvestManUrlConnector class; the
# line below is the tail of a method whose beginning is not shown.
        print '\n'
    def get_content_length(self):
        """ Return the Content-Length header value, falling
        back to the size of the downloaded data """
        for k in self.__freq.headers.keys():
            if k.lower() == 'content-length':
                return self.__freq.headers[k]
        return len(self.__data)
    def get_last_modified_time(self):
        """ Return the Last-Modified header value, or an
        empty string if the header is absent """
        s = ""
        for k in self.__freq.headers.keys():
            if k.lower() == 'last-modified':
                s = self.__freq.headers[k]
                break
        return s
    def get_content_type(self):
        """ Return the Content-Type header value, defaulting
        to 'text/html' if the header is absent """
        contenttype = 'text/html'
        for k in self.__freq.headers.keys():
            if k.lower() == 'content-type':
                contenttype = self.__freq.headers[k]
                break
        return contenttype
    # End new functions ...
    def __write_url(self, filename):
        """ Write downloaded data to the passed file """
        if self.__data == '': return 0
        try:
            f = open(filename, 'wb')
            f.write(self.__data)
            f.close()
        except IOError, e:
            debug('IO Exception', e)
            return 0
        cfg = GetObject('config')
        if cfg.checkfiles:
            verify = self.verify_checksum(filename)
        else:
            verify = 1
        # Find out if we need to rename files; this can be
        # useful for dynamically created images, which will
        # normally have an invalid extension
        # (image.php => image.gif).
        if cfg.renamefiles:
            # Try renaming; rename() is assumed to return the
            # new filename, or a false value on failure.
            modfilename = rename(filename)
            if modfilename:
                moreinfo('Renamed', filename, 'to', modfilename)
                # The caller needs to check the return value
                # to find out if the filename has changed.
                if verify: return 2
                else: return verify
        return verify
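    # __write_url return codes, as used by callers:
    #   0 - no data, I/O error, or checksum mismatch
    #   1 - file written (and verified, if checking is enabled)
    #   2 - file written and renamed; the caller must pick up
    #       the new filename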
    def save_url(self, urlObj, connect=True):
        """ Download data for the url object <urlObj> and
        write its file """
        self.__urlobject = urlObj
        return self.__save_url_file(connect)
    def check_url_exists(self, url):
        """ Check a url existence (Check for 404 errors) """
        dmgr = GetObject('datamanager')
        if dmgr.is_a_dead_link(url):
            return False
        self.connect(url, None, True, True, 0)
        if self.__error['number'] == 404:
            dmgr.update_dead_links(url)
            return False
        dmgr.update_valid_links(url)
        return True
    def __save_url_file(self, connect=True):
        """ Download data for the current url object and
        write it to its local file """
        cfg = GetObject('config')
        url = self.__urlobject.get_full_url()
        self.connect(url, self.__urlobject, True, cfg.cookies, cfg.retryfailed)
        dmgr = GetObject('datamanager')
        # Find out if we need to update this file
        # by checking with the cache.
        filename = self.__urlobject.get_full_filename()
        # Apply the word filter (start urls are exempt)
        if not self.__urlobject.is_start_url():
            if self.__urlobject.is_webpage() and not GetObject('ruleschecker').apply_word_filter(self.__data):
                extrainfo("Word filter prevents download of url =>", url)
                return 4
        # Get the last modified time and, if possible, use it
        # for checking the project cache.
        timestr = self.get_last_modified_time()
        lmt = -1
        if timestr:
            try:
                lmt = time.mktime(time.strptime(timestr, "%a, %d %b %Y %H:%M:%S GMT"))
            except ValueError, e:
                debug(e)
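        # For reference, a conforming header value parses like
        #   time.strptime('Mon, 21 Jun 2004 14:27:22 GMT',
        #                 '%a, %d %b %Y %H:%M:%S GMT')
        # (the sample date is only an illustration).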
        if lmt != -1:
            if dmgr.is_url_uptodate(url, filename, lmt):
                # No need to download
                extrainfo("Project cache is uptodate =>", url)
                return 3
        if dmgr.is_url_cache_uptodate(url, filename, self.get_content_length(), self.__data):
            # No need to download
            extrainfo("Project cache is uptodate =>", url)
            return 3
        res = 0
        if self.__data:
            if dmgr.create_local_directory(self.__urlobject) == 0:
                extrainfo('Writing file ', filename)
                res = self.__write_url(filename)
        else:
            extrainfo("Error in getting data for", url)
        return res
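    # __save_url_file return codes: 0 - download/write error,
    # 1 - file written, 2 - file written and renamed,
    # 3 - project cache is up-to-date, 4 - blocked by the
    # word filter.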
    def url_to_file(self, url, filename):
        """ Save the contents of the url <url> to the file <filename>.
        This function is used by the test code only """
        self.connect(url)
        if self.__data:
            print '*------------------------------------------------------------*\n'
            print 'Data fetched from ', url
            res = self.__write_url(filename)
            if res:
                print 'Data written to file ', filename, '\n'
            return res
        else:
            print 'Error in fetching data from ', url, '\n'
            return 0
    def verify_checksum(self, filename):
        """ Verify data written to file using md5 checksum """
        m1 = md5.new()
        m1.update(self.__data)
        mdigest1 = m1.digest()
        m2 = md5.new()
        try:
            m2.update(open(filename, 'rb').read())
        except IOError:
            return 0
        mdigest2 = m2.digest()
        # Compare the two digests
        if mdigest1 == mdigest2:
            # The file was written correctly
            return 1
        else:
            # There was an error in writing the file
            return 0
    def get_data(self):
        """ Return the downloaded data """
        return self.__data
    def get__error(self):
        """ Return the last network error """
        return self.__error
    def get__bytes(self):
        """ Return the byte count of downloaded data """
        return self.__bytes
class HarvestManUrlConnectorFactory:
    """ This is the control object + factory for connectors """
    def __init__(self, maxsize):
        self.__connectors = []
        self.__requests = {}
        self.__bytes = {}
        # Bounded queue used to cap the number of live connectors
        self.q = Queue(maxsize + 1)
        self.__maxsz = maxsize
        self.lock1 = Condition(Lock())
        self.lock2 = Condition(Lock())
        # Event used to gate per-server request counts
        self._evt = Event()
        self._evt.set()
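    # Throttling design: create_connector() blocks on the bounded
    # queue when too many connectors are alive, while add_request()
    # blocks on the Event once a server has hit its per-server
    # request limit (cfg.requests).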
    def get_number_of_connectors(self):
        return len(self.__connectors)
    def create_connector(self, server):
        """ Create a new connector and add it to the pool """
        # This call will block if the queue size limit
        # is reached
        self.q.put(1)
        self.lock1.acquire()
        try:
            connector = HarvestManUrlConnector()
            self.__connectors.append(connector)
            return connector
        except Exception, e:
            print e
        finally:
            self.lock1.release()
    def remove_connector(self, connector, server):
        # We don't want this call to block,
        # since the queue can be empty
        try:
            self.q.get(0)
        except Empty:
            pass
        self.lock1.acquire()
        try:
            self.__connectors.remove(connector)
        except Exception, e:
            print e
        finally:
            self.lock1.release()
    def add_request(self, server):
        cfg = GetObject('config')
        # Wait on the event object; this blocks the calling
        # thread if the request limit has been reached.
        self._evt.wait()
        self.lock2.acquire()
        try:
            currval = self.__requests.get(server, 0)
            if currval >= cfg.requests:
                # Request limit reached, clear the event object
                # so that subsequent callers block.
                self._evt.clear()
            elif not self._evt.isSet():
                # Otherwise, set the event object's flag if it
                # is cleared (this wakes up all threads waiting
                # on the event).
                self._evt.set()
            self.__requests[server] = currval + 1
            # print 'Adding requests to ', server, currval + 1
        finally:
            self.lock2.release()
    def remove_request(self, server):
        cfg = GetObject('config')
        self.lock2.acquire()
        try:
            currval = self.__requests.get(server, 0)
            if currval:
                self.__requests[server] = currval - 1
            # Set the event object's flag if it is cleared
            # and the request count falls below the limit.
            if currval < cfg.requests:
                if not self._evt.isSet():
                    self._evt.set()
            # print 'Removing requests from ', server, self.__requests.get(server, 0)
        finally:
            self.lock2.release()
    def get_requests(self, server):
        self.lock2.acquire()
        try:
            return self.__requests.get(server, 0)
        finally:
            self.lock2.release()
# test code
if __name__ == "__main__":
    conn = HarvestManUrlConnector()
    from HarvestManGlobals import *
    Initialize()
    # Note: this test works only for a client directly
    # connected to the internet. If you are behind a proxy,
    # add the proxy setup code of the HarvestManUrlConnector
    # class here.
    # FIXME: I need to move this initialize to somewhere else!
    conn.initialize()
    conn.configure()
    # Check for http connections
    print 'Testing HTTP connections...'
    conn.url_to_file('http://www.python.org/index.html', 'python.org-index.html')
    # print the HTTP headers
    conn.print_http_headers()
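    # A quick look at the header helper methods (a sketch; it
    # assumes the fetch above succeeded and the server sent
    # these headers).
    print 'Content-Type:   ', conn.get_content_type()
    print 'Content-Length: ', conn.get_content_length()
    print 'Last-Modified:  ', conn.get_last_modified_time()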
    conn.url_to_file('http://www.rediff.com', 'rediff.com-index.html')
    # print the HTTP headers
    conn.print_http_headers()
    conn.url_to_file('http://www.j3d.org', 'j3d-org.index.html')
    # print the HTTP headers
    conn.print_http_headers()
    conn.url_to_file('http://www.yahoo.com', 'yahoo-com.index.html')
    # print the HTTP headers
    conn.print_http_headers()
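    # Verify that the last download was written intact (a sketch:
    # this compares the in-memory data against the file just written).
    if conn.verify_checksum('yahoo-com.index.html'):
        print 'Checksum OK -', len(conn.get_data()), 'bytes'
    else:
        print 'Checksum mismatch or file unreadable'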
    # Check for ftp connections
    # print 'Testing FTP connections...'
    # conn.url_to_file('ftp://ftp.gnu.org', 'ftp.gnu.org-index.html')
    # check for Cookies support
    print 'Testing cookies...'
    conn.url_to_file('http://www.playboy.com', 'pb-com.index.html')
    # print the HTTP headers
    conn.print_http_headers()
    for cookie in conn.get_cookies():
        print cookie
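    # Exercise the connector factory (a sketch; the pool size of
    # five and the server name are arbitrary illustration values).
    print 'Testing connector factory...'
    factory = HarvestManUrlConnectorFactory(5)
    c = factory.create_connector('www.python.org')
    factory.add_request('www.python.org')
    print 'Active requests:', factory.get_requests('www.python.org')
    factory.remove_request('www.python.org')
    factory.remove_connector(c, 'www.python.org')
    print 'Connectors left:', factory.get_number_of_connectors()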