connector.py
""" Query the user for proxy related information """
self.network_conn.set_useproxy(1)
if queryserver or queryauth:
# There is an error in the config file/project file/user input
SetUserDebug("Error in proxy server settings (Regenerate the config/project file)")
# Get proxy info from user
try:
if queryserver:
server=bin_crypt(raw_input('Enter the name/ip of your proxy server: '))
port=int(raw_input('Enter the proxy port: '))
self.network_conn.set_proxy(server, port)
if queryauth:
user=bin_crypt(raw_input('Enter username for your proxy server: '))
import getpass
passwd=bin_crypt(getpass.getpass('Enter password for your proxy server: '))
# Set it on myself and re-configure
if user and passwd:
self.network_conn.set_authinfo(user,passwd)
except EOFError, e:
debug(str(e))
moreinfo('Re-configuring protocol handlers...')
self.network_conn.configure_protocols()
moreinfo('Done.')
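
# For comparison, configuring an HTTP proxy directly through urllib2 (rather
# than through the network_conn helper used above) looks like the minimal
# sketch below. Host, port and credentials are placeholders, not values from
# this project.
def _example_proxy_setup():
    import urllib2
    proxy_url = 'http://user:secret@proxyhost:3128'
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy_url}))
    urllib2.install_opener(opener)
    # subsequent urllib2.urlopen() calls now go through the proxy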

def urlopen(self, url):
    """ Open the url and return the url file stream """

    cfg = GetObject('config')
    self.connect(url, None, True, cfg.cookies, cfg.retryfailed)

    # return the file like object
    if self.__error['fatal']:
        return None
    else:
        return self.__freq

def robot_urlopen(self, url):
    """ Open a robots.txt url """

    cfg = GetObject('config')
    self.connect(url, None, False, False, 0)

    # return the file like object
    if self.__error['fatal']:
        return None
    else:
        return self.__freq
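
# A minimal usage sketch for the two methods above. It is illustrative only:
# 'create_connector' is a hypothetical factory method (the real factory API is
# not shown in this excerpt) and the url is a placeholder.
def _example_fetch(url='http://www.example.com/index.html'):
    conn = GetObject('connectorfactory').create_connector()
    stream = conn.urlopen(url)
    if stream is None:
        extrainfo('Fetch failed:', conn.get_error()['msg'])
        return ''
    return stream.read()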

def connect(self, urltofetch, url_obj=None, fetchdata=True, getcookies=True, retries=1):
    """ Connect to the Internet/Intranet and fetch the data of the passed url """

    data = ''

    _cfg = GetObject('config')
    dmgr = GetObject('datamanager')
    rulesmgr = GetObject('ruleschecker')
    factory = GetObject('connectorfactory')

    # Find out if this is an intranet url by
    # using socket's methods
    if url_obj:
        hu = url_obj
    else:
        try:
            hu = HarvestManUrlParser(urltofetch, 0)
        except HarvestManUrlParserError, e:
            debug(e)

    domain = hu.get_domain()
    intranet = False

    # We need to perform this check only if
    # proxies/firewalls are being used. If it
    # is a direct connection to internet, then
    # the crawler makes no distinction about
    # intranet/internet servers since hostname
    # resolution will happen transparently. In
    # such a case we could as well use urllib2
    # methods since it has more methods than
    # urllib.
    if self.network_conn.get_useproxy():
        try:
            socket.gethostbyname(domain)
            intranet = True
        except socket.error:
            pass

    numtries = 0

    while numtries <= retries and not self.__error['fatal']:
        try:
            # Reset error
            self.__error = { 'number' : 0,
                             'msg' : '',
                             'fatal' : False }

            numtries += 1

            if not intranet:
                # create a request object
                request = urllib2.Request(urltofetch)
                # request.add_header('Connection', 'keep-alive')
                request.add_header('keep-alive', '300')

                # add cookie headers for this request
                if getcookies:
                    self.fill_cookie_headers(request)

            # Increment request count
            if domain: factory.add_request(domain)

            # For intranet use urllib
            if intranet:
                self.__freq = urllib.urlopen(urltofetch)
            else:
                self.__freq = urllib2.urlopen(request)

            # The actual url information is used to
            # differentiate between directory like urls
            # and file like urls.
            actual_url = self.__freq.geturl()
            moredebug('URL, ACTUAL URL=>', urltofetch, actual_url)
            if actual_url[-1] == '/' and urltofetch[-1] != '/':
                # directory url
                self.__urlobject.set_directory_url(True)

            # Decrement request count
            if domain: factory.remove_request(domain)

            # write cookies for this request
            if getcookies:
                self.write_cookies()

            # update byte count
            # if this is not the first attempt, print a success msg
            if numtries>1:
                moreinfo("Reconnect succeeded => ", urltofetch)

            # Update content info on urlobject
            self.set_content_info()

            if fetchdata:
                try:
                    data = self.__freq.read()
                    self.__freq.close()
                # New, catch memory error for sockets
                except MemoryError:
                    print 'Error: Memory low...!'
                    return data

                self.__bytes += len(data)
                dmgr.update_bytes(self.__bytes)

            break
        except urllib2.HTTPError, e:
            try:
                self.__error['number'], self.__error['msg'] = e
            except:
                errbasic, errdescn = (str(e)).split(':')
                parts = errbasic.strip().split()
                self.__error['number'] = int(parts[-1])
                self.__error['msg'] = errdescn.strip()

            if self.__error['msg']:
                extrainfo(self.__error['msg'], '=> ',urltofetch)
            else:
                extrainfo('HTTPError: => ',urltofetch)

            errnum = int(self.__error['number'])

            if errnum == 407: # Proxy authentication required
                self.__proxy_query(1, 1)
            elif errnum == 503: # Service unavailable
                rulesmgr.add_to_filter(urltofetch)
                self.__error['fatal']=True
            elif errnum == 504: # Gateway timeout
                rulesmgr.add_to_filter(urltofetch)
                self.__error['fatal']=True
            elif errnum in range(500, 505): # Server error
                self.__error['fatal']=True
            elif errnum == 404:
                # Link not found, this might
                # be a file wrongly fetched as directory
                # Add to filter
                rulesmgr.add_to_filter(urltofetch)
                self.__error['fatal']=True
            elif errnum == 401: # Site authentication required
                self.__error['fatal']=True
                q=GetObject('trackerqueue')
                # right now we terminate project for this error
                # q.stopTrackers()
                break
        except urllib2.URLError, e:
            try:
                self.__error['number'], self.__error['msg'] = e
            except:
                try:
                    errbasic, errdescn = (str(e)).split(':')
                    parts = errbasic.split()
                except:
                    try:
                        errbasic, errdescn = (str(e)).split(',')
                        parts = errbasic.split('(')
                        errdescn = (errdescn.split("'"))[1]
                    except:
                        pass

                try:
                    self.__error['number'] = int(parts[-1])
                except:
                    pass

                self.__error['msg'] = errdescn

            if self.__error['msg']:
                extrainfo(self.__error['msg'], '=> ',urltofetch)
            else:
                extrainfo('URLError: => ',urltofetch)

            errnum = self.__error['number']

            if errnum == 10049 or errnum == 10061: # Proxy server error
                self.__proxy_query(1, 1)
            elif errnum == 10055:
                # no buffer space available
                self.network_conn.increment_socket_errors()
                # If the number of socket errors is >= 4
                # we decrease max connections by 1
                sockerrs = self.network_conn.get_socket_errors()
                if sockerrs>=4:
                    _cfg.connections -= 1
                    self.network_conn.decrement_socket_errors(4)
        except IOError, e:
            self.__error['number'] = 31
            self.__error['fatal']=True
            self.__error['msg'] = str(e)
            # Generated by invalid ftp hosts and
            # other reasons,
            # bug(url: http://www.gnu.org/software/emacs/emacs-paper.html)
            extrainfo(e ,'=> ',urltofetch)
        except ValueError, e:
            self.__error['number'] = 41
            self.__error['msg'] = str(e)
            extrainfo(e, '=> ',urltofetch)
        except AssertionError, e:
            self.__error['number'] = 51
            self.__error['msg'] = str(e)
            extrainfo(e ,'=> ',urltofetch)
        except socket.error, e:
            self.__error['msg'] = str(e)
            errmsg = self.__error['msg']

            extrainfo('Socket Error: ', errmsg,'=> ',urltofetch)

            if errmsg.lower().find('connection reset by peer') != -1:
                # Connection reset by peer (socket error)
                self.network_conn.increment_socket_errors()
                # If the number of socket errors is >= 4
                # we decrease max connections by 1
                sockerrs = self.network_conn.get_socket_errors()
                if sockerrs>=4:
                    _cfg.connections -= 1
                    self.network_conn.decrement_socket_errors(4)
                # attempt reconnect after some time
                time.sleep(self.__sleeptime)

    if url_obj:
        url_obj.set_download_status(self.__error['number'])

    if data:
        self.__data = copy.copy(data)
    else: self.__data = ""

    return data
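
# The retry control flow above boils down to the skeleton below: reset the
# error record on every attempt, stop on success or on a fatal error, and
# treat HTTP codes that a retry cannot fix as fatal. This is a simplified,
# standalone illustration, not a drop-in replacement for connect().
def _example_fetch_with_retries(url, retries=1):
    import urllib2
    error = {'number': 0, 'msg': '', 'fatal': False}
    data = ''
    numtries = 0
    while numtries <= retries and not error['fatal']:
        numtries += 1
        error = {'number': 0, 'msg': '', 'fatal': False}
        try:
            data = urllib2.urlopen(url).read()
            break
        except urllib2.HTTPError, e:
            error['number'], error['msg'] = e.code, str(e)
            error['fatal'] = e.code in (401, 404, 500, 501, 502, 503, 504)
    return data, error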

def get__data(self):
    return self.__data

def get_error(self):
    return self.__error

# New functions added on Aug 23 2003 for adding cache/updation
# feature
# Begin New functions ...

def set_content_info(self):
    """ Set the content information on the current
    url object """

    if self.__urlobject is None: return -1

    # get content length
    contentlen = self.get_content_length()
    # get content type
    contenttype = self.get_content_type()
    # set this on the url object
    self.__urlobject.set_url_content_info(contentlen, contenttype)
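
# get_content_length() and get_content_type() are not part of this excerpt.
# Presumably they read the response headers; the standalone sketch below shows
# one way to extract those two fields from a urllib/urllib2 response object
# ('freq' stands in for self.__freq above).
def _example_content_info(freq):
    headers = freq.headers
    contentlen = int(headers.get('content-length', 0))
    contenttype = headers.get('content-type', '')
    return contentlen, contenttype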

def get_http_headers(self):
    """ Return all the http headers """

    return self.__freq.headers

def get_cookies(self):
    """ Return the cookie related headers """

    # NOTE: This function returns the cookies
    # as a list.
    cookies=[]

    headers=self.get_http_headers()
    for k in headers.keys():
        # The cookie header key is of the form
        # 'set-cookie', case insensitive.
        if k.lower() == 'set-cookie':
            # found cookie header
            cookies.append({k : headers[k]})

    return cookies
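
# The same case-insensitive scan for 'Set-Cookie' headers, reproduced outside
# the class using only the standard library (the url argument is a
# placeholder, not a value from this project).
def _example_collect_cookies(url):
    import urllib2
    resp = urllib2.urlopen(url)
    return [{k: resp.headers[k]} for k in resp.headers.keys() if k.lower() == 'set-cookie']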

def fill_cookie_headers(self, request):
    """ This function looks up our cookie manager
    to find cookies which match the url of this
    connector and adds them to the request object's headers """

    cookie_manager = GetObject('cookiestore')
    if cookie_manager is None or self.__urlobject is None: return -1

    cookie_manager.add_cookie_header(request, self.__urlobject.get_full_domain())

def write_cookies(self):
    """ Write cookies for the current url. The cookie
    headers are stored in a database through our own
    cookie manager object """

    # Check for a valid url object
    if self.__urlobject is None: return -1

    # Get cookie headers
    cookies = self.get_cookies()

    # if the list is empty, there are no cookies to set
    if len(cookies) == 0:
        return -1

    # Write the cookies to the CookieManager
    cookie_manager = GetObject('cookiestore')
    if cookie_manager is None: return -1

    url = self.__urlobject.get_full_url()
    for cookie in cookies:
        cookie_manager.set_cookie(cookie, url)
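
# Note: from Python 2.4 onwards the standard library can handle this cookie
# round trip by itself. A minimal sketch, independent of the cookie manager
# used in this module (the url argument is a placeholder).
def _example_cookielib_roundtrip(url):
    import cookielib, urllib2
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.open(url).read()   # cookies received here are stored in 'jar'
    return opener.open(url)   # and sent back automatically on this request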

def print_http_headers(self):
    """ Print the HTTP headers for this connection """

    print 'HTTP Headers'
    for k,v in self.get_http_headers().items():
        print k,'=> ', v