📄 rules.py
字号:
if indexincl != -1 and indexexcl != -1:
if indexincl < indexexcl:
# inclusion filter has precedence
return inclcheck
else:
# exclusion filter has precedence
return exclcheck
else:
# error, return allow (0)
return 0
else:
# return whichever matched
if matchincl: return inclcheck
elif matchexcl: return exclcheck
# none matched, allow it
else: return 0
# We wont reach here
return 0
def __apply_server_filter(self, urlObj):
    """ See if we have a filter matching the server (domain) of
    this url. Return 1 on success (blocked) and 0 on failure
    (allowed) """

    server = urlObj.get_domain()
    serverinclfilter = self._configobj.serverinclfilter
    serverexclfilter = self._configobj.serverexclfilter

    # No server filters configured => always allowed.
    if not serverexclfilter and not serverinclfilter: return 0

    # We always check inclusion filter first since it is
    # normally more specific than exclusion filter. Someone
    # can request to not fetch any url containing /images/
    # in the path, but still fetch the particular path
    # /preferred/images. It will work only if we check for
    # inclusion first and exclusion later.
    #
    # Sentinels: -1 = filter not configured, 1 = blocks url, 0 = allows url.
    inclcheck, exclcheck = -1, -1
    # The actual filter patterns that matched (empty string = no match).
    matchincl, matchexcl = '', ''

    url = urlObj.get_full_url()

    if serverinclfilter:
        # Assume blocked until some inclusion pattern matches.
        inclcheck = 1
        for f in serverinclfilter:
            # see if we have a match (re caches compiled patterns,
            # so the explicit re.compile of the original was redundant)
            if re.search(f, server):
                extrainfo('Go-through filter for url ', url, 'found')
                matchincl = f
                inclcheck = 0
                break

    if serverexclfilter:
        # BUGFIX: start from 0 (allowed). The original initialised this
        # to 1, which made the "Exclfilter allows this url" state below
        # unreachable and triggered the precedence branch spuriously.
        exclcheck = 0
        for f in serverexclfilter:
            # see if we have a match
            if re.search(f, server):
                extrainfo('No-pass filter for url ', url, 'found')
                matchexcl = f
                self.add_to_filter(url)
                exclcheck = 1
                break

    if inclcheck == 1:
        extrainfo("Inclfilter does not allow this url", url)
    if exclcheck == 0:
        extrainfo("Exclfilter allows this url", url)

    # If exclfilter and inclfilter return different results
    # (exclfilter denies, inclfilter allows) we check the order of
    # the filters in the global filter list. Whichever comes first
    # has precedence.
    if inclcheck == 0 and exclcheck == 1:
        globalfilter = self._configobj.allserverfilters
        # list.index raises ValueError when the item is missing;
        # catch only that (the original used bare excepts).
        try:
            indexincl = globalfilter.index(matchincl)
        except ValueError:
            indexincl = -1
        try:
            indexexcl = globalfilter.index(matchexcl)
        except ValueError:
            indexexcl = -1

        if indexincl != -1 and indexexcl != -1:
            if indexincl < indexexcl:
                # inclusion filter has precedence
                return inclcheck
            else:
                # exclusion filter has precedence
                return exclcheck
        else:
            # error, return allow (0)
            return 0
    else:
        # return whichever matched
        if matchincl: return inclcheck
        elif matchexcl: return exclcheck
        # none matched, allow it
        else: return 0

    # We wont reach here
    return 0
def is_under_starting_directory(self, urlObj):
    """ Check whether the url in the url object belongs
    to the same directory as the starting url for the
    project """

    # Directory of the candidate url.
    urldir = urlObj.get_url_directory()

    # Get the tracker queue object to find the project's base url.
    tracker_q = GetObject('trackerqueue')
    base_url_obj = tracker_q.get_base_urlobject()
    # With no base url recorded yet, everything counts as "under".
    if base_url_obj is None:
        return True

    # NOTE(review): this is a substring test, exactly as the original
    # (`find(...) != -1`) — the base directory may match anywhere
    # inside the candidate's directory string, not only as a prefix.
    return base_url_obj.get_url_directory() in urldir
def is_external_server_link(self, urlObj):
    """ Check whether the url in the url object belongs to
    an external server """

    # Get the tracker queue object to find the project's base url.
    tracker_q = GetObject('trackerqueue')
    base_url_obj = tracker_q.get_base_urlobject()
    # No base url yet => nothing can be considered external.
    if base_url_obj is None:
        return False

    # External when the two domains do not compare as the same.
    same_domain = self.__compare_domains(urlObj.get_domain(),
                                         base_url_obj.get_domain())
    return not same_domain
def __is_external_link(self, urlObj):
    """ Check if the url is an external link relative to starting url,
    using the download rules set by the user """

    # Example.
    # Assume our start url is 'http://www.server.com/files/images/index.html"
    # Then any url which starts with another server name or at a level
    # above the start url's base directory on the same server is considered
    # an external url
    # i.e, http://www.yahoo.com will be external because of
    # 1st reason &
    # http://www.server.com/files/search.cgi will be external link because of
    # 2nd reason.

    # External links ?
    # if under the same starting directory, return False
    if self.is_under_starting_directory(urlObj):
        return False

    dir = urlObj.get_url_directory()

    # The base url object of the project, via the tracker queue.
    tq = GetObject('trackerqueue')
    baseUrlObj = tq.get_base_urlobject()
    if baseUrlObj is None:
        return False

    if not self.is_external_server_link(urlObj):
        # Same server as the starting url.
        # print 'Same server ', urlObj.domain, baseUrlObj.domain
        if self._configobj.fetchlevel==0:
            # fetchlevel 0: never leave the starting directory.
            return True
        elif self._configobj.fetchlevel==3:
            # check for the directory of the parent url
            # if it is same as starting directory, allow this
            # url, else deny
            try:
                parentUrlObj = urlObj.get_base_urlobject()
                if parentUrlObj is None:
                    return False
                parentdir = parentUrlObj.get_url_directory()
                bdir = baseUrlObj.get_url_directory()

                if parentdir == bdir:
                    # Track the external directory, but allow the url.
                    self.__increment_ext_directory_count(dir)
                    return False
                else:
                    return True
            except HarvestManUrlParserError, e:
                # NOTE(review): error is only printed; control then
                # falls through to the checks below.
                print e
        elif self._configobj.fetchlevel > 0:
            # this option takes precedence over the
            # extpagelinks option, so set extpagelinks
            # option to true.
            self._configobj.epagelinks=1

        # do other checks , just fall through

        # Increment external directory count
        dir = urlObj.get_url_directory()

        # Blocked if we already track too many external directories.
        res=self.__ext_directory_check(dir)
        if not res:
            extrainfo("External directory error - filtered!")
            self.add_to_filter(urlObj.get_full_url())
            return True

        # Apply depth check for external dirs here (mode=2 = external depth)
        if self._configobj.extdepth:
            if self.__apply_depth_check(urlObj, mode=2):
                return True

        if self._configobj.epagelinks:
            # We can get external links belonging to same server,
            # so this is not an external link
            return False
        else:
            # We cannot get external links belonging to same server,
            # so this is an external link
            self.add_to_filter(urlObj.get_full_url())
            return True
    else:
        # Different server from the starting url.
        # print 'Different server ', urlObj.domain, baseUrlObj.domain
        # print 'Fetchlevel ', self._configobj.fetchlevel
        if self._configobj.fetchlevel==0 or self._configobj.fetchlevel == 1:
            # fetchlevels 0/1 never cross to another server.
            return True
        elif self._configobj.fetchlevel==2 or self._configobj.fetchlevel==3:
            # check whether the baseurl (parent url of this url)
            # belongs to the starting server. If so allow fetching
            # else deny. ( we assume the baseurl path is not relative! :-)
            try:
                parentUrlObj = urlObj.get_base_urlobject()
                baseserver = baseUrlObj.get_domain()

                if parentUrlObj is None:
                    return False
                server = urlObj.get_domain()
                if parentUrlObj.get_domain() == baseserver:
                    # Track the external server, but allow the url.
                    self.__increment_ext_server_count(server)
                    return False
                else:
                    return True
            except HarvestManUrlParserError, e:
                # NOTE(review): error is only printed; control then
                # falls through to the checks below.
                print e
        elif self._configobj.fetchlevel>3:
            # this option takes precedence over the
            # extserverlinks option, so set extserverlinks
            # option to true.
            self._configobj.eserverlinks=1

        # do other checks , just fall through

        # Blocked if we already track too many external servers.
        res = self.__ext_server_check(urlObj.get_domain())
        if not res:
            self.add_to_filter(urlObj.get_full_url())
            return True

        # Apply filter for servers here
        if self.__apply_server_filter(urlObj):
            return True

        # Apply depth check for external servers here (mode=2)
        if self._configobj.extdepth:
            if self.__apply_depth_check(urlObj, mode=2):
                return True

        if self._configobj.eserverlinks:
            # We can get links belonging to another server, so
            # this is NOT an external link
            return False
        else:
            # We cannot get external links beloning to another server,
            # so this is an external link
            self.add_to_filter(urlObj.get_full_url())
            return True

    # We should not reach here
    return False
def __apply_depth_check(self, urlObj, mode=0):
    """ Apply the depth setting for this url, if any """

    # A depth value of -1 means no depth-check.
    tracker_q = GetObject('trackerqueue')
    base_obj = tracker_q.get_base_urlobject()
    if base_obj is None:
        return False

    rel_depth = urlObj.get_relative_depth(base_obj, mode)
    if rel_depth == -1:
        return False

    # Pick the limit matching the requested mode:
    # mode 0 => normal depth, mode 2 => external depth.
    blocked = False
    if mode == 0 and self._configobj.depth != -1:
        blocked = rel_depth > self._configobj.depth
    elif mode == 2 and self._configobj.extdepth:
        blocked = rel_depth > self._configobj.extdepth

    if blocked:
        # Depth limit exceeded - filter the url out.
        self.add_to_filter(urlObj.get_full_url())
        return True

    return False
def __ext_directory_check(self, directory):
    """ Check whether the directory <directory>
    should be considered external """

    # Record the directory; idx is its position if already known,
    # -1 when it was just added.
    idx = self.__increment_ext_directory_count(directory)

    limit = self._configobj.maxextdirs
    # No limit configured, or still within it => allow.
    if not limit or len(self._extdirs) <= limit:
        return True
    # Over the limit: a brand-new directory is blocked ...
    if idx == -1:
        return False
    # ... while an already-known one is allowed only if it was
    # registered while still within the limit.
    return idx <= limit
def __ext_server_check(self, server):
    """ Check whether the server <server> should be considered
    external """

    # Record the server; pos is its index if already seen, -1 if new.
    pos = self.__increment_ext_server_count(server)

    limit = self._configobj.maxextservers
    # No limit configured, or still within it => allow.
    if not limit or len(self._extservers) <= limit:
        return True
    # Over the limit: new servers are blocked; known servers are
    # allowed only if registered while still within the limit.
    return pos != -1 and pos <= limit
def __increment_ext_directory_count(self, directory):
    """ Record <directory> in the external directory list.

    Returns the index of the directory if it was already present,
    or -1 when it is new (in which case it is appended). """

    # BUGFIX: acquire the lock *before* the try block, so the
    # finally clause never releases a lock we failed to acquire.
    self._dataLock.acquire()
    try:
        try:
            # Position of an already-seen directory.
            return self._extdirs.index(directory)
        except ValueError:
            # list.index raises ValueError when missing (the original
            # used a bare except). First sighting - register it.
            self._extdirs.append(directory)
            return -1
    finally:
        self._dataLock.release()
def __increment_ext_server_count(self, server):
    """ Record <server> in the external server list.

    Returns the index of the server if it was already present,
    or -1 when it is new (in which case it is appended). """

    # BUGFIX: acquire the lock *before* the try block, so the
    # finally clause never releases a lock we failed to acquire.
    self._dataLock.acquire()
    try:
        try:
            # Position of an already-seen server.
            return self._extservers.index(server)
        except ValueError:
            # list.index raises ValueError when missing (the original
            # used a bare except). First sighting - register it.
            self._extservers.append(server)
            return -1
    finally:
        self._dataLock.release()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -