rules.py
来自「Harvestman-最新版本」· Python 代码 · 共 1,030 行 · 第 1/3 页
PY
1,030 行
return False return False def apply_url_filter(self, urlobj): """ See if we have a filter matching the url. Return 1 for blocking the url and 0 for allowing it """ inclfilter = self._configobj.inclfilter exclfilter = self._configobj.exclfilter # neither filters are enabled, return False if not inclfilter and not exclfilter: return 0 # We always check inclusion filter first since it is # normally more specific than exclusion filter. Someone # can request to not fetch any url containing /images/ # in the path, but still fetch the particular path # /preferred/images. It will work only if we check for # inclusion first and exclusion later. inclcheck,exclcheck=-1,-1 matchincl, matchexcl=False,False url = urlobj.get_full_url() if inclfilter: inclcheck=1 # see if we have a match for f in inclfilter: m=f.search(url) if m: debug('Go-through filter for url ', url, 'found') matchincl=True inclcheck=0 break if exclfilter: exclcheck=0 # see if we have a match for f in exclfilter: m=f.search(url) if m: debug('No-pass filter for url ', url, 'found') matchexcl=True exclcheck=1 break if inclcheck==1: extrainfo("Inclfilter does not allow this url", url) if exclcheck==0: extrainfo("Exclfilter allows this url", url) # if exclfilter and inclfilter returns different results # (exclfilter denys, inclfilter allows) # we check the order of the filters in the global filter. Whichever # comes first has precedence. if inclcheck == 0 and exclcheck == 1: globalfilter=self._configobj.allfilters try: indexincl=globalfilter.index(matchincl) except: indexincl=-1 try: indexexcl=globalfilter.index(matchexcl) except: indexexcl=-1 if indexincl != -1 and indexexcl != -1: if indexincl < indexexcl: # inclusion filter has precedence return inclcheck else: # exclusion filter has precedence return exclcheck else: # error, return allow (0) return 0 else: # return whichever matched if inclcheck != -1: return inclcheck elif exclcheck != -1: return exclcheck # none matched, allow it else: return 0 # We wont reach here return 0 def apply_server_filter(self, urlObj): """ See if we have a filter matching the server of this url. Return 1 on success(blocked) and 0 on failure (allowed) """ server = urlObj.get_domain() serverinclfilter = self._configobj.serverinclfilter serverexclfilter = self._configobj.serverexclfilter if not serverexclfilter and not serverinclfilter: return 0 # We always check inclusion filter first since it is # normally more specific than exclusion filter. Someone # can request to not fetch any url containing /images/ # in the path, but still fetch the particular path # /preferred/images. It will work only if we check for # inclusion first and exclusion later. inclcheck,exclcheck=-1,-1 matchincl, matchexcl=False,False url = urlObj.get_full_url() if serverinclfilter: inclcheck = 1 for f in serverinclfilter: # see if we have a match m=re.search(re.compile(f,re.IGNORECASE), server) if m: debug('Go-through filter for url ', url, 'found') matchincl=f inclcheck=0 break if serverexclfilter: exclcheck = 1 for f in serverexclfilter: # see if we have a match m=re.search(re.compile(f,re.IGNORECASE), server) if m: debug('No-pass filter for url ', url, 'found') matchexcl=f exclcheck=1 break if inclcheck==1: extrainfo("Inclfilter does not allow this url", url) if exclcheck==0: extrainfo("Exclfilter allows this url", url) # if exclfilter and inclfilter returns different results # (exclfilter denys, inclfilter allows) # we check the order of the filters in the global filter. Whichever # comes first has precedence. if inclcheck == 0 and exclcheck == 1: globalfilter=self._configobj.allserverfilters try: indexincl=globalfilter.index(matchincl) except: indexincl=-1 try: indexexcl=globalfilter.index(matchexcl) except: indexexcl=-1 if indexincl != -1 and indexexcl != -1: if indexincl < indexexcl: # inclusion filter has precedence return inclcheck else: # exclusion filter has precedence return exclcheck else: # error, return allow (0) return 0 else: # return whichever matched if inclcheck != -1: return inclcheck elif exclcheck != -1: return exclcheck # none matched, allow it else: return 0 # We wont reach here return 0 def is_under_starting_directory(self, urlObj): """ Check whether the url in the url object belongs to the same directory as the starting url for the project """ directory = urlObj.get_url_directory() baseUrlObj = objects.queuemgr.get_base_url() if not baseUrlObj: return True # Bug: the original URL might have had been # redirected, so its base URL might have got # changed. We need to check with the original # URL in such cases. # Sample site: http://www.vegvesen.no if baseUrlObj.reresolved: bdir = baseUrlObj.get_original_url_directory() else: bdir = baseUrlObj.get_url_directory() # print 'BASEDIR=>',bdir # print 'DIRECTORY=>',directory # Look for bdir inside dir index = directory.find(bdir) if index == 0: return True # Sometimes a simple string match # is not good enough. May be both # the directories are the same but # the server names are slightly different # ex: www-106.ibm.com and www.ibm.com # for developerworks links. # Check if both of them are in the same # domain if self.compare_domains(urlObj.get_domain(), baseUrlObj.get_domain()): debug('Domains',urlObj.get_domain(),'and',baseUrlObj.get_domain(),'compare fine') # Get url directory sans domain directory = urlObj.get_url_directory_sans_domain() bdir = baseUrlObj.get_url_directory_sans_domain() debug('Directories',directory,bdir) # Check again if directory.find(bdir) == 0: return True return False def is_external_server_link(self, urlObj): """ Check whether the url in the url object belongs to an external server """ # Get the tracker queue object baseUrlObj = objects.queuemgr.get_base_url() if not baseUrlObj: return False # Check based on the server server = urlObj.get_domain() baseserver = baseUrlObj.get_domain() return not self.compare_domains( server, baseserver ) def is_external_link(self, urlObj): """ Check if the url is an external link relative to starting url, using the download rules set by the user """ # Example. # Assume our start url is 'http://www.server.com/files/images/index.html" # Then any url which starts with another server name or at a level # above the start url's base directory on the same server is considered # an external url # i.e, http://www.yahoo.com will be external because of # 1st reason & # http://www.server.com/files/search.cgi will be external link because of # 2nd reason. # External links ? # if under the same starting directory, return False if self.is_under_starting_directory(urlObj): return False directory = urlObj.get_url_directory() baseUrlObj = objects.queuemgr.get_base_url() if not baseUrlObj: return False if urlObj.get_type() == 'stylesheet': if self._configobj.getstylesheets: return False elif urlObj.get_type() == 'image': if self._configobj.getimagelinks: return False if not self.is_external_server_link(urlObj): debug('Same!') if self._configobj.fetchlevel==0: return True elif self._configobj.fetchlevel==3: # check for the directory of the parent url # if it is same as starting directory, allow this # url, else deny try: parentUrlObj = urlObj.get_parent_url() if not parentUrlObj: return False parentdir = parentUrlObj.get_url_directory() bdir = baseUrlObj.get_url_directory() if parentdir == bdir: self._increment_ext_directory_count(directory) return False else: return True except urlparser.HarvestManUrlError, e: logconsole(e) elif self._configobj.fetchlevel > 0: # do other checks , just fall through pass # Increment external directory count # directory = urlObj.get_url_directory() # res=self._ext_directory_check(directory) # if not res: # extrainfo("External directory error - filtered!") # return True # Apply depth check for external dirs here if self._configobj.extdepth: if self.apply_depth_check(urlObj, mode=2): return True #if self._configobj.epagelinks: # # We can get external links belonging to same server, # # so this is not an external link # return False #else: # # We cannot get external links belonging to same server, # # so this is an external link # return True return False else: # print 'Different!',self._configobj.fetchlevel debug('Different!') # Both belong to different base servers if self._configobj.fetchlevel==0 or self._configobj.fetchlevel == 1: return True elif self._configobj.fetchlevel==2 or self._configobj.fetchlevel==3: # check whether the baseurl (parent url of this url) # belongs to the starting server. If so allow fetching # else deny. ( we assume the baseurl path is not relative! :-) try: parentUrlObj = urlObj.get_parent_url() baseserver = baseUrlObj.get_domain()
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?