rules.py
来自「Harvestman-最新版本」· Python 代码 · 共 1,030 行 · 第 1/3 页
PY
1,030 行
if not parentUrlObj: return False server = urlObj.get_domain() if parentUrlObj.get_domain() == baseserver: self._increment_ext_server_count(server) return False else: return True except urlparser.HarvestManUrlError, e: logconsole(e) elif self._configobj.fetchlevel>3: pass # this option takes precedence over the # extserverlinks option, so set extserverlinks # option to true. # self._configobj.eserverlinks=1 # do other checks , just fall through # res = self._ext_server_check(urlObj.get_domain()) # if not res: # return True # Apply filter for servers here if self.apply_server_filter(urlObj): return True # Apply depth check for external servers here if self._configobj.extdepth: if self.apply_depth_check(urlObj, mode=2): return True #if self._configobj.eserverlinks: # # We can get links belonging to another server, so # # this is NOT an external link # return False #else: # # We cannot get external links beloning to another server, # # so this is an external link # return True return False # We should not reach here return False def apply_depth_check(self, urlObj, mode=0): """ Apply the depth setting for this url, if any """ # depth variable is -1 means no depth-check baseUrlObj = objects.queuemgr.get_base_url() if not baseUrlObj: return False reldepth = urlObj.get_relative_depth(baseUrlObj, mode) if reldepth != -1: # check if this exceeds allowed depth if mode == 0 and self._configobj.depth != -1: if reldepth > self._configobj.depth: return True elif mode == 2 and self._configobj.extdepth: if reldepth > self._configobj.extdepth: return True return False ## def _ext_directory_check(self, directory): ## """ Check whether the directory <directory> ## should be considered external """ ## index=self._increment_ext_directory_count(directory) ## # Are we above a prescribed limit ? ## if self._configobj.maxextdirs and len(self._extdirs)>self._configobj.maxextdirs: ## if index != -1: ## # directory index was below the limit, allow its urls ## if index <= self._configobj.maxextdirs: ## return True ## else: ## # directory index was above the limit, block its urls ## return False ## # new directory, block its urls ## else: ## return False ## else: ## return True ## def _ext_server_check(self, server): ## """ Check whether the server <server> should be considered ## external """ ## index=self._increment_ext_server_count(server) ## # are we above a prescribed limit ? ## if self._configobj.maxextservers and len(self._extservers)>self._configobj.maxextservers: ## if index != -1: ## # server index was below the limit, allow its urls ## if index <= self._configobj.maxextservers: ## return True ## else: ## return False ## # new server, block its urls ## else: ## return False ## else: ## return True def _increment_ext_directory_count(self, directory): """ Increment the external dir count """ index=-1 try: index=self._extdirs.index(directory) except: self._extdirs.append(directory) return index def _increment_ext_server_count(self,server): """ Increment the external server count """ index=-1 try: index=self._extservers.index(server) except: self._extservers.append(server) return index def get_stats(self): """ Return statistics as a 3 tuple. This returns a 3 tuple of number of links, number of servers, and number of directories in the base server parsed by url trackers """ numservers=len(self._extservers) numdirs=len(self._extdirs) numfiltered = len(self._filter) return (numservers, numdirs, numfiltered) def make_filters(self): urlfilter = filters.HarvestManUrlFilter(self._configobj.regexurlfilters, self._configobj.pathurlfilters, self._configobj.extnurlfilters) self._configobj.urlfilter = urlfilter # sys.exit(0) ## def make_filters(self):## """ This function creates the filter regexps## for url/text based filtering of content """## # URL regex filters ## url_filters = self._make_filter(urlfilterstr)## # print 'URL FILTERS=>',url_filters ## self._configobj.set_option('urlfilterre_value', url_filters)## server_filters = self._make_filter(serverfilterstr)## self._configobj.set_option('serverfilterre_value', server_filters)## # url/server priority filters## urlprioritystr = self._configobj.urlpriority## # The return is a dictionary## url_priorities = self._make_priority(urlprioritystr)## self._configobj.set_option('urlprioritydict_value', url_priorities)## serverprioritystr = self._configobj.serverpriority## # The return is a dictionary ## server_priorities = self._make_priority(serverprioritystr)## self._configobj.set_option('serverprioritydict_value', server_priorities)## # word filter list## wordfilterstr = self._configobj.wordfilter.strip()## # print 'Word filter string=>',wordfilterstr,len(wordfilterstr)## if wordfilterstr:## word_filter = self._make_word_filter(wordfilterstr)## self._configobj.wordfilterre = word_filter## self._madefilters = True def _make_priority(self, pstr): """ Generate a priority dictionary from the priority string """ # file priority is based on file extensions & # server priority based on server names # Priority string is of the form... # str1+pnum1,str2-pnum2,str3+pnum3 etc... # Priority range is from [-5 ... +5] # Split the string based on commas pr_strs = pstr.split(',') # For each string in list, create a dictionary # with the string as key and the priority (including # sign) as the value. d = {} for s in pr_strs: if s.find('+') != -1: key, val = s.split('+') val = int(val) elif s.find('-') != -1: key, val = s.split('-') val = -1*int(val) else: continue # Since we dont allow values outside # the range [-5 ..5] skip such values if val not in range(-5,6): continue d[key.lower()] = val return d def _make_filter(self, fstr,servers=0): """ Function used to convert url filter strings to python regular expresssions """ # First replace any ''' with '' fstr=fstr.replace("'",'') # regular expressions to include include=[] # regular expressions to exclude exclude=[] # all regular expressions all=[] index=0 previndex=-1 fstr += '+' for c in fstr: if c in ('+','-'): subs=fstr[(previndex+1):index] if subs: all.append(subs) previndex=index index+=1 l=fstr.split('+') for s in l: l2=s.split('-') for x in xrange(len(l2)): s=l2[x] if s=='': continue if x==0: include.append(s) else: exclude.append(s) # print 'Exclude=>',exclude # print 'Include=>',include exclusionfilter=self._create_filter(exclude,servers) inclusionfilter=self._create_filter(include,servers) allfilter = self._create_filter(all, servers) # return a 3 tuple of (inclusionfilter, exclusionfilter, allfilter) return (inclusionfilter, exclusionfilter, allfilter) def _create_filter(self, strlist, servers=0): """ Create a python regular expression based on the list of filter strings provided as input """ refilter = [] if servers: serverfilter=[] for s in strlist: # First replace any ''' with '' s=s.replace("'",'') # Here asteriks have a meaning, they should match # anything s=s.replace('*', '.*') serverfilter.append(s) return serverfilter for s in strlist: fstr = '' # First replace any ''' with '' extn=s.replace("'",'') # Then we remove the asteriks s=s.replace('*','.*') # Type 1 filter-> they begin with '.' now # Find out position of '.' pos=s.rfind('.') if pos == 0: s = "".join(("\\", s)) # Append a '.*$' to the string s += '$' fstr += s # Type 3 filter # These will be the form of <something>/.<extn> now elif s[pos-1] == '/': # get that <something> prefix = s[:(pos-1)] # get the <extn> extn = s[(pos+1):] myfilter = prefix myfilter += '/(?=\w+.' myfilter += extn myfilter += ')' fstr += myfilter # All other cases are considered Type 2 filters # i.e, plain strings else: fstr += s # print 'Fstr=>',fstr refilter.append(re.compile(fstr, re.IGNORECASE)) return refilter def _make_word_filter(self, s): """ Create a word filter rule for HarvestMan """ return re.compile(s, re.IGNORECASE|re.UNICODE) def clean_up(self): """ Purge data for a project by cleaning up lists, dictionaries and resetting other member items""" debug('Rules got cleaned up...!') self._filter = {} self._extservers = [] self._extdirs = [] self._robocache = [] # Reset dicts self._robots.clear()
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?