rules.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,030 行 · 第 1/3 页

PY
1,030
字号
                    if not parentUrlObj:                        return False                    server = urlObj.get_domain()                    if parentUrlObj.get_domain() == baseserver:                        self._increment_ext_server_count(server)                        return False                    else:                        return True                except urlparser.HarvestManUrlError, e:                    logconsole(e)                                elif self._configobj.fetchlevel>3:                pass                # this option takes precedence over the                # extserverlinks option, so set extserverlinks                # option to true.                # self._configobj.eserverlinks=1                # do other checks , just fall through            # res = self._ext_server_check(urlObj.get_domain())            # if not res:            #   return True            # Apply filter for servers here            if self.apply_server_filter(urlObj):                return True            # Apply depth check for external servers here            if self._configobj.extdepth:                if self.apply_depth_check(urlObj, mode=2):                    return True            #if self._configobj.eserverlinks:            #    # We can get links belonging to another server, so            #    # this is NOT an external link            #    return False            #else:            #    # We cannot get external links beloning to another server,            #    # so this is an external link            #    return True            return False                # We should not reach here        return False    def apply_depth_check(self, urlObj, mode=0):        """ Apply the depth setting for this url, if any """        # depth variable is -1 means no depth-check        baseUrlObj = objects.queuemgr.get_base_url()        if not baseUrlObj:            return False        reldepth = urlObj.get_relative_depth(baseUrlObj, mode)        if reldepth != -1:            # check if this exceeds allowed depth            if mode == 0 and self._configobj.depth != -1:                if reldepth > self._configobj.depth:                    return True            elif mode == 2 and self._configobj.extdepth:                if reldepth > self._configobj.extdepth:                    return True        return False    ## def _ext_directory_check(self, directory):    ##     """ Check whether the directory <directory>    ##     should be considered external """    ##     index=self._increment_ext_directory_count(directory)    ##     # Are we above a prescribed limit ?    ##     if self._configobj.maxextdirs and len(self._extdirs)>self._configobj.maxextdirs:    ##         if index != -1:    ##             # directory index was below the limit, allow its urls    ##             if index <= self._configobj.maxextdirs:    ##                 return True    ##             else:    ##                 # directory index was above the limit, block its urls    ##                 return False    ##         # new directory, block its urls    ##         else:    ##             return False    ##     else:    ##         return True    ## def _ext_server_check(self, server):    ##     """ Check whether the server <server> should be considered    ##     external """    ##     index=self._increment_ext_server_count(server)    ##     # are we above a prescribed limit ?    ##     if self._configobj.maxextservers and len(self._extservers)>self._configobj.maxextservers:    ##         if index != -1:    ##             # server index was below the limit, allow its urls    ##             if index <= self._configobj.maxextservers:    ##                 return True    ##             else:    ##                 return False    ##         # new server, block its urls    ##         else:    ##             return False    ##     else:    ##         return True    def _increment_ext_directory_count(self, directory):        """ Increment the external dir count """        index=-1        try:            index=self._extdirs.index(directory)        except:            self._extdirs.append(directory)        return index    def _increment_ext_server_count(self,server):        """ Increment the external server count """        index=-1        try:            index=self._extservers.index(server)        except:            self._extservers.append(server)        return index    def get_stats(self):        """ Return statistics as a 3 tuple. This returns        a 3 tuple of number of links, number of servers, and        number of directories in the base server parsed by        url trackers """        numservers=len(self._extservers)        numdirs=len(self._extdirs)        numfiltered = len(self._filter)                return (numservers, numdirs, numfiltered)    def make_filters(self):        urlfilter = filters.HarvestManUrlFilter(self._configobj.regexurlfilters,                                                self._configobj.pathurlfilters,                                                self._configobj.extnurlfilters)        self._configobj.urlfilter =  urlfilter        # sys.exit(0)           ##  def make_filters(self):##         """ This function creates the filter regexps##         for url/text based filtering of content """##         # URL regex filters        ##         url_filters = self._make_filter(urlfilterstr)##         # print 'URL FILTERS=>',url_filters        ##         self._configobj.set_option('urlfilterre_value', url_filters)##         server_filters = self._make_filter(serverfilterstr)##         self._configobj.set_option('serverfilterre_value', server_filters)##         #  url/server priority filters##         urlprioritystr = self._configobj.urlpriority##         # The return is a dictionary##         url_priorities = self._make_priority(urlprioritystr)##         self._configobj.set_option('urlprioritydict_value', url_priorities)##         serverprioritystr = self._configobj.serverpriority##         # The return is a dictionary        ##         server_priorities = self._make_priority(serverprioritystr)##         self._configobj.set_option('serverprioritydict_value', server_priorities)##         # word filter list##         wordfilterstr = self._configobj.wordfilter.strip()##         # print 'Word filter string=>',wordfilterstr,len(wordfilterstr)##         if wordfilterstr:##             word_filter = self._make_word_filter(wordfilterstr)##             self._configobj.wordfilterre = word_filter##         self._madefilters = True            def _make_priority(self, pstr):        """ Generate a priority dictionary from the priority string """        # file priority is based on file extensions &        # server priority based on server names        # Priority string is of the form...        # str1+pnum1,str2-pnum2,str3+pnum3 etc...        # Priority range is from [-5 ... +5]        # Split the string based on commas        pr_strs = pstr.split(',')        # For each string in list, create a dictionary        # with the string as key and the priority (including        # sign) as the value.        d = {}        for s in pr_strs:            if s.find('+') != -1:                key, val = s.split('+')                val = int(val)            elif s.find('-') != -1:                key, val = s.split('-')                val = -1*int(val)            else:                continue            # Since we dont allow values outside            # the range [-5 ..5] skip such values            if val not in range(-5,6): continue            d[key.lower()] = val        return d    def _make_filter(self, fstr,servers=0):        """ Function used to convert url filter strings        to python regular expresssions """        # First replace any ''' with ''        fstr=fstr.replace("'",'')                    # regular expressions to include        include=[]        # regular expressions to exclude                exclude=[]        # all regular expressions        all=[]        index=0        previndex=-1        fstr += '+'        for c in fstr:            if c in ('+','-'):                subs=fstr[(previndex+1):index]                if subs: all.append(subs)                previndex=index            index+=1        l=fstr.split('+')        for s in l:            l2=s.split('-')            for x in xrange(len(l2)):                s=l2[x]                if s=='': continue                if x==0:                    include.append(s)                else:                    exclude.append(s)        # print 'Exclude=>',exclude        # print 'Include=>',include                exclusionfilter=self._create_filter(exclude,servers)        inclusionfilter=self._create_filter(include,servers)        allfilter = self._create_filter(all, servers)        # return a 3 tuple of (inclusionfilter, exclusionfilter, allfilter)        return (inclusionfilter, exclusionfilter, allfilter)    def _create_filter(self, strlist, servers=0):        """ Create a python regular expression based on        the list of filter strings provided as input """        refilter = []        if servers:            serverfilter=[]            for s in strlist:                # First replace any ''' with ''                s=s.replace("'",'')                # Here asteriks have a meaning, they should match                # anything                s=s.replace('*', '.*')                serverfilter.append(s)            return serverfilter        for s in strlist:            fstr = ''            # First replace any ''' with ''            extn=s.replace("'",'')                        # Then we remove the asteriks            s=s.replace('*','.*')            # Type 1 filter-> they begin with '.' now            # Find out position of '.'            pos=s.rfind('.')            if pos == 0:                s = "".join(("\\", s))                # Append a '.*$' to the string                s += '$'                fstr += s            # Type 3 filter            # These will be the form of <something>/.<extn> now            elif s[pos-1] == '/':                # get that <something>                prefix = s[:(pos-1)]                # get the <extn>                extn = s[(pos+1):]                myfilter = prefix                myfilter += '/(?=\w+.'                myfilter += extn                myfilter += ')'                fstr += myfilter            # All other cases are considered Type 2 filters            # i.e, plain strings            else:                fstr += s            # print 'Fstr=>',fstr                        refilter.append(re.compile(fstr, re.IGNORECASE))        return refilter    def _make_word_filter(self, s):        """ Create a word filter rule for HarvestMan """        return re.compile(s, re.IGNORECASE|re.UNICODE)    def clean_up(self):        """ Purge data for a project by cleaning up        lists, dictionaries and resetting other member items"""        debug('Rules got cleaned up...!')                self._filter = {}        self._extservers = []        self._extdirs = []        self._robocache = []        # Reset dicts        self._robots.clear()        

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?