⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rules.py

📁 网络蜘蛛
💻 PY
📖 第 1 页 / 共 3 页
字号:

    def is_duplicate_link(self, link):
        """ Report whether `link` has already been recorded.

        The check is delegated to add_link(), so a link that was not
        known yet gets recorded as a side effect of asking. """

        already_known = self.add_link(link)
        return True if already_known else False

    def add_link(self, url):
        """ Record `url` in the links list unless it is already there.

        Returns True if the url was already present (nothing is
        added), False if it was new and has just been appended. """

        # A plain membership test replaces the former index() call
        # wrapped in a bare 'except:', which treated *any* failure
        # (not just "value not found") as a reason to append.
        # NOTE: the lock acquire/release calls around this section
        # were already commented out, so no locking is done here.
        if url in self._links:
            return True

        self._links.append(url)
        return False

    def get_stats(self):
        """ Return the crawl statistics as a 3 tuple.

        The tuple holds, in this order, the sizes of the links
        collection, the external servers collection and the
        external directories collection. """

        return (len(self._links),
                len(self._extservers),
                len(self._extdirs))
       
    def dump_urls(self, file):
        """ Write all parsed urls to a file.

        An existing file at path `file` is removed first. If that
        removal fails the OSError is printed and -1 is returned.
        Otherwise a two line header followed by one url per line is
        written and the function returns None. """

        if os.path.exists(file):
            try:
                os.remove(file)
            except OSError as e:
                # 'except ... as' and print(...) behave the same on
                # Python 2.6+ and Python 3.
                print(e)
                return -1

        moreinfo('Dumping url list to file', file)

        f = open(file, 'w')
        try:
            f.write('LINKS PARSED\n')
            f.write('============\n')

            for link in self._links:
                f.write(link + '\n')
        finally:
            # Close the handle even when a write fails part way.
            f.close()

        debug('Done.')

    def make_filters(self):
        """ Build the url/server filters, the priority tables and
        the word filter from the configuration object and store the
        results back on it. """

        cfg = self._configobj

        # url and server filters: each is the 3 tuple produced by
        # __make_filter.
        # NOTE(review): the server filter is built without passing
        # the 'servers' argument, so it goes through the same path
        # as the url filter -- confirm this is intended.
        cfg.set_option('control.urlfilterre',
                       self.__make_filter(cfg.urlfilter))
        cfg.set_option('control.serverfilterre',
                       self.__make_filter(cfg.serverfilter))

        # url and server priorities: each is a dictionary produced
        # by __make_priority.
        cfg.set_option('control.urlprioritydict',
                       self.__make_priority(cfg.urlpriority))
        cfg.set_option('control.serverprioritydict',
                       self.__make_priority(cfg.serverpriority))

        # The word filter is only built when one is configured.
        word_rule = cfg.wordfilter
        if word_rule:
            cfg.wordfilterre = self.__make_word_filter(word_rule)

    def __make_priority(self, pstr):
        """ Generate a priority dictionary from the priority string.

        The priority string is a comma separated list of entries of
        the form str1+pnum1,str2-pnum2,str3+pnum3 ... where the text
        before the sign is the key and the signed number is the
        priority. Returns a dictionary mapping the lower-cased key to
        the integer priority.

        Entries are skipped when they carry no sign, when the part
        after the sign is not an integer, or when the priority falls
        outside the range [-5 ... +5]. """

        d = {}
        for s in pstr.split(','):
            # A '+' anywhere in the entry takes precedence over '-'.
            # The entry is cut at the *last* occurrence of the sign
            # so that keys which contain the sign themselves (for
            # example hyphenated server names) no longer break the
            # key/value unpacking.
            if '+' in s:
                sign, pos = 1, s.rfind('+')
            elif '-' in s:
                sign, pos = -1, s.rfind('-')
            else:
                continue

            key = s[:pos]
            try:
                val = sign * int(s[pos + 1:])
            except ValueError:
                # Not a number after the sign: ignore this entry
                # instead of aborting the whole parse.
                continue

            # Since we dont allow values outside
            # the range [-5 ..5] skip such values
            if not -5 <= val <= 5:
                continue
            d[key.lower()] = val

        return d
    
    def __make_filter(self, fstr, servers=0):
        """ Convert a url filter string into filter lists.

        The string is a sequence of patterns, each introduced by
        '+' (include) or '-' (exclude). Returns a 3 tuple of
        (inclusionfilter, exclusionfilter, allfilter), each one
        produced by __create_filter from the matching patterns. """

        # Drop single quotes and terminate the string with a '+'.
        pattern = fstr.replace("'", '') + '+'

        # Every non-empty token, whichever sign precedes it.
        every = [tok for tok in pattern.replace('-', '+').split('+') if tok]

        wanted = []
        unwanted = []
        for chunk in pattern.split('+'):
            pieces = chunk.split('-')
            # The text before the first '-' is an inclusion, every
            # later piece of the chunk is an exclusion.
            if pieces[0]:
                wanted.append(pieces[0])
            unwanted.extend([piece for piece in pieces[1:] if piece])

        exclusionfilter = self.__create_filter(unwanted, servers)
        inclusionfilter = self.__create_filter(wanted, servers)
        allfilter = self.__create_filter(every, servers)

        return (inclusionfilter, exclusionfilter, allfilter)

    def __create_filter(self, strlist, servers=0):
        """ Create python regular expressions from a list of filter
        strings.

        When `servers` is true a list of plain strings is returned
        in which quotes are removed and every '*' is replaced by
        '.*'. Otherwise a list of compiled regular expressions is
        returned, one per filter string. re.compile errors for a
        malformed filter propagate to the caller. """

        if servers:
            # Here asterisks have a meaning, they should match
            # anything.
            return [s.replace("'", '').replace('*', '.*') for s in strlist]

        refilter = []
        for s in strlist:
            # Remove quotes and asterisks. (The quote removal used
            # to be stored in an unused variable and was lost.)
            s = s.replace("'", '').replace('*', '')

            pos = s.rfind('.')
            if pos == 0:
                # Type 1 filter: begins with '.', e.g. '.jpg'.
                fstr = s + '.*$'
            elif pos > 0 and s[pos - 1] == '/':
                # Type 3 filter: <something>/.<extn>. The 'pos > 0'
                # guard keeps strings without any '.' (rfind gives
                # -1) from being indexed from the end, which raised
                # IndexError for short strings and misclassified
                # strings such as 'dir/x'.
                prefix = s[:pos - 1]
                extn = s[pos + 1:]
                fstr = prefix + r'/(?=\w+.' + extn + ')'
            else:
                # All other cases are Type 2 filters, i.e. the
                # string is used as the pattern unchanged.
                fstr = s

            refilter.append(re.compile(fstr))

        return refilter

    def __parse_word_filter(self, s):
        """ Split a word filter expression into its pieces and append
        them to self._rexplist.

        The innermost parenthesised group (last '(' and the first
        ')' after it) is taken out repeatedly; its content is
        recorded unless it is only whitespace. What is left once no
        '(' remains is recorded if it is not empty. Nothing more is
        recorded when a '(' has no ')' after it. """

        blank = re.compile(r'^\s*$')
        text = s[:]

        while True:
            start = text.rfind('(')
            if start == -1:
                # No group left: keep the remainder, if any.
                if text:
                    self._rexplist.append(text)
                return

            end = text.find(')', start)
            if end == -1:
                # Opening paren without a closing one: stop here.
                return

            group = text[start + 1:end]
            if not blank.match(group):
                self._rexplist.append(group)

            # Remove every occurrence of this group, parens included.
            text = text.replace('(' + group + ')', '')


    def __make_not_expr(self, s):
        """ Turn a term with a leading '!' into a negative lookahead
        group; any other term is returned unchanged. """

        if s.startswith('!'):
            return '(?!' + s[1:] + ')'
        return s

    def __is_inbetween(self, l, elem):
        """ Tell whether `elem` counts as an "in between" item of `l`.

        The answer depends on the position of the first occurrence
        of `elem`: position 0 qualifies only in a one item list,
        position 1 always qualifies, and any later position
        qualifies unless it is the last item. list.index raises
        ValueError when `elem` is not in `l`. """

        pos = l.index(elem)

        if pos == 0:
            return len(l) == 1
        if pos == 1:
            return True
        return pos < len(l) - 1
        
    def __make_word_filter(self, s):
        """ Create a word filter rule for HarvestMan.

        Word filter strings can be simple (a word or string, e.g.
        Python) or compound expressions using boolean logic, e.g.
        Python & Perl, Python || Perl, (Python || Perl) & Ruby.

        Returns a case-insensitive compiled regular expression, or
        None (after printing an error) when the numbers of opening
        and closing parentheses differ. """

        if s.count('(') != s.count(')'):
            print('Error in word regular expression')
            return None

        # Break the expression into pieces; they are collected in
        # self._rexplist.
        self.__parse_word_filter(s)
        pieces = self._rexplist

        # If NOT is one of the members, reverse the list.
        if '!' in pieces:
            pieces.reverse()

        return re.compile(self.__make_word_regexp(pieces), re.IGNORECASE)

    def __make_word_regexp(self, mylist):
        """ Build the regular expression text for a word filter.

        `mylist` is either a single expression string or a list of
        expressions (a nested list contributes its first item). The
        head expression is translated and, unless it is the last
        one, the translation of the remaining items is appended by
        recursing on mylist[1:]. Each word term is emitted as
        self._wordstr + term + '.*'. Returns the regex as a string. """

        is_list = True

        # Pick the head expression. NOTE(review): if mylist is
        # neither a str nor a list, 'elem' is never bound and the
        # next check raises NameError; an empty list raises
        # IndexError.
        if type(mylist) is str:
            is_list = False
            elem =  mylist
        elif type(mylist) is list:
            elem = mylist[0]

        if type(elem) is list:
            elem = elem[0]

        # eor: this call handles the last expression, so there is
        # nothing left to recurse into.
        eor = False
        if not is_list or len(mylist) == 1:
            eor = True

        s=''

        # Implementing NOT
        # A lone '!' wraps the regex of everything after it in a
        # negative lookahead and returns immediately.
        if elem == '!':
            return ''.join(('(?!', self.__make_word_regexp(mylist[1:]), ')'))
        # Implementing OR
        elif elem.find(' | ') != -1:
            listofors = elem.split(' | ')
            for o in listofors:
                # NOTE: 'index' is assigned but not used afterwards.
                index = listofors.index(o)
                # in_bet decides whether a '|' is put in front of
                # this alternative.
                in_bet = self.__is_inbetween(listofors, o)

                # Empty alternatives are skipped.
                if o:
                    o = self.__make_not_expr(o)
                    if in_bet:
                        s = ''.join((s, '|', self._wordstr, o, '.*'))
                    else:
                        s = ''.join((s, self._wordstr, o, '.*'))

        # Implementing AND
        # The terms are simply concatenated one after the other.
        elif elem.find(' & ') != -1:
            listofands = elem.split(' & ')

            for a in listofands:
                # NOTE: 'index' is assigned but not used afterwards.
                index = listofands.index(a)

                if a:
                    a = self.__make_not_expr(a)
                    s = ''.join((s, self._wordstr, a, '.*'))

        # Plain expression without ' | ' or ' & '.
        else:
            if elem:
                elem = self.__make_not_expr(elem)
                s = ''.join((self._wordstr, elem, '.*'))

        if eor:
            return s
        else:
            # Append the translation of the remaining expressions.
            return ''.join((s, self.__make_word_regexp(mylist[1:])))
        
                                          
            
                        
        
                    
                        
        
            

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -