rules.py (web spider module, Python) - page 1 of 3

            if indexincl != -1 and indexexcl != -1:
                if indexincl < indexexcl:
                    # inclusion filter has precedence
                    return inclcheck
                else:
                    # exclusion filter has precedence
                    return exclcheck
            else:
                # error, return allow (0)
                return 0
        else:
            # return whichever matched
            if matchincl: return inclcheck
            elif matchexcl: return exclcheck
            # none matched, allow it
            else: return 0 

        # We won't reach here
        return 0

    def __apply_server_filter(self, urlObj):
        """ See if we have a filter matching the server of
        this url. Return 1 on success(blocked) and 0 on failure
        (allowed) """

        server = urlObj.get_domain()

        serverinclfilter = self._configobj.serverinclfilter
        serverexclfilter = self._configobj.serverexclfilter

        if not serverexclfilter and not serverinclfilter: return 0

        # We always check inclusion filter first since it is
        # normally more specific than exclusion filter. Someone
        # can request to not fetch any url containing /images/
        # in the path, but still fetch the particular path
        # /preferred/images. It will work only if we check for
        # inclusion first and exclusion later.
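        #
        # A minimal illustration (hypothetical patterns, not part of this
        # module). Note that this method matches the patterns against the
        # server name, not the url path:
        #
        #   serverinclfilter = ['images\.server\.com']
        #   serverexclfilter = ['server\.com']
        #
        # The server 'images.server.com' matches both lists, so inclcheck
        # becomes 0 and exclcheck becomes 1; that conflict is resolved
        # further below by the order of the two patterns in the global
        # allserverfilters list.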
        inclcheck,exclcheck=-1,-1
        matchincl, matchexcl='',''

        url = urlObj.get_full_url()

        if serverinclfilter:
            # not allowed unless one of the patterns matches below
            inclcheck = 1

            for f in serverinclfilter:
                # see if we have a match
                m=re.search(re.compile(f), server)

                if m:
                    extrainfo('Go-through filter for url ', url, 'found')
                    matchincl=f
                    inclcheck=0
                    break

        if serverexclfilter:
            # allowed unless one of the patterns matches below
            exclcheck = 0
            for f in serverexclfilter:
                # see if we have a match
                m=re.search(re.compile(f), server)

                if m:
                    extrainfo('No-pass filter for url ', url, 'found')
                    matchexcl=f
                    self.add_to_filter(url)               
                    exclcheck=1
                    break

        if inclcheck==1:
            extrainfo("Inclfilter does not allow this url", url)
        if exclcheck==0:
            extrainfo("Exclfilter allows this url", url)

        # if exclfilter and inclfilter return different results
        # (exclfilter denies, inclfilter allows)
        # we check the order of the filters in the global filter list.
        # Whichever comes first has precedence.
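        # Sketch of that order check with the hypothetical patterns from
        # above (plain list.index arithmetic):
        #
        #   >>> allserverfilters = ['images\.server\.com', 'server\.com']
        #   >>> allserverfilters.index('images\.server\.com')
        #   0
        #   >>> allserverfilters.index('server\.com')
        #   1
        #
        # The inclusion pattern appears first (0 < 1), so its verdict
        # (allow) wins.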
        if inclcheck == 0 and exclcheck == 1:
            globalfilter=self._configobj.allserverfilters
            try:
                indexincl=globalfilter.index(matchincl)
            except ValueError:
                indexincl=-1
            try:
                indexexcl=globalfilter.index(matchexcl)
            except ValueError:
                indexexcl=-1

            if indexincl != -1 and indexexcl != -1:
                if indexincl < indexexcl:
                    # inclusion filter has precedence
                    return inclcheck
                else:
                    # exclusion filter has precedence
                    return exclcheck
            else:
                # error, return allow (0)
                return 0
        else:
            # return whichever matched
            if matchincl: return inclcheck
            elif matchexcl: return exclcheck
            # none matched, allow it
            else: return 0 

        # We won't reach here
        return 0
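
    # Hypothetical usage of the filter above: with serverexclfilter
    # containing the pattern 'ads\.' (an assumption, not from the
    # original config) and an url object for
    # http://ads.server.com/banner.gif, the pattern matches the domain,
    # the url is added to the filter list and the method returns 1
    # (blocked).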

    def is_under_starting_directory(self, urlObj):
        """ Check whether the url in the url object belongs
        to the same directory as the starting url for the
        project """

        dir = urlObj.get_url_directory()
        # Get the tracker queue object
        tq = GetObject('trackerqueue')
        baseUrlObj = tq.get_base_urlobject()
        if baseUrlObj is None:
            return True

        bdir = baseUrlObj.get_url_directory()

        index = dir.find(bdir)

        if index != -1:
            return True

        return False
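
    # Worked example (hypothetical urls): if the project started at
    # http://www.foo.com/files/images/index.html, bdir is the
    # .../files/images directory, and a url whose directory is
    # .../files/images/icons contains bdir as a substring, so it counts
    # as being under the starting directory. Note the test is find(),
    # not startswith(), so any occurrence of bdir in dir qualifies.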

    def is_external_server_link(self, urlObj):
        """ Check whether the url in the url object belongs to
        an external server """

        # Get the tracker queue object
        tq = GetObject('trackerqueue')
        baseUrlObj = tq.get_base_urlobject()
        if baseUrlObj is None:
            return False

        # Check based on the server
        server = urlObj.get_domain()
        baseserver = baseUrlObj.get_domain()
        
        return not self.__compare_domains( server, baseserver )
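
    # For example (hypothetical domains, and assuming __compare_domains
    # returns true when the two domains are judged the same): with a
    # base url on www.server.com, an url on www.yahoo.com fails the
    # comparison and is reported as an external server link.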

    def __is_external_link(self, urlObj):
        """ Check if the url is an external link relative to starting url,
        using the download rules set by the user """

        # Example:
        # Assume our start url is 'http://www.server.com/files/images/index.html'.
        # Then any url which starts with another server name, or which is at a
        # level above the start url's base directory on the same server, is
        # considered an external url.
        # i.e, http://www.yahoo.com will be external for the
        # 1st reason &
        # http://www.server.com/files/search.cgi will be an external link for
        # the 2nd reason.

        # if under the same starting directory, return False
        if self.is_under_starting_directory(urlObj):
            return False

        dir = urlObj.get_url_directory()

        tq = GetObject('trackerqueue')
        baseUrlObj = tq.get_base_urlobject()
        if baseUrlObj is None:
            return False

        if not self.is_external_server_link(urlObj):
            # print 'Same server ', urlObj.domain, baseUrlObj.domain
            if self._configobj.fetchlevel==0:
                return True
            elif self._configobj.fetchlevel==3:
                # check for the directory of the parent url
                # if it is same as starting directory, allow this
                # url, else deny
                try:
                    parentUrlObj = urlObj.get_base_urlobject()
                    if parentUrlObj is None:
                        return False

                    parentdir = parentUrlObj.get_url_directory()
                    bdir = baseUrlObj.get_url_directory()

                    if parentdir == bdir:
                        self.__increment_ext_directory_count(dir)
                        return False
                    else:
                        return True
                except HarvestManUrlParserError, e:
                    print e
            elif self._configobj.fetchlevel > 0:
                # this option takes precedence over the
                # extpagelinks option, so set extpagelinks
                # option to true.
                self._configobj.epagelinks=1
                # do other checks , just fall through

            # Increment external directory count
            dir = urlObj.get_url_directory()

            res=self.__ext_directory_check(dir)
            if not res:
                extrainfo("External directory error - filtered!")
                self.add_to_filter(urlObj.get_full_url())
                return True

            # Apply depth check for external dirs here
            if self._configobj.extdepth:
                if self.__apply_depth_check(urlObj, mode=2):
                    return True

            if self._configobj.epagelinks:
                # We can get external links belonging to same server,
                # so this is not an external link
                return False
            else:
                # We cannot get external links belonging to same server,
                # so this is an external link
                self.add_to_filter(urlObj.get_full_url())
                return True
        else:
            # print 'Different server ', urlObj.domain, baseUrlObj.domain
            # print 'Fetchlevel ', self._configobj.fetchlevel
            # Both belong to different base servers
            if self._configobj.fetchlevel==0 or self._configobj.fetchlevel == 1:
                return True
            elif self._configobj.fetchlevel==2 or self._configobj.fetchlevel==3:
                # check whether the baseurl (parent url of this url)
                # belongs to the starting server. If so allow fetching
                # else deny. ( we assume the baseurl path is not relative! :-)
                try:
                    parentUrlObj = urlObj.get_base_urlobject()
                    baseserver = baseUrlObj.get_domain()

                    if parentUrlObj is None:
                        return False

                    server = urlObj.get_domain()
                    if parentUrlObj.get_domain() == baseserver:
                        self.__increment_ext_server_count(server)
                        return False
                    else:
                        return True
                except HarvestManUrlParserError, e:
                    print e
            elif self._configobj.fetchlevel>3:
                # this option takes precedence over the
                # extserverlinks option, so set extserverlinks
                # option to true.
                self._configobj.eserverlinks=1
                # do other checks , just fall through
                
            res = self.__ext_server_check(urlObj.get_domain())

            if not res:
                self.add_to_filter(urlObj.get_full_url())
                return True

            # Apply filter for servers here
            if self.__apply_server_filter(urlObj):
                return True

            # Apply depth check for external servers here
            if self._configobj.extdepth:
                if self.__apply_depth_check(urlObj, mode=2):
                    return True

            if self._configobj.eserverlinks:
                # We can get links belonging to another server, so
                # this is NOT an external link
                return False
            else:
                # We cannot get external links belonging to another server,
                # so this is an external link
                self.add_to_filter(urlObj.get_full_url())
                return True

        # We should not reach here
        return False
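
    # Summary of the fetchlevel handling above, for a url that is not
    # under the starting directory:
    #
    #   same server:
    #     0    -> external
    #     1, 2 -> epagelinks forced on; allowed, subject to the external
    #             directory and depth limits
    #     3    -> allowed only if the parent url's directory is the
    #             starting directory
    #     > 3  -> as for 1 and 2
    #   different server:
    #     0, 1 -> external
    #     2, 3 -> allowed only if the parent url is on the starting
    #             server
    #     > 3  -> eserverlinks forced on; allowed, subject to the external
    #             server count, server filter and depth limits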

    def __apply_depth_check(self, urlObj, mode=0):
        """ Apply the depth setting for this url, if any """

        # a depth value of -1 means no depth-check
        tq = GetObject('trackerqueue')
        baseUrlObj = tq.get_base_urlobject()
        if baseUrlObj is None:
            return False

        reldepth = urlObj.get_relative_depth(baseUrlObj, mode)

        if reldepth != -1:
            # check if this exceeds allowed depth
            if mode == 0 and self._configobj.depth != -1:
                if reldepth > self._configobj.depth:
                    self.add_to_filter(urlObj.get_full_url())
                    return True
            elif mode == 2 and self._configobj.extdepth:
                if reldepth > self._configobj.extdepth:
                    self.add_to_filter(urlObj.get_full_url())
                    return True

        return False
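
    # Worked example (hypothetical urls, and assuming get_relative_depth
    # returns the number of directory levels the url sits below the base
    # url): with depth configured as 2 and a base url of
    # http://foo.com/a/index.html, the url http://foo.com/a/b/c/d/x.html
    # has a relative depth of 3, exceeds the limit and is filtered.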

    def __ext_directory_check(self, directory):
        """ Check whether the directory <directory>
        should be considered external """

        index=self.__increment_ext_directory_count(directory)

        # Are we above a prescribed limit ?
        if self._configobj.maxextdirs and len(self._extdirs)>self._configobj.maxextdirs:
            if index != -1:
                if index <= self._configobj.maxextdirs:
                    # directory index was below the limit, allow its urls
                    return True
                else:
                    # directory index was above the limit, block its urls
                    return False
            else:
                # new directory, block its urls
                return False
        else:
            return True
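
    # Trace of the limit logic with maxextdirs = 2 (hypothetical value):
    # once len(self._extdirs) exceeds 2, a directory already recorded at
    # index 0, 1 or 2 is still allowed (index <= maxextdirs) while a
    # newly seen directory (index == -1) is blocked. Since index is
    # 0-based, up to maxextdirs + 1 distinct directories get through.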

    def __ext_server_check(self, server):
        """ Check whether the server <server> should be considered
        external """

        index=self.__increment_ext_server_count(server)

        # are we above a prescribed limit ?
        if self._configobj.maxextservers and len(self._extservers)>self._configobj.maxextservers:
            if index != -1:
                if index <= self._configobj.maxextservers:
                    # server index was below the limit, allow its urls
                    return True
                else:
                    # server index was above the limit, block its urls
                    return False
            else:
                # new server, block its urls
                return False
        else:
            return True

    def __increment_ext_directory_count(self, directory):
        """ Record <directory> in the external directory list, if it
        is new. Return its previous index in the list, or -1 if it
        was just added """

        try:
            self._dataLock.acquire()
            index=-1
            try:
                index=self._extdirs.index(directory)
            except ValueError:
                self._extdirs.append(directory)
        finally:
            self._dataLock.release()

        return index

    def __increment_ext_server_count(self,server):
        """ Record <server> in the external server list, if it is
        new. Return its previous index in the list, or -1 if it was
        just added """

        try:
            self._dataLock.acquire()
            index=-1
            try:
                index=self._extservers.index(server)
            except ValueError:
                self._extservers.append(server)
        finally:
            self._dataLock.release()

        return index
