rules.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,030 行 · 第 1/3 页

PY
1,030
字号
                return False        return False    def apply_url_filter(self, urlobj):        """ See if we have a filter matching the url.        Return 1 for blocking the url and 0 for allowing it """        inclfilter = self._configobj.inclfilter        exclfilter = self._configobj.exclfilter        # neither filters are enabled, return False        if not inclfilter and not exclfilter:            return 0        # We always check inclusion filter first since it is        # normally more specific than exclusion filter. Someone        # can request to not fetch any url containing /images/        # in the path, but still fetch the particular path        # /preferred/images. It will work only if we check for        # inclusion first and exclusion later.        inclcheck,exclcheck=-1,-1        matchincl, matchexcl=False,False        url = urlobj.get_full_url()                if inclfilter:            inclcheck=1            # see if we have a match            for f in inclfilter:                m=f.search(url)                if m:                    debug('Go-through filter for url ', url, 'found')                    matchincl=True                    inclcheck=0                    break        if exclfilter:            exclcheck=0            # see if we have a match            for f in exclfilter:                m=f.search(url)                if m:                    debug('No-pass filter for url ', url, 'found')                    matchexcl=True                    exclcheck=1                    break        if inclcheck==1:            extrainfo("Inclfilter does not allow this url", url)        if exclcheck==0:            extrainfo("Exclfilter allows this url", url)        # if exclfilter and inclfilter returns different results        # (exclfilter denys, inclfilter allows)        # we check the order of the filters in the global filter. Whichever        # comes first has precedence.        if inclcheck == 0 and exclcheck == 1:            globalfilter=self._configobj.allfilters            try:                indexincl=globalfilter.index(matchincl)            except:                indexincl=-1            try:                indexexcl=globalfilter.index(matchexcl)            except:                indexexcl=-1            if indexincl != -1 and indexexcl != -1:                if indexincl < indexexcl:                    # inclusion filter has precedence                    return inclcheck                else:                    # exclusion filter has precedence                    return exclcheck            else:                # error, return allow (0)                return 0        else:            # return whichever matched            if inclcheck != -1:                return inclcheck            elif exclcheck != -1:                return exclcheck            # none matched, allow it            else:                return 0         # We wont reach here        return 0    def apply_server_filter(self, urlObj):        """ See if we have a filter matching the server of        this url. Return 1 on success(blocked) and 0 on failure        (allowed) """        server = urlObj.get_domain()        serverinclfilter = self._configobj.serverinclfilter        serverexclfilter = self._configobj.serverexclfilter        if not serverexclfilter and not serverinclfilter: return 0        # We always check inclusion filter first since it is        # normally more specific than exclusion filter. Someone        # can request to not fetch any url containing /images/        # in the path, but still fetch the particular path        # /preferred/images. It will work only if we check for        # inclusion first and exclusion later.        inclcheck,exclcheck=-1,-1        matchincl, matchexcl=False,False        url = urlObj.get_full_url()        if serverinclfilter:            inclcheck = 1            for f in serverinclfilter:                # see if we have a match                m=re.search(re.compile(f,re.IGNORECASE), server)                if m:                    debug('Go-through filter for url ', url, 'found')                    matchincl=f                    inclcheck=0                    break        if serverexclfilter:            exclcheck = 1            for f in serverexclfilter:                # see if we have a match                m=re.search(re.compile(f,re.IGNORECASE), server)                if m:                    debug('No-pass filter for url ', url, 'found')                    matchexcl=f                    exclcheck=1                    break        if inclcheck==1:            extrainfo("Inclfilter does not allow this url", url)        if exclcheck==0:            extrainfo("Exclfilter allows this url", url)        # if exclfilter and inclfilter returns different results        # (exclfilter denys, inclfilter allows)        # we check the order of the filters in the global filter. Whichever        # comes first has precedence.        if inclcheck == 0 and exclcheck == 1:            globalfilter=self._configobj.allserverfilters            try:                indexincl=globalfilter.index(matchincl)            except:                indexincl=-1            try:                indexexcl=globalfilter.index(matchexcl)            except:                indexexcl=-1            if indexincl != -1 and indexexcl != -1:                if indexincl < indexexcl:                    # inclusion filter has precedence                    return inclcheck                else:                    # exclusion filter has precedence                    return exclcheck            else:                # error, return allow (0)                return 0        else:            # return whichever matched            if inclcheck != -1:                return inclcheck            elif exclcheck != -1:                return exclcheck            # none matched, allow it            else:                return 0         # We wont reach here        return 0    def is_under_starting_directory(self, urlObj):        """ Check whether the url in the url object belongs        to the same directory as the starting url for the        project """        directory = urlObj.get_url_directory()        baseUrlObj = objects.queuemgr.get_base_url()        if not baseUrlObj:            return True        # Bug: the original URL might have had been        # redirected, so its base URL might have got        # changed. We need to check with the original        # URL in such cases.        # Sample site: http://www.vegvesen.no        if baseUrlObj.reresolved:            bdir = baseUrlObj.get_original_url_directory()        else:            bdir = baseUrlObj.get_url_directory()                    # print 'BASEDIR=>',bdir        # print 'DIRECTORY=>',directory        # Look for bdir inside dir        index = directory.find(bdir)                if index == 0:            return True        # Sometimes a simple string match        # is not good enough. May be both        # the directories are the same but        # the server names are slightly different        # ex: www-106.ibm.com and www.ibm.com        # for developerworks links.        # Check if both of them are in the same        # domain        if self.compare_domains(urlObj.get_domain(), baseUrlObj.get_domain()):            debug('Domains',urlObj.get_domain(),'and',baseUrlObj.get_domain(),'compare fine')            # Get url directory sans domain            directory = urlObj.get_url_directory_sans_domain()            bdir = baseUrlObj.get_url_directory_sans_domain()            debug('Directories',directory,bdir)                        # Check again            if directory.find(bdir) == 0:                return True        return False                def is_external_server_link(self, urlObj):        """ Check whether the url in the url object belongs to        an external server """        # Get the tracker queue object        baseUrlObj = objects.queuemgr.get_base_url()        if not baseUrlObj:            return False        # Check based on the server        server = urlObj.get_domain()        baseserver = baseUrlObj.get_domain()        return not self.compare_domains( server, baseserver )    def is_external_link(self, urlObj):        """ Check if the url is an external link relative to starting url,        using the download rules set by the user """        # Example.        # Assume our start url is 'http://www.server.com/files/images/index.html"        # Then any url which starts with another server name or at a level        # above the start url's base directory on the same server is considered        # an external url        # i.e, http://www.yahoo.com will be external because of        # 1st reason &        # http://www.server.com/files/search.cgi will be external link because of        # 2nd reason.        # External links ?        # if under the same starting directory, return False        if self.is_under_starting_directory(urlObj):            return False        directory = urlObj.get_url_directory()        baseUrlObj = objects.queuemgr.get_base_url()        if not baseUrlObj:            return False        if urlObj.get_type() == 'stylesheet':            if self._configobj.getstylesheets: return False        elif urlObj.get_type() == 'image':            if self._configobj.getimagelinks: return False        if not self.is_external_server_link(urlObj):            debug('Same!')            if self._configobj.fetchlevel==0:                return True                        elif self._configobj.fetchlevel==3:                # check for the directory of the parent url                # if it is same as starting directory, allow this                # url, else deny                try:                    parentUrlObj = urlObj.get_parent_url()                    if not parentUrlObj:                        return False                    parentdir = parentUrlObj.get_url_directory()                    bdir = baseUrlObj.get_url_directory()                    if parentdir == bdir:                        self._increment_ext_directory_count(directory)                        return False                    else:                        return True                except urlparser.HarvestManUrlError, e:                    logconsole(e)                                elif self._configobj.fetchlevel > 0:                # do other checks , just fall through                pass                        # Increment external directory count            # directory = urlObj.get_url_directory()            # res=self._ext_directory_check(directory)            # if not res:            #    extrainfo("External directory error - filtered!")            #    return True            # Apply depth check for external dirs here            if self._configobj.extdepth:                if self.apply_depth_check(urlObj, mode=2):                    return True            #if self._configobj.epagelinks:            #    # We can get external links belonging to same server,            #    # so this is not an external link            #    return False            #else:            #    # We cannot get external links belonging to same server,            #    # so this is an external link            #    return True            return False        else:            # print 'Different!',self._configobj.fetchlevel            debug('Different!')            # Both belong to different base servers            if self._configobj.fetchlevel==0 or self._configobj.fetchlevel == 1:                return True            elif self._configobj.fetchlevel==2 or self._configobj.fetchlevel==3:                # check whether the baseurl (parent url of this url)                # belongs to the starting server. If so allow fetching                # else deny. ( we assume the baseurl path is not relative! :-)                try:                    parentUrlObj = urlObj.get_parent_url()                    baseserver = baseUrlObj.get_domain()

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?