rules.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,030 行 · 第 1/3 页

PY
1,030
字号
# -*- coding: utf-8 -*-
""" rules.py - Rules checker module for HarvestMan.

    This module is part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    Modification History
    --------------------

    Jan 8 2006          Anand    Updated this file from EIAO
                                 repository to get fixes for robot
                                 rules. Removed EIAO specific code.
                                 Put ext check rules before robots
                                 check to speed up things.
    Jan 10 2006         Anand    Converted from dos to unix format
                                 (removed Ctrl-Ms).
    April 11 2007       Anand    Not doing I.P comparison for
                                 non-robots.txt URLs in compare_domains
                                 method as it is erroneous.

    Copyright (C) 2004 Anand B Pillai.

"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import socket
import re
import os
import time
import copy

from harvestman.lib.event import HarvestManEvent
from harvestman.lib import robotparser
from harvestman.lib.methodwrapper import MethodWrapperMetaClass
from harvestman.lib import urlparser
from harvestman.lib import filters
from harvestman.lib.common.common import *
from harvestman.lib.common.netinfo import tlds
from harvestman.lib.common.lrucache import LRU

# Defining pluggable functions
__plugins__ = {'violates_rules_plugin': 'HarvestManRulesChecker:violates_rules'}

# Defining functions with callbacks
__callbacks__ = {'violates_rules_callback': 'HarvestManRulesChecker:violates_rules'}


class HarvestManRulesChecker(object):
    """ Class which checks the download rules for urls. These
    rules include depth checks, robot.txt rules checks, filter
    checks, external server/directory checks, duplicate url
    checks, maximum limits check etc. """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'rulesmgr'

    # Regular expression for matching www. (also www1., www2. ...)
    # in front of domains
    wwwre = re.compile(r'^www(\d*)\.')

    def __init__(self):
        self.reset()

    def reset(self):
        """ Reset all rule state: filter dict, caches and the
        reference to the global configuration object """

        # Dictionary of URL indices that have been filtered out
        self._filter = {}
        # Bounded caches of recently seen external servers/directories
        self._extservers = Ldeque(1000)
        self._extdirs = Ldeque(1000)
        # FIX: raw string - '[\s+<>]' contained a deprecated/invalid
        # string escape ('\s'); the pattern text is unchanged.
        self._wordstr = r'[\s+<>]'
        # LRU map of domain:port -> RobotFileParser; a None value
        # means the server has no robots.txt (see apply_rep)
        self._robots = LRU(1000)
        # White list of URL directories already cleared by robots.txt
        self._robocache = Ldeque(1000)
        self._invalidservers = Ldeque(1000)
        # Flag for making filters
        self._madefilters = False
        self._configobj = objects.config
        self.junkfilter = filters.HarvestManJunkFilter()

    def violates_rules(self, urlObj):
        """ Check the basic rules for this url object.
        This function returns True if the url object
        violates the rules, else returns False """

        # Raise event to allow custom logic to decide first
        ret = objects.eventmgr.raise_event('includelinks', urlObj)
        if ret == False:
            self.add_to_filter(urlObj.index)
            return True
        elif ret == True:
            return False

        url = urlObj.get_full_url()

        # New in 2.0 - if checking of rules on the type of this
        # URL is set to be skipped, return False
        if urlObj.typ in self._configobj.skipruletypes:
            return False

        # If this url exists in filter list, return True rightaway.
        # FIX: the original did self._filter[url.index] where 'url'
        # is a string, so the key was the str.index *method* object
        # and this fast path could never match. Use urlObj.index,
        # which is the key add_to_filter() stores (and what
        # apply_rep's Check #1 already uses).
        try:
            self._filter[urlObj.index]
            return True
        except KeyError:
            pass

        # Now apply the url filter
        if self.apply_url_filter(urlObj):
            extrainfo("Custom filter - filtered ", url)
            self.add_to_filter(urlObj.index)
            return True

        # Now apply the junk filter
        if self.junkfilter:
            if not self.junkfilter.check(urlObj):
                extrainfo("Junk Filter - filtered", url)
                self.add_to_filter(urlObj.index)
                return True

        # Check if this is an external link
        if self.is_external_link(urlObj):
            extrainfo("External link - filtered ", urlObj.get_full_url())
            self.add_to_filter(urlObj.index)
            return True

        # Now apply REP (robots exclusion protocol)
        if self.apply_rep(urlObj):
            extrainfo("Robots.txt rules prevents crawl of ", url)
            self.add_to_filter(urlObj.index)
            return True

        # Depth check
        if self.apply_depth_check(urlObj):
            extrainfo("Depth exceeds - filtered ", urlObj.get_full_url())
            self.add_to_filter(urlObj.index)
            return True

        return False

    def add_to_filter(self, urlindex):
        """ Add the link to the filter dictionary """

        self._filter[urlindex] = 1

    def compare_domains(self, domain1, domain2, robots=False):
        """ Compare two domains (servers) first by
        ip and then by name and return True if both point
        to the same server, return False otherwise. """

        # For comparing robots.txt file, first compare by
        # ip and then by name. For other URLs only a name
        # check is done, since I.P comparison there is
        # erroneous (see April 11 2007 history entry).
        if robots:
            firstval = self.compare_by_ip(domain1, domain2)
            if firstval:
                return firstval
            else:
                return self.compare_by_name(domain1, domain2)
        else:
            return self.compare_by_name(domain1, domain2)

    def _get_base_server(self, server):
        """ Return the base server name of the passed
        server (domain) name """

        # If the server name is of the form say bar.foo.com
        # or vodka.bar.foo.com, i.e there are more than one
        # '.' in the name, then we need to return the
        # last string containing a dot in the middle.
        if server.count('.') > 1:
            dotstrings = server.split('.')
            # Now the list is of the form => [vodka, bar, foo, com]

            # Skip over tld domain name endings such as .org.uk,
            # .mobi.uk etc. For example, if the server is
            # games.mobileworld.mobi.uk, then we need to return
            # mobileworld.mobi.uk, not mobi.uk
            dotstrings.reverse()
            idx = 0

            for item in dotstrings:
                if item.lower() in tlds:
                    idx += 1
                else:
                    # FIX: stop at the first non-tld label. Without
                    # this break, a tld-like label deeper in the name
                    # (e.g. 'tv' in tv.foo.com) was also counted,
                    # contradicting the documented "endings" intent.
                    break

            return '.'.join(dotstrings[idx::-1])
        else:
            # The server is of the form foo.com or just "foo"
            # so return it straight away
            return server

    def compare_no_tld(self, domain1, domain2):
        """ Compare two server names without their tld endings """

        # This will return True for www.foo.com, www.foo.org
        # foo.co.uk etc.
        dotstrings1 = self.wwwre.sub('', domain1.lower()).split('.')
        dotstrings2 = self.wwwre.sub('', domain2.lower()).split('.')

        l1 = [item for item in dotstrings1 if item not in tlds]
        l2 = [item for item in dotstrings2 if item not in tlds]

        debug(l1, l2)

        return '.'.join(l1) == '.'.join(l2)

    def compare_by_name(self, domain1, domain2):
        """ Compare two servers by their names. Return True
        if similar, False otherwise """

        # First check if both domains are same
        if domain1.lower() == domain2.lower():
            return True

        # www.foo.com and foo.com are assumed to be the same
        if self.wwwre.sub('', domain1.lower()) == self.wwwre.sub('', domain2.lower()):
            return True

        # If ignoretlds is set to True, return True for two servers
        # such as www.foo.com and www.foo.co.uk, www.foo.org etc.
        if self._configobj.ignoretlds:
            if self.compare_no_tld(domain1, domain2):
                return True

        if not self._configobj.subdomain:
            # Check whether the domain names belong to the same base
            # server. For example, this will return True for two
            # servers like server1.foo.com and server2.foo.com or
            # server1.base and server2.base.
            baseserver1 = self.wwwre.sub('', self._get_base_server(domain1))
            baseserver2 = self.wwwre.sub('', self._get_base_server(domain2))
            debug('Bases=>', baseserver1, baseserver2)

            if self._configobj.ignoretlds:
                if self.compare_no_tld(baseserver1, baseserver2):
                    return True

            # Instead of checking for equality, check for endswith.
            # This will return True even for cases like
            # vanhall-larenstein.nl and larenstein.nl
            return baseserver1.lower().endswith(baseserver2.lower())
        else:
            # If the subdomain variable is set, return False for two
            # servers like server1.foo.com and server2.foo.com, i.e
            # same base domain but different subdomains.
            return False

    def compare_by_ip(self, domain1, domain2):
        """ Compare two servers by their ip address. Return
        True if same, False otherwise """

        try:
            ip1 = socket.gethostbyname(domain1)
            ip2 = socket.gethostbyname(domain2)
        except Exception:
            # DNS resolution failed for either name, so equality
            # cannot be established
            return False

        return ip1 == ip2

    def apply_rep(self, urlObj):
        """ See if the robots.txt file on the server allows
        fetching of this url. Return False if fetching is
        allowed and True if it is blocked. (The original
        docstring said 0/1; the code returns booleans.) """

        # Robots option turned off
        if self._configobj.robots == 0:
            return False

        domport = urlObj.get_full_domain_with_port()
        # The robots.txt file url
        robotsfile = "".join((domport, '/robots.txt'))

        # Check #1: if this url exists in filter list,
        # return True rightaway
        try:
            self._filter[urlObj.index]
            return True
        except KeyError:
            pass

        url_directory = urlObj.get_url_directory()

        # Check #2: check if this directory is already
        # there in the white list
        try:
            self._robocache.index(url_directory)
            return False
        except ValueError:
            pass

        try:
            rp = self._robots[domport]
            # Check #4: if there is an entry, but it is None, it
            # means there is no robots.txt file on the server
            # (see below), so nothing is blocked.
            if not rp:
                return False
        except KeyError:
            # Not there, create a fresh one and add it
            rp = robotparser.RobotFileParser()
            rp.set_url(robotsfile)
            ret = rp.read()
            # Check #5
            if ret == -1:
                # No robots.txt file. Cache None for this server so
                # next time we dont need to do this operation again.
                self._robots[domport] = None
                return False
            else:
                # Set it
                self._robots[domport] = rp

        # Get user-agent from Spider
        ua = self._configobj.USER_AGENT

        # Check #6
        if rp.can_fetch(ua, url_directory):
            # Add to white list
            self._robocache.append(url_directory)
            return False

        # Cannot fetch - the caller (violates_rules) adds the URL
        # to the filter for quick look up later.
        return True

    def apply_word_filter(self, data):
        """ Apply the word filter """

        if self._configobj.wordfilter:
            if self._configobj.wordfilterre.search(data):
                return True
        # NOTE(review): the visible source was truncated at a page
        # break immediately after the inner check's 'else:' - the
        # remaining branch bodies are not visible here. Returning
        # False on all other paths is the assumed original behavior;
        # confirm against the full file before relying on this.
        return False

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?