rules.py
来自「Harvestman-最新版本」· Python 代码 · 共 1,030 行 · 第 1/3 页
PY
1,030 行
# -- coding: utf-8
""" rules.py - Rules checker module for HarvestMan.
    This module is part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    Modification History
    --------------------

     Jan 8 2006       Anand  Updated this file from EIAO repository
                             to get fixes for robot rules. Removed
                             EIAO specific code. Put ext check rules
                             before robots check to speed up things.
     Jan 10 2006      Anand  Converted from dos to unix format (removed Ctrl-Ms).
     April 11 2007    Anand  Not doing I.P comparison for non-robots.txt
                             URLs in compare_domains method as it is
                             erroneous.

   Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import socket
import re
import os
import time
import copy

from harvestman.lib.event import HarvestManEvent
from harvestman.lib import robotparser
from harvestman.lib.methodwrapper import MethodWrapperMetaClass
from harvestman.lib import urlparser
from harvestman.lib import filters
from harvestman.lib.common.common import *
from harvestman.lib.common.netinfo import tlds
from harvestman.lib.common.lrucache import LRU

# Defining pluggable functions
__plugins__ = {'violates_rules_plugin': 'HarvestManRulesChecker:violates_rules'}

# Defining functions with callbacks
__callbacks__ = {'violates_rules_callback': 'HarvestManRulesChecker:violates_rules'}


class HarvestManRulesChecker(object):
    """ Class which checks the download rules for urls. These
    rules include depth checks, robot.txt rules checks, filter
    checks, external server/directory checks, duplicate url
    checks, maximum limits check etc. """

    # For supporting callbacks
    __metaclass__ = MethodWrapperMetaClass
    alias = 'rulesmgr'

    # Regular expression for matching www. infront of domains
    wwwre = re.compile(r'^www(\d*)\.')

    def __init__(self):
        self.reset()

    def reset(self):
        """ Reset all rule state: filter dict, external server/dir
        caches, robots caches and the junk filter """

        self._filter = {}
        self._extservers = Ldeque(1000)
        self._extdirs = Ldeque(1000)
        self._wordstr = '[\s+<>]'
        self._robots = LRU(1000)
        self._robocache = Ldeque(1000)
        self._invalidservers = Ldeque(1000)
        # Flag for making filters
        self._madefilters = False
        self._configobj = objects.config
        self.junkfilter = filters.HarvestManJunkFilter()

    def violates_rules(self, urlObj):
        """ Check the basic rules for this url object,
        This function returns True if the url object
        violates the rules, else returns False """

        # raise event to allow custom logic
        ret = objects.eventmgr.raise_event('includelinks', urlObj)
        if ret==False:
            self.add_to_filter(urlObj.index)
            return True
        elif ret==True:
            return False

        url = urlObj.get_full_url()

        # New in 2.0
        # If checking of rules on the type of this URL
        # is set to be skipped, return False
        if urlObj.typ in self._configobj.skipruletypes:
            return False

        # if this url exists in filter list, return
        # True rightaway
        try:
            # BUGFIX: was self._filter[url.index] - url is a string, so
            # url.index is the str.index bound method and the lookup
            # could never hit. The filter dict is keyed by urlObj.index
            # (see add_to_filter and apply_rep check #1).
            self._filter[urlObj.index]
            return True
        except KeyError:
            pass

        # now apply the url filter
        if self.apply_url_filter(urlObj):
            extrainfo("Custom filter - filtered ", url)
            self.add_to_filter(urlObj.index)
            return True

        # now apply the junk filter
        if self.junkfilter:
            if not self.junkfilter.check(urlObj):
                extrainfo("Junk Filter - filtered", url)
                self.add_to_filter(urlObj.index)
                return True

        # check if this is an external link
        if self.is_external_link( urlObj ):
            extrainfo("External link - filtered ", urlObj.get_full_url())
            self.add_to_filter(urlObj.index)
            return True

        # now apply REP
        if self.apply_rep(urlObj):
            extrainfo("Robots.txt rules prevents crawl of ", url)
            self.add_to_filter(urlObj.index)
            return True

        # depth check
        if self.apply_depth_check(urlObj):
            extrainfo("Depth exceeds - filtered ", urlObj.get_full_url())
            self.add_to_filter(urlObj.index)
            return True

        return False

    def add_to_filter(self, urlindex):
        """ Add the link to the filter dictionary """

        self._filter[urlindex] = 1

    def compare_domains(self, domain1, domain2, robots=False):
        """ Compare two domains (servers) first by ip and then
        by name and return True if both point to the same server,
        return False otherwise. """

        # For comparing robots.txt file, first compare by
        # ip and then by name.
        if robots:
            firstval = self.compare_by_ip(domain1, domain2)
            if firstval:
                return firstval
            else:
                return self.compare_by_name(domain1, domain2)
        # otherwise, we do only a name check
        else:
            return self.compare_by_name(domain1, domain2)

    def _get_base_server(self, server):
        """ Return the base server name of the passed
        server (domain) name """

        # If the server name is of the form say bar.foo.com
        # or vodka.bar.foo.com, i.e there are more than one
        # '.' in the name, then we need to return the
        # last string containing a dot in the middle.
        if server.count('.') > 1:
            dotstrings = server.split('.')
            # now the list is of the form => [vodka, bar, foo, com]

            # Skip the list for skipping over tld domain name endings
            # such as .org.uk, .mobi.uk etc. For example, if the
            # server is games.mobileworld.mobi.uk, then we
            # need to return mobileworld.mobi.uk, not mobi.uk
            dotstrings.reverse()
            idx = 0

            for item in dotstrings:
                if item.lower() in tlds:
                    idx += 1

            return '.'.join(dotstrings[idx::-1])
        else:
            # The server is of the form foo.com or just "foo"
            # so return it straight away
            return server

    def compare_no_tld(self, domain1, domain2):
        """ Compare two server names without their tld endings """

        # This will return True for www.foo.com, www.foo.org
        # foo.co.uk etc.
        dotstrings1 = self.wwwre.sub('', domain1.lower()).split('.')
        dotstrings2 = self.wwwre.sub('', domain2.lower()).split('.')

        l1 = [item for item in dotstrings1 if item not in tlds]
        l2 = [item for item in dotstrings2 if item not in tlds]

        debug(l1, l2)
        return '.'.join(l1) == '.'.join(l2)

    def compare_by_name(self, domain1, domain2):
        """ Compare two servers by their names. Return True
        if similar, False otherwise """

        # first check if both domains are same
        if domain1.lower() == domain2.lower():
            return True

        # Check whether we are comparing something like www.foo.com
        # and foo.com, they are assumed to be same.
        if self.wwwre.sub('', domain1.lower()) == self.wwwre.sub('', domain2.lower()):
            return True

        # If ignoretlds is set to True, return True for two servers such
        # as www.foo.com and www.foo.co.uk, www.foo.org etc.
        if self._configobj.ignoretlds:
            if self.compare_no_tld(domain1, domain2):
                return True

        if not self._configobj.subdomain:
            # Checks whether the domain names belong to
            # the same base server, if the above config
            # variable is set. For example, this will
            # return True for two servers like server1.foo.com
            # and server2.foo.com or server1.base and server2.base
            baseserver1 = self.wwwre.sub('', self._get_base_server(domain1))
            baseserver2 = self.wwwre.sub('', self._get_base_server(domain2))
            debug('Bases=>', baseserver1, baseserver2)

            if self._configobj.ignoretlds:
                if self.compare_no_tld(baseserver1, baseserver2):
                    return True

            # Instead of checking for equality, check for endswith.
            # This will return True even for cases like
            # vanhall-larenstein.nl and larenstein.nl
            return baseserver1.lower().endswith(baseserver2.lower())
            # return (baseserver1.lower() == baseserver2.lower())
        else:
            # if the subdomain variable is set will return False for two
            # servers like server1.foo.com and server2.foo.com i.e with
            # same base domain but different subdomains.
            return False

    def compare_by_ip(self, domain1, domain2):
        """ Compare two servers by their ip address. Return
        True if same, False otherwise """

        try:
            ip1 = socket.gethostbyname(domain1)
            ip2 = socket.gethostbyname(domain2)
        except Exception:
            return False

        if ip1==ip2:
            return True
        else:
            return False

    def apply_rep(self, urlObj):
        """ See if the robots.txt file on the server
        allows fetching of this url. Return 0 on success
        (fetching allowed) and 1 on failure(fetching blocked) """

        # robots option turned off
        if self._configobj.robots==0:
            return False

        domport = urlObj.get_full_domain_with_port()
        # The robots.txt file url
        robotsfile = "".join((domport, '/robots.txt'))

        # Check #1
        # if this url exists in filter list, return
        # True rightaway
        try:
            self._filter[urlObj.index]
            return True
        except KeyError:
            pass

        url_directory = urlObj.get_url_directory()

        # Check #2: Check if this directory
        # is already there in the white list
        try:
            self._robocache.index(url_directory)
            return False
        except ValueError:
            pass

        try:
            rp = self._robots[domport]
            # Check #4
            # If there is an entry, but it
            # is None, it means there is no
            # robots.txt file in the server
            # (see below). So return False.
            if not rp:
                return False
        except KeyError:
            # Not there, create a fresh
            # one and add it.
            rp = robotparser.RobotFileParser()
            rp.set_url(robotsfile)
            ret = rp.read()
            # Check #5
            if ret==-1:
                # no robots.txt file
                # Set the entry for this
                # server as None, so next
                # time we dont need to do
                # this operation again.
                self._robots[domport] = None
                return False
            else:
                # Set it
                self._robots[domport] = rp

        # Get user-agent from Spider
        ua = self._configobj.USER_AGENT

        # Check #6
        if rp.can_fetch(ua, url_directory):
            # Add to white list
            self._robocache.append(url_directory)
            return False

        # Cannot fetch, so add to filter
        # for quick look up later.
        return True

    def apply_word_filter(self, data):
        """ Apply the word filter """

        if self._configobj.wordfilter:
            if self._configobj.wordfilterre.search(data):
                return True
            else:
                # NOTE(review): the source listing is truncated at this
                # point (page 1/3 of 1,030 lines); the tail of this
                # method is reconstructed minimally - verify against the
                # full file.
                return False
        return True
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?