""" HarvestManRules.py - Rules checker module for HarvestMan.
This software is part of the HarvestMan(R) program.
Copyright (C) 2004-2005 Anand B Pillai.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
Modification History
Dec 15 2003 Added word search function for 1.3 release.
Jan 2 2004 1.4 bug-fix version development started.
Feb 10 2004 Fixed bug in checking of external servers.
Bug ID# B1076402348.52.
Fixed bug in checking of servers in the same
base domain. Bug ID #B1076409910.45.
1.3.1 bug fix version released.
May 4-9 2004 1.4 development.
*Added priority based rule. Added
function __make_priority which
creates a priority dictionary based
on file extensions.
*Modified url filter function.
Regular expressions are pre-compiled.
*Modified rules checking function.
Reordered the functions that check
various constraints. url filter check
comes before the forced download check
for stylesheets/images. The rest come
afterwards.
*Rewrote the function __compare_by_name
to fix bug B1083256752.28 (browsing
problem for localhost).
Jun 14 2004 Anand 1.3.9 release.
"""
import threading
import socket
import re
from threading import Lock, Condition
import robotparser
from common import *
# HarvestMan url parser module
from urlparser import HarvestManUrlParser, HarvestManUrlParserError
class harvestManRulesChecker:
""" Class which checks the download rules for urls. These
rules include depth checks, robot.txt rules checks, filter
checks, external server/directory checks, duplicate url
checks, maximum limits check etc. """
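
    # Typical usage (a sketch, not part of the original module): the global
    # 'config' object must already be registered so that GetObject('config')
    # works before this class is created. A crawler would then do something
    # like
    #
    #   rules = harvestManRulesChecker()
    #   if not rules.violates_download_rules(urlobj):
    #       ... schedule urlobj for download ...
    #
    # where urlobj is a HarvestManUrlParser instance.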
def __init__(self):
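        # Default instance attributes, assigned through __dict__ directly so
        # they bypass the __setattr__ override below. _filter holds urls that
        # have already been rejected, and _rp holds the shared robots.txt
        # parser (created further below only when robots rules are enabled).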
self.__dict__= {'_links' : [],
'_filter' : [],
'_extservers': [],
'_extdirs' : [],
'_counter' : 0,
'_rexplist' : [],
'_wordstr' : '[\s+<>]',
'_rp': None
}
# Locks
self._dataLock = Condition( Lock())
# Configure robotparser object if rep rule is specified
self._configobj = GetObject('config')
if self._configobj.robots:
self._rp = robotparser.RobotFileParser()
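
    # Attribute lookups fall through to the instance __dict__ (see
    # __getattr__ below); a missing attribute reads as None instead of
    # raising AttributeError, and __setattr__ stores values there directly.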
def __getattr__(self, name):
try:
return self.__dict__[name]
except KeyError:
return None
def __setattr__(self, name, value):
self.__dict__[name] = value
def violates_basic_rules(self, urlObj):
""" Check the basic rules for this url object,
This function returns True if the url object
violates the rules, else returns False """
# stylesheet check, if config is set to always
# fetch stylesheets of a page(the default), return False
if urlObj.get_type() == 'stylesheet':
if self._configobj.getstylesheets: return False
elif urlObj.get_type() == 'image':
if self._configobj.getimagelinks: return False
# check if this is an external link
if self.__is_external_link( urlObj ):
extrainfo("External link - filtered ", urlObj.get_full_url())
return True
# depth check
if self.__apply_depth_check(urlObj):
extrainfo("Depth exceeds - filtered ", urlObj.get_full_url())
return True
return False
def violates_download_rules(self, urlObj, noextordepthcheck=0):
""" Check download rules for this url object """
url = urlObj.get_full_url()
        # If this url already exists in the filter list,
        # return True right away
        if url in self._filter:
            return True
# now apply the url filter
if self.__apply_url_filter(url):
extrainfo("Custom filter - filtered ", url)
return True
        # Stylesheet/image check: if config is set to always
        # fetch stylesheets/images of a page (the default),
        # return False. This check must come after the url filter
        # check and before the other checks, since the default
        # setting for stylesheets and images has a lower priority
        # than url filters, but a higher priority than constraints
        # like the depth/external checks.
if urlObj.get_type() == 'stylesheet':
if self._configobj.getstylesheets: return False
elif urlObj.get_type() == 'image':
if self._configobj.getimagelinks: return False
# check if this is an external link
if not noextordepthcheck and self.__is_external_link(urlObj):
extrainfo("External link - filtered ", url)
return True
# depth check
if not noextordepthcheck and self.__apply_depth_check(urlObj):
extrainfo("Exceeds depth checks - filtered", url)
return True
# now apply REP
if self.__apply_rep(urlObj):
extrainfo("Robots.txt rules prevents download of ", url)
return True
return False
def add_to_filter(self, link):
""" Add the link to the filter list """
        try:
            self._dataLock.acquire()
            # Append the link only if it is not already in the list
            if link not in self._filter:
                self._filter.append(link)
        finally:
            self._dataLock.release()
def __compare_domains(self, domain1, domain2, robots=False):
""" Compare two domains (servers) first by
ip and then by name and return True if both point
to the same server, return False otherwise. """
# For comparing robots.txt file, first compare by
# ip and then by name.
if robots:
firstval=self.__compare_by_ip(domain1, domain2)
if firstval:
return firstval
else:
return self.__compare_by_name(domain1, domain2)
# otherwise, we do a name check first and
# ip check later
else:
firstval=self.__compare_by_name(domain1, domain2)
if firstval:
return firstval
else:
return self.__compare_by_ip(domain1, domain2)
def __get_base_server(self, server):
""" Return the base server name of the passed
server (domain) name """
        # If the server name is of the form say bar.foo.com
        # or vodka.bar.foo.com, i.e. there is more than one
        # '.' in the name, then we need to return just the
        # last two components, which form the base domain.
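        # e.g. 'vodka.bar.foo.com' -> 'foo.com'; a name like 'foo.com'
        # or plain 'foo' is returned unchanged.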
if server.count('.') > 1:
dotstrings = server.split('.')
# now the list is of the form => [vodka, bar, foo, com]
# Return the last two items added with a '.'
# in between
return "".join((dotstrings[-2], ".", dotstrings[-1]))
else:
# The server is of the form foo.com or just "foo"
# so return it straight away
return server
def __compare_by_name(self, domain1, domain2):
""" Compare two servers by their names. Return True
if similar, False otherwise """
# first check if both domains are same
if domain1.lower() == domain2.lower(): return True
if not self._configobj.subdomain:
# Checks whether the domain names belong to
# the same base server, if the above config
# variable is set. For example, this will
# return True for two servers like server1.foo.com
# and server2.foo.com or server1.base and server2.base
baseserver1 = self.__get_base_server(domain1)
baseserver2 = self.__get_base_server(domain2)
            if baseserver1.lower() == baseserver2.lower():
                return True
            else:
                # Base domains differ
                return False
        else:
            # If the subdomain variable is set, return False for
            # two servers like server1.foo.com and server2.foo.com,
            # i.e. servers with the same base domain but different
            # subdomains.
            return False
def __compare_by_ip(self, domain1, domain2):
""" Compare two servers by their ip address. Return
True if same, False otherwise """
ip1='127.0.0.1'
ip2='127.0.0.1'
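        # If either lookup below fails, both values keep the loopback
        # default, so unresolvable hosts end up comparing as equal.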
try:
ip1 = socket.gethostbyname(domain1)
ip2 = socket.gethostbyname(domain2)
except:
pass
if ip1==ip2: return True
else: return False
def __apply_rep(self, urlObj):
""" See if the robots.txt file on the server
allows fetching of this url. Return 0 on success
(fetching allowed) and 1 on failure(fetching blocked) """
# robots option turned off
if self._configobj.robots==0: return False
rp = self._rp
if rp is None: return False
server = urlObj.get_domain()
# The robots.txt file url
robotsfile = urlObj.get_full_domain_with_port() + '/robots.txt'
        # If the robots.txt url is already in the filter list,
        # it could not be read earlier, so allow this url right away
        if robotsfile in self._filter:
            return False
rp.set_url(robotsfile)
ret = rp.read()
if ret==-1:
# PERFMOD: Add this to the filter list
# so that we dont need to check it later
self.add_to_filter(robotsfile)
return False # no robots file
# Get directory of this url
directory = urlObj.get_url_directory()
# Get user-agent from Spider
ua = GetObject('USER_AGENT')
if rp.can_fetch(ua, directory): return False
self.add_to_filter(urlObj.get_full_url())
return True
def apply_word_filter(self, data):
""" Apply the word filter """
cfg = GetObject('config')
if cfg.wordfilter:
if cfg.wordfilterre.search(data):
return True
else:
return False
return True
def __apply_url_filter(self, url):
""" See if we have a filter matching the url.
Return 1 for blocking the url and 0 for allowing it """
inclfilter = self._configobj.inclfilter
exclfilter = self._configobj.exclfilter
# for html files, dont do any check
# since it is handled by the 'html' option of the
# config class.
extn = url[url.rfind('.'):].lower()
if extn in ('.htm', '.html'): return 0
        # if neither filter is enabled, return 0
if not inclfilter and not exclfilter: return 0
# We always check inclusion filter first since it is
# normally more specific than exclusion filter. Someone
# can request to not fetch any url containing /images/
# in the path, but still fetch the particular path
# /preferred/images. It will work only if we check for
# inclusion first and exclusion later.
inclcheck,exclcheck=-1,-1
matchincl, matchexcl='',''
if inclfilter:
inclcheck=1
# see if we have a match
for f in inclfilter:
m=f.search(url)
if m:
extrainfo('Go-through filter for url ', url, 'found')
matchincl=f
inclcheck=0
break
if exclfilter:
exclcheck=0
# see if we have a match
for f in exclfilter:
m=f.search(url)
if m:
extrainfo('No-pass filter for url ', url, 'found')
matchexcl=f
self.add_to_filter(url)
exclcheck=1
break
if inclcheck==1:
extrainfo("Inclfilter does not allow this url", url)
if exclcheck==0:
extrainfo("Exclfilter allows this url", url)
        # If the exclfilter and inclfilter return different results
        # (exclfilter denies, inclfilter allows), we check the order
        # of the filters in the global filter list. Whichever comes
        # first has precedence.
if inclcheck == 0 and exclcheck == 1:
globalfilter=self._configobj.allfilters
            try:
                indexincl = globalfilter.index(matchincl)
            except ValueError:
                indexincl = -1
            try:
                indexexcl = globalfilter.index(matchexcl)
            except ValueError:
                indexexcl = -1
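            # A sketch of the remaining decision, assuming the precedence
            # rule stated in the comment above: whichever filter appears
            # first in the global filter list takes precedence.
            if indexincl != -1 and (indexexcl == -1 or indexincl < indexexcl):
                # Inclusion filter comes first => allow the url
                return 0
            else:
                # Exclusion filter comes first => block the url
                return 1

        # Assumed handling of the remaining cases: block the url when the
        # inclusion filter is enabled but did not match, or when the
        # exclusion filter matched; allow it otherwise.
        if inclcheck == 1:
            self.add_to_filter(url)
            return 1

        if exclcheck == 1:
            return 1

        return 0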