""" HarvestManRules.py - Rules checker module for HarvestMan.
This software is part of the HarvestMan(R) program.
Copyright (C) 2004-2005 Anand B Pillai.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
Modification History
Dec 15 2003 Added word search function for 1.3 release.
Jan 2 2004 1.4 bug-fix version development started.
Feb 10 2004 Fixed bug in checking of external servers.
Bug ID# B1076402348.52.
Fixed bug in checking of servers in the same
base domain. Bug ID #B1076409910.45.
1.3.1 bug fix version released.
May 4-9 2004 1.4 development.
*Added priority based rule. Added
function __make_priority which
creates a priority dictionary based
on file extensions.
*Modified url filter function.
Regular expressions are pre-compiled.
*Modified rules checking function.
Reordered the functions that check
various constraints. url filter check
comes before the forced download check
for stylesheets/images. The rest come
afterwards.
*Rewrote the function __compare_by_name
to fix bug B1083256752.28 (browsing
problem for localhost).
Jun 14 2004 Anand 1.3.9 release.
"""
import threading
import socket
import re
from threading import Lock, Condition
import robotparser
from common import *
# HarvestMan url parser module
from urlparser import HarvestManUrlParser, HarvestManUrlParserError
class harvestManRulesChecker:
""" Class which checks the download rules for urls. These
rules include depth checks, robot.txt rules checks, filter
checks, external server/directory checks, duplicate url
checks, maximum limits check etc. """
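
    # Typical usage (a sketch, not part of the original module): the global
    # 'config' object must already be registered so that GetObject('config')
    # works before this class is created. A crawler would then do something
    # like
    #
    #   rules = harvestManRulesChecker()
    #   if not rules.violates_download_rules(urlobj):
    #       ... schedule urlobj for download ...
    #
    # where urlobj is a HarvestManUrlParser instance.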
def __init__(self):
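        # Default instance attributes, assigned through __dict__ directly so
        # they bypass the __setattr__ override below. _filter holds urls that
        # have already been rejected, and _rp holds the shared robots.txt
        # parser (created further below only when robots rules are enabled).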
self.__dict__= {'_links' : [],
'_filter' : [],
'_extservers': [],
'_extdirs' : [],
'_counter' : 0,
'_rexplist' : [],
'_wordstr' : '[\s+<>]',
'_rp': None
}
# Locks
self._dataLock = Condition( Lock())
# Configure robotparser object if rep rule is specified
self._configobj = GetObject('config')
if self._configobj.robots:
self._rp = robotparser.RobotFileParser()
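
    # Attribute lookups fall through to the instance __dict__ (see
    # __getattr__ below); a missing attribute reads as None instead of
    # raising AttributeError, and __setattr__ stores values there directly.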
def __getattr__(self, name):
try:
return self.__dict__[name]
except KeyError:
return None
def __setattr__(self, name, value):
self.__dict__[name] = value
def violates_basic_rules(self, urlObj):
""" Check the basic rules for this url object,
This function returns True if the url object
violates the rules, else returns False """
# stylesheet check, if config is set to always
# fetch stylesheets of a page(the default), return False
if urlObj.get_type() == 'stylesheet':
if self._configobj.getstylesheets: return False
elif urlObj.get_type() == 'image':
if self._configobj.getimagelinks: return False
# check if this is an external link
if self.__is_external_link( urlObj ):
extrainfo("External link - filtered ", urlObj.get_full_url())
return True
# depth check
if self.__apply_depth_check(urlObj):
extrainfo("Depth exceeds - filtered ", urlObj.get_full_url())
return True
return False
def violates_download_rules(self, urlObj, noextordepthcheck=0):
""" Check download rules for this url object """
url = urlObj.get_full_url()
        # If this url already exists in the filter list,
        # return True right away
        if url in self._filter:
            return True
# now apply the url filter
if self.__apply_url_filter(url):
extrainfo("Custom filter - filtered ", url)
return True
        # Stylesheet/image check: if config is set to always
        # fetch stylesheets/images of a page (the default),
        # return False. This check must come after the url filter
        # check and before the other checks, since the default
        # setting for stylesheets and images has a lower priority
        # than url filters, but a higher priority than constraints
        # like the depth/external checks.
if urlObj.get_type() == 'stylesheet':
if self._configobj.getstylesheets: return False
elif urlObj.get_type() == 'image':
if self._configobj.getimagelinks: return False
# check if this is an external link
if not noextordepthcheck and self.__is_external_link(urlObj):
extrainfo("External link - filtered ", url)
return True
# depth check
if not noextordepthcheck and self.__apply_depth_check(urlObj):
extrainfo("Exceeds depth checks - filtered", url)
return True
# now apply REP
if self.__apply_rep(urlObj):
extrainfo("Robots.txt rules prevents download of ", url)
return True
return False
def add_to_filter(self, link):
""" Add the link to the filter list """
        try:
            self._dataLock.acquire()
            # Append the link only if it is not already in the list
            if link not in self._filter:
                self._filter.append(link)
        finally:
            self._dataLock.release()
def __compare_domains(self, domain1, domain2, robots=False):
""" Compare two domains (servers) first by
ip and then by name and return True if both point
to the same server, return False otherwise. """
# For comparing robots.txt file, first compare by
# ip and then by name.
if robots:
firstval=self.__compare_by_ip(domain1, domain2)
if firstval:
return firstval
else:
return self.__compare_by_name(domain1, domain2)
# otherwise, we do a name check first and
# ip check later
else:
firstval=self.__compare_by_name(domain1, domain2)
if firstval:
return firstval
else:
return self.__compare_by_ip(domain1, domain2)
def __get_base_server(self, server):
""" Return the base server name of the passed
server (domain) name """
        # If the server name is of the form say bar.foo.com
        # or vodka.bar.foo.com, i.e. there is more than one
        # '.' in the name, then we need to return just the
        # last two components, which form the base domain.
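        # e.g. 'vodka.bar.foo.com' -> 'foo.com'; a name like 'foo.com'
        # or plain 'foo' is returned unchanged.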
if server.count('.') > 1:
dotstrings = server.split('.')
# now the list is of the form => [vodka, bar, foo, com]
# Return the last two items added with a '.'
# in between
return "".join((dotstrings[-2], ".", dotstrings[-1]))
else:
# The server is of the form foo.com or just "foo"
# so return it straight away
return server
def __compare_by_name(self, domain1, domain2):
""" Compare two servers by their names. Return True
if similar, False otherwise """
# first check if both domains are same
if domain1.lower() == domain2.lower(): return True
if not self._configobj.subdomain:
# Checks whether the domain names belong to
# the same base server, if the above config
# variable is set. For example, this will
# return True for two servers like server1.foo.com
# and server2.foo.com or server1.base and server2.base
baseserver1 = self.__get_base_server(domain1)
baseserver2 = self.__get_base_server(domain2)
            if baseserver1.lower() == baseserver2.lower():
                return True
            else:
                # Base domains differ
                return False
        else:
            # If the subdomain variable is set, return False for
            # two servers like server1.foo.com and server2.foo.com,
            # i.e. servers with the same base domain but different
            # subdomains.
            return False
def __compare_by_ip(self, domain1, domain2):
""" Compare two servers by their ip address. Return
True if same, False otherwise """
ip1='127.0.0.1'
ip2='127.0.0.1'
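        # If either lookup below fails, both values keep the loopback
        # default, so unresolvable hosts end up comparing as equal.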
try:
ip1 = socket.gethostbyname(domain1)
ip2 = socket.gethostbyname(domain2)
except:
pass
if ip1==ip2: return True
else: return False
def __apply_rep(self, urlObj):
""" See if the robots.txt file on the server
allows fetching of this url. Return 0 on success
(fetching allowed) and 1 on failure(fetching blocked) """
# robots option turned off
if self._configobj.robots==0: return False
rp = self._rp
if rp is None: return False
server = urlObj.get_domain()
# The robots.txt file url
robotsfile = urlObj.get_full_domain_with_port() + '/robots.txt'
        # If the robots.txt url is already in the filter list,
        # it could not be read earlier, so allow this url right away
        if robotsfile in self._filter:
            return False
rp.set_url(robotsfile)
ret = rp.read()
if ret==-1:
# PERFMOD: Add this to the filter list
# so that we dont need to check it later
self.add_to_filter(robotsfile)
return False # no robots file
# Get directory of this url
directory = urlObj.get_url_directory()
# Get user-agent from Spider
ua = GetObject('USER_AGENT')
if rp.can_fetch(ua, directory): return False
self.add_to_filter(urlObj.get_full_url())
return True
def apply_word_filter(self, data):
""" Apply the word filter """
cfg = GetObject('config')
if cfg.wordfilter:
if cfg.wordfilterre.search(data):
return True
else:
return False
return True
def __apply_url_filter(self, url):
""" See if we have a filter matching the url.
Return 1 for blocking the url and 0 for allowing it """
inclfilter = self._configobj.inclfilter
exclfilter = self._configobj.exclfilter
# for html files, dont do any check
# since it is handled by the 'html' option of the
# config class.
extn = url[url.rfind('.'):].lower()
if extn in ('.htm', '.html'): return 0
        # if neither filter is enabled, return 0
if not inclfilter and not exclfilter: return 0
# We always check inclusion filter first since it is
# normally more specific than exclusion filter. Someone
# can request to not fetch any url containing /images/
# in the path, but still fetch the particular path
# /preferred/images. It will work only if we check for
# inclusion first and exclusion later.
inclcheck,exclcheck=-1,-1
matchincl, matchexcl='',''
if inclfilter:
inclcheck=1
# see if we have a match
for f in inclfilter:
m=f.search(url)
if m:
extrainfo('Go-through filter for url ', url, 'found')
matchincl=f
inclcheck=0
break
if exclfilter:
exclcheck=0
# see if we have a match
for f in exclfilter:
m=f.search(url)
if m:
extrainfo('No-pass filter for url ', url, 'found')
matchexcl=f
self.add_to_filter(url)
exclcheck=1
break
if inclcheck==1:
extrainfo("Inclfilter does not allow this url", url)
if exclcheck==0:
extrainfo("Exclfilter allows this url", url)
        # If the exclfilter and inclfilter return different results
        # (exclfilter denies, inclfilter allows), we check the order
        # of the filters in the global filter list. Whichever comes
        # first has precedence.
if inclcheck == 0 and exclcheck == 1:
globalfilter=self._configobj.allfilters
            try:
                indexincl = globalfilter.index(matchincl)
            except ValueError:
                indexincl = -1
            try:
                indexexcl = globalfilter.index(matchexcl)
            except ValueError:
                indexexcl = -1
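            # A sketch of the remaining decision, assuming the precedence
            # rule stated in the comment above: whichever filter appears
            # first in the global filter list takes precedence.
            if indexincl != -1 and (indexexcl == -1 or indexincl < indexexcl):
                # Inclusion filter comes first => allow the url
                return 0
            else:
                # Exclusion filter comes first => block the url
                return 1

        # Assumed handling of the remaining cases: block the url when the
        # inclusion filter is enabled but did not match, or when the
        # exclusion filter matched; allow it otherwise.
        if inclcheck == 1:
            self.add_to_filter(url)
            return 1

        if exclcheck == 1:
            return 1

        return 0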