📄 rules.py
字号:
def is_duplicate_link(self, link):
    """Report whether *link* is a duplicate.

    The check is delegated to ``self.add_link(link)`` and its result is
    returned as a plain ``True``/``False``.
    """
    return bool(self.add_link(link))
def add_link(self, url):
    """Add *url* to ``self._links`` unless it is already there.

    Returns True if the url was already present (nothing is added),
    False if it was not present and has now been appended.

    No locking is performed here: the lock acquire/release calls that
    once surrounded this section were commented out in the code this
    replaces.
    """
    # A membership test replaces the former list.index() call wrapped
    # in a bare ``except:``, which treated *any* exception as "not found".
    if url in self._links:
        return True

    self._links.append(url)
    return False
def get_stats(self):
    """Return collection sizes as a 3-tuple.

    The tuple holds, in order, the lengths of ``self._links``,
    ``self._extservers`` and ``self._extdirs`` (documented originally
    as the number of links, servers and directories).
    """
    return (len(self._links), len(self._extservers), len(self._extdirs))
def dump_urls(self, file):
    """Write every url in ``self._links`` to the file named *file*.

    An existing file of that name is removed first.  If the removal
    fails, the OSError is printed and -1 is returned without writing
    anything.  On success the method returns None.

    The output starts with a two line 'LINKS PARSED' header followed by
    one url per line.
    """
    if os.path.exists(file):
        try:
            os.remove(file)
        except OSError as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e)
            return -1

    moreinfo('Dumping url list to file', file)

    # The context manager closes the handle even if a write fails
    # part-way through (the handle previously leaked in that case).
    with open(file, 'w') as f:
        f.write('LINKS PARSED\n')
        f.write('============\n')
        for link in self._links:
            f.write(link + '\n')

    debug('Done.')
def make_filters(self):
    """Build the filter and priority objects from the configuration.

    Reads the filter/priority strings from ``self._configobj`` and
    stores the derived objects back on it:

    * ``urlfilter``      -> option 'control.urlfilterre'
    * ``serverfilter``   -> option 'control.serverfilterre'
    * ``urlpriority``    -> option 'control.urlprioritydict'
    * ``serverpriority`` -> option 'control.serverprioritydict'
    * ``wordfilter``     -> attribute ``wordfilterre`` (only when the
      word filter string is non-empty)
    """
    cfg = self._configobj

    # URL filter.
    url_res = self.__make_filter(cfg.urlfilter)
    cfg.set_option('control.urlfilterre', url_res)

    # Server filter.
    # NOTE(review): ``servers`` is left at its default here, so server
    # filter strings take the same path as url filters - confirm intended.
    server_res = self.__make_filter(cfg.serverfilter)
    cfg.set_option('control.serverfilterre', server_res)

    # Priority strings become dictionaries.
    url_prio = self.__make_priority(cfg.urlpriority)
    cfg.set_option('control.urlprioritydict', url_prio)

    server_prio = self.__make_priority(cfg.serverpriority)
    cfg.set_option('control.serverprioritydict', server_prio)

    # The word filter is optional and is stored as a plain attribute.
    words = cfg.wordfilter
    if words:
        cfg.wordfilterre = self.__make_word_filter(words)
def __make_priority(self, pstr):
    """Generate a priority dictionary from the priority string.

    The string is a comma separated list of entries of the form
    ``name+num`` or ``name-num`` (e.g. ``jpg+3,gif-2``).  The result
    maps each lower-cased name to its signed integer priority.

    Entries are skipped when they
    * contain neither '+' nor '-',
    * have a priority part that is not an integer, or
    * have a priority outside the range [-5 .. +5].

    An empty or None string yields an empty dictionary.
    """
    d = {}
    if not pstr:
        return d

    for entry in pstr.split(','):
        # '+' takes precedence when both signs occur, as before.  The
        # split is done on the LAST occurrence so that names which
        # themselves contain the sign character (e.g. 'my-host.com-2')
        # no longer fail to unpack.
        if '+' in entry:
            key, num = entry.rsplit('+', 1)
            sign = 1
        elif '-' in entry:
            key, num = entry.rsplit('-', 1)
            sign = -1
        else:
            continue

        try:
            val = sign * int(num)
        except ValueError:
            # Malformed priority: skip it like other invalid entries.
            continue

        # Values outside [-5 .. 5] are not allowed.
        if -5 <= val <= 5:
            d[key.lower()] = val

    return d
def __make_filter(self, fstr, servers=0):
    """Convert a filter string into filter objects.

    *fstr* is a sequence of tokens separated by '+' and '-'.  A token
    at the start of the string or following a '+' is an inclusion
    token; a token following a '-' is an exclusion token.  Single
    quotes are dropped and empty tokens are ignored.

    Returns a 3-tuple ``(inclusionfilter, exclusionfilter, allfilter)``
    where each member is the result of ``self.__create_filter`` applied
    to the inclusion tokens, the exclusion tokens and all tokens (in
    order of appearance) respectively.  *servers* is passed through to
    ``__create_filter`` unchanged.
    """
    cleaned = fstr.replace("'", '')

    # Every non-empty token regardless of its sign, in original order.
    every = [tok for tok in cleaned.replace('-', '+').split('+') if tok]

    wanted = []
    unwanted = []
    for chunk in cleaned.split('+'):
        pieces = chunk.split('-')
        # The piece before the first '-' is included, the rest excluded.
        if pieces[0]:
            wanted.append(pieces[0])
        unwanted.extend([piece for piece in pieces[1:] if piece])

    exclusionfilter = self.__create_filter(unwanted, servers)
    inclusionfilter = self.__create_filter(wanted, servers)
    allfilter = self.__create_filter(every, servers)

    return (inclusionfilter, exclusionfilter, allfilter)
def __create_filter(self, strlist, servers=0):
    """Create regular expressions from a list of filter strings.

    When *servers* is true, a list of pattern STRINGS is returned:
    each entry has its single quotes removed and every '*' replaced
    by '.*'.  Nothing is compiled in this mode.

    Otherwise a list of compiled regular expressions is returned.
    Quotes and asterisks are removed from each entry, which is then
    classified by the position of its last '.':

    * Type 1 - the last '.' is the first character ('.jpg'):
      the pattern is the entry followed by '.*$'.
    * Type 3 - the last '.' directly follows a '/' ('images/.jpg'):
      the pattern is '<prefix>/(?=\\w+.<extn>)'.
    * Type 2 - anything else: the entry is used as the pattern as is.

    An entry that is empty after stripping compiles to the empty
    pattern.  ``re.error`` propagates if an entry is not a valid
    regular expression.
    """
    if servers:
        return [s.replace("'", '').replace('*', '.*') for s in strlist]

    refilter = []
    for s in strlist:
        # Strip quotes from the string itself (the stripped value used
        # to be stored in an unrelated variable and then discarded),
        # then remove the asterisks.
        s = s.replace("'", '').replace('*', '')

        pos = s.rfind('.')
        if pos == 0:
            # Type 1
            fstr = s + '.*$'
        elif pos > 0 and s[pos - 1] == '/':
            # Type 3.  ``pos > 0`` matters: without a '.', pos is -1 and
            # s[pos - 1] would inspect s[-2], which raised IndexError
            # for short strings and misclassified strings like 'a/b'.
            prefix = s[:pos - 1]
            extn = s[pos + 1:]
            # NOTE(review): the '.' before the extension is unescaped
            # and so matches any character; kept as is to avoid
            # changing what existing filters match.
            fstr = ''.join((prefix, r'/(?=\w+.', extn, ')'))
        else:
            # Type 2: plain string
            fstr = s

        refilter.append(re.compile(fstr))

    return refilter
def __parse_word_filter(self, s):
    """Split a word filter string into parts, innermost parens first.

    The contents of the right-most '(...)' group are appended to
    ``self._rexplist`` (unless they are only whitespace), the group is
    removed from the string, and the remainder is parsed recursively.
    A string without any '(' is appended as is when non-empty.  If the
    right-most '(' has no ')' after it, nothing is appended.
    """
    open_at = s.rfind('(')
    if open_at == -1:
        # No parentheses left: whatever remains is a term of its own.
        if s:
            self._rexplist.append(s)
        return

    close_at = s.find(')', open_at)
    if close_at == -1:
        return

    inner = s[open_at + 1:close_at]
    # Groups holding only whitespace contribute nothing.
    if not re.match(r'^\s*$', inner):
        self._rexplist.append(inner)

    # Remove every occurrence of this exact group and keep parsing.
    remainder = s.replace('(' + inner + ')', '')
    self.__parse_word_filter(remainder)
def __make_not_expr(self, s):
    """Turn a '!'-prefixed term into a negative lookahead.

    '!word' becomes '(?!word)'; any other string is returned unchanged.
    """
    if not s.startswith('!'):
        return s
    return '(?!' + s[1:] + ')'
def __is_inbetween(self, l, elem):
    """Find out if an element is "in between" in a list.

    The position used is that of the FIRST occurrence of *elem* in *l*.
    Result by position ``i``:

    * not in the list            -> False
    * i == 0                     -> True only if the list has one element
    * i == 1                     -> True (even when it is the last element)
    * i > 1                      -> True unless it is the last element
    """
    # list.index() raises ValueError for a missing element; it never
    # returns -1, so the former ``i == -1`` check could not trigger.
    try:
        i = l.index(elem)
    except ValueError:
        return False

    size = len(l)
    if i == 0:
        return size == 1
    if i == 1:
        return True
    return i < size - 1
def __make_word_filter(self, s):
    """Create a compiled word filter from a word filter string.

    Word filter strings are either simple (a single word or string,
    e.g. ``Python``) or compound boolean expressions such as
    ``Python & Perl`` or ``(Python | Perl) & Ruby``.

    If the numbers of '(' and ')' differ, an error message is printed
    and None is returned.  Otherwise the string is split into parts by
    ``__parse_word_filter`` (which appends to ``self._rexplist``), the
    parts are turned into one pattern by ``__make_word_regexp`` and the
    pattern is compiled case-insensitively.

    NOTE(review): ``self._rexplist`` is not cleared here, so parts left
    from an earlier call would be reused - confirm callers rely on a
    fresh list.
    """
    if s.count('(') != s.count(')'):
        print('Error in word regular expression')
        return None

    self.__parse_word_filter(s)

    parts = self._rexplist
    # A bare NOT marker has to come first, so flip the order when the
    # list contains one.
    if '!' in parts:
        parts.reverse()

    return re.compile(self.__make_word_regexp(parts), re.IGNORECASE)
def __make_word_regexp(self, mylist):
    """Build a regular expression string from parsed word filter parts.

    *mylist* is either a single string or a list (or tuple) of parts
    as collected by ``__parse_word_filter``.  Only the first part is
    handled here; the remaining parts are handled recursively and their
    pattern is appended.

    The first part is interpreted as follows:

    * ``'!'``       - NOT: the pattern for the remaining parts is
      wrapped in a negative lookahead ``(?!...)``.
    * ``a | b ...`` - OR: the terms are joined with ``|``.
    * ``a & b ...`` - AND: the terms are concatenated.
    * anything else - a single term.

    Each non-empty term becomes ``self._wordstr + term + '.*'`` after
    being passed through ``__make_not_expr``.  An empty list or string
    yields ``''``.

    NOTE(review): no grouping is added around an OR expression, so the
    pattern appended for later parts attaches to its last alternative.
    """
    if isinstance(mylist, (list, tuple)):
        if not mylist:
            # Nothing left to convert (e.g. a NOT marker with no
            # operand); mirrors what the plain-string path returns.
            return ''
        is_list = True
        elem = mylist[0]
    else:
        is_list = False
        elem = mylist

    if isinstance(elem, list):
        elem = elem[0]

    # "End of recursion": no further parts follow this one.
    eor = (not is_list) or len(mylist) == 1

    s = ''

    # Implementing NOT
    if elem == '!':
        return ''.join(('(?!', self.__make_word_regexp(mylist[1:]), ')'))

    # Implementing OR
    elif ' | ' in elem:
        alternatives = []
        for term in elem.split(' | '):
            if term:
                term = self.__make_not_expr(term)
                alternatives.append(''.join((self._wordstr, term, '.*')))
        # Joining by position puts '|' between ALL alternatives.  The
        # former index()-based lookup dropped it before the last of
        # three or more terms and before repeated terms.
        s = '|'.join(alternatives)

    # Implementing AND
    elif ' & ' in elem:
        for term in elem.split(' & '):
            if term:
                term = self.__make_not_expr(term)
                s = ''.join((s, self._wordstr, term, '.*'))

    else:
        if elem:
            elem = self.__make_not_expr(elem)
            s = ''.join((self._wordstr, elem, '.*'))

    if eor:
        return s
    return ''.join((s, self.__make_word_regexp(mylist[1:])))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -