filters.py
来自「Harvestman-最新版本」· Python 代码 · 共 602 行 · 第 1/2 页
PY
602 行
# -- coding: utf-8
"""filters.py - Module which holds class definitions for
classes which define filters for filtering out URLs
and web pages based on regular expression and other kinds
of filters.

 Author: Anand B Pillai <abpillai at gmail dot com>

 Modification History
 --------------------

 Jul 23 2008 Anand   Creation

 Copyright (C) 2003-2008 Anand B Pillai.

"""

import re


class HarvestManBaseFilter(object):
    """ Base class for all HarvestMan filter classes """

    def filter(self, url):
        """Abstract hook that subclasses are expected to override.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError

    def make_regex(self, pattern, casing, flags):
        """Compile `pattern` into a regular expression object.

        Args:
            pattern: Regular expression source string.
            casing: When false, re.IGNORECASE is added to the flags.
            flags: Optional string holding a Python expression that
                evaluates to extra `re` flags (e.g. 're.DOTALL');
                ignored when empty or None.

        Returns:
            The compiled regular expression object.
        """
        flag = 0
        if not casing:
            flag |= re.IGNORECASE
        if flags:
            # SECURITY NOTE(review): eval() executes arbitrary code. This is
            # only acceptable while `flags` comes from a trusted source;
            # confirm where callers obtain it before exposing this to
            # untrusted configuration.
            flag |= eval(flags)

        return re.compile(pattern, flag)


class HarvestManUrlFilter(HarvestManBaseFilter):
    """ Filter class for filtering out web pages based on the URL path string """

    def __init__(self, regexfilters=None, pathfilters=None, extnfilters=None):
        """Store the filter pattern definitions and compile them.

        Args:
            regexfilters: Iterable of (pattern, casing, flags) triples.
            pathfilters: Iterable of (pattern, casing, flags) triples whose
                pattern is a '+'/'-' path filter string.
            extnfilters: Extension filter definitions (stored only; not
                processed by the code in this class yet).

        Each argument defaults to a fresh empty list. None is used as the
        default instead of [] so that instances never share one list object.
        """
        if regexfilters is None:
            regexfilters = []
        if pathfilters is None:
            pathfilters = []
        if extnfilters is None:
            extnfilters = []

        # Filter pattern strings
        self.regexfilterpatterns = regexfilters
        self.pathfilterpatterns = pathfilters
        self.extnfilterpatterns = extnfilters
        # Actual filters
        self.regexfilters = []
        self.pathfilters = []
        self.extnfilters = []
        self._compile_filters()

    def _make_path_filter(self, filterstring):
        """Split a '+'/'-' delimited path filter string into its parts.

        Single quotes are stripped first. The string is then split on '+';
        within each piece the text before the first '-' is an inclusion
        and every later '-'-separated segment is an exclusion. Empty
        segments are ignored.

        Args:
            filterstring: Filter specification such as '+images-private'.

        Returns:
            A 3-tuple (include, exclude, all_patterns) of string lists,
            where all_patterns holds every non-empty segment in order of
            appearance.
        """
        # Strip any single quotes from the filter string
        fstr = filterstring.replace("'", '')

        # patterns to include
        include = []
        # patterns to exclude
        exclude = []
        # all patterns, in order of appearance
        all_patterns = []

        # A trailing '+' guarantees that the last segment is terminated
        fstr += '+'

        previndex = -1
        for index, char in enumerate(fstr):
            if char in ('+', '-'):
                subs = fstr[previndex + 1:index]
                if subs:
                    all_patterns.append(subs)
                previndex = index

        for chunk in fstr.split('+'):
            for position, segment in enumerate(chunk.split('-')):
                if segment == '':
                    continue
                if position == 0:
                    include.append(segment)
                else:
                    exclude.append(segment)

        # Debug output; written so it prints identically on Python 2 and 3
        print('Exclude=> %s' % (exclude,))
        print('Include=> %s' % (include,))

        # TODO: the original author intended to turn these lists into
        # compiled inclusion/exclusion/all filters through a _create_filter
        # helper that is not defined in this class; until that exists the
        # raw pattern lists are returned.
        return (include, exclude, all_patterns)

    def _make_extn_filter(self, filterstring):
        """Placeholder for extension filters; currently does nothing."""
        pass

    def _compile_filters(self):
        """Compile the regex filter patterns and parse the path filters."""
        # Regular expression filters
        for pattern, casing, flags in self.regexfilterpatterns:
            self.regexfilters.append(self.make_regex(pattern, casing, flags))

        # Debug output of the compiled regular expressions
        print(self.regexfilters)

        for pattern, casing, flags in self.pathfilterpatterns:
            self._make_path_filter(pattern)

        # URL path filters


class HarvestManJunkFilter(object):
    """ Junk filter class. Filter out junk urls such
    as ads, banners, flash files etc """

    # Domain specific blocking - List courtesy
    # junkbuster proxy.
    block_domains =[ '1ad.prolinks.de', '1st-fuss.com', '247media.com',
                     'admaximize.com', 'adbureau.net', 'adsolution.de',
                     'adwisdom.com', 'advertising.com', 'atwola.com',
                     'aladin.de', 'annonce.insite.dk', 'a.tribalfusion.com',
                     'avenuea.com', 'bannercommunity.de', 'banerswap.com',
                     'bizad.nikkeibp.co.jp', 'bluestreak.com', 'bs.gsanet.com',
                     'cash-for-clicks.de', 'cashformel.com', 'cash4banner.de',
                     'cgi.tietovalta.fi', 'cgicounter.puretec.de',
                     'click-fr.com', 'click.egroups.com',
                     'commonwealth.riddler.com', 'comtrack.comclick.com',
                     'customad.cnn.com', 'cybereps.com:8000',
                     'cyberclick.net', 'dino.mainz.ibm.de',
                     'dinoadserver1.roka.net', 'disneystoreaffiliates.com',
                     'dn.adzerver.com', 'doubleclick.net',
                     'ds.austriaonline.at', 'einets.com', 'emap.admedia.net',
                     'eu-adcenter.net', 'eurosponser.de',
                     'fastcounter.linkexchange.com', 'findcommerce.com',
                     'flycast.com', 'focalink.com', 'fp.buy.com',
                     'globaltrack.com', 'globaltrak.net', 'gsanet.com',
                     'hitbox.com', 'hurra.de', 'hyperbanner.net',
                     'iadnet.com', 'image.click2net.com',
                     'image.linkexchange.com', 'imageserv.adtech.de',
                     'imagine-inc.com', 'img.getstats.com', 'img.web.de',
                     'imgis.com', 'james.adbutler.de', 'jmcms.cydoor.com',
                     'leader.linkexchange.com', 'linkexchange.com',
                     'link4ads.com', 'link4link.com', 'linktrader.com',
                     'media.fastclick.net', 'media.interadnet.com',
                     'media.priceline.com', 'mediaplex.com',
                     'members.sexroulette.com', 'newsads.cmpnet.com',
                     'ngadcenter.net', 'nol.at:81', 'nrsite.com',
                     'offers.egroups.com', 'omdispatch.co.uk',
                     'orientserve.com', 'pagecount.com', 'preferences.com',
                     'promotions.yahoo.com', 'pub.chez.com', 'pub.nomade.fr',
                     'qa.ecoupons.com', 'qkimg.net',
                     'resource-marketing.com', 'revenue.infi.net',
                     'sam.songline.com', 'sally.songline.com',
                     'sextracker.com', 'smartage.com', 'smartclicks.com',
                     'spinbox1.filez.com', 'spinbox.versiontracker.com',
                     'stat.onestat.com', 'stats.surfaid.ihost.com',
                     'stats.webtrendslive.com', 'swiftad.com',
                     'tm.intervu.net', 'tracker.tradedoubler.com',
                     'ultra.multimania.com', 'ultra1.socomm.net',
                     'uproar.com', 'usads.imdb.com', 'valueclick.com',
                     'valueclick.net', 'victory.cnn.com',
                     'videoserver.kpix.com', 'view.atdmt.com',
                     'webcounter.goweb.de', 'websitesponser.de',
                     'werbung.guj.de', 'wvolante.com', 'www.ad-up.com',
                     'www.adclub.net', 'www.americanpassage.com',
                     'www.bannerland.de', 'www.bannermania.nom.pl',
                     'www.bizlink.ru', 'www.cash4banner.com',
                     'www.clickagents.com', 'www.clickthrough.ca',
                     'www.commision-junction.com', 'www.eads.com',
                     'www.flashbanner.no', 'www.mediashower.com',
                     'www.popupad.net', 'www.smartadserver.com',
                     'www.smartclicks.com:81', 'www.spinbox.com',
                     'www.sponsorpool.net', 'www.ugo.net',
                     'www.valueclick.com', 'www.virtual-hideout.net',
                     'www.web-stat.com', 'www.webpeep.com',
                     'www.zserver.com', 'www3.exn.net:80', 'xb.xoom.com',
                     'yimg.com' ]

    # Common block patterns. These are created
    # in the Python regular expression syntax.
    # Original list courtesy junkbuster proxy.
block_patterns = [ r'/*.*/(.*[-_.])?ads?[0-9]?(/|[-_.].*|\.(gif|jpe?g))', r'/*.*/(.*[-_.])?count(er)?(\.cgi|\.dll|\.exe|[?/])', r'/*.*/(.*[-_.].*)?maino(kset|nta|s).*(/|\.(gif|html?|jpe?g|png))', r'/*.*/(ilm(oitus)?|kampanja)(hallinta|kuvat?)(/|\.(gif|html?|jpe?g|png))', r'/*.*/(ng)?adclient\.cgi', r'/*.*/(plain|live|rotate)[-_.]?ads?/', r'/*.*/(sponsor|banner)s?[0-9]?/', r'/*.*/*preferences.com*', r'/*.*/.*banner([-_]?[a-z0-9]+)?\.(gif|jpg)', r'/*.*/.*bannr\.gif', r'/*.*/.*counter\.pl', r'/*.*/.*pb_ihtml\.gif', r'/*.*/Advertenties/', r'/*.*/Image/BannerAdvertising/', r'/*.*/[?]adserv', r'/*.*/_?(plain|live)?ads?(-banners)?/', r'/*.*/abanners/', r'/*.*/ad(sdna_image|gifs?)/', r'/*.*/ad(server|stream|juggler)\.(cgi|pl|dll|exe)', r'/*.*/adbanner*', r'/*.*/adfinity', r'/*.*/adgraphic*', r'/*.*/adimg/', r'/*.*/adjuggler', r'/*.*/adlib/server\.cgi', r'/*.*/ads\\', r'/*.*/adserver', r'/*.*/adstream\.cgi', r'/*.*/adv((er)?ts?|ertis(ing|ements?))?/', r'/*.*/advanbar\.(gif|jpg)', r'/*.*/advanbtn\.(gif|jpg)', r'/*.*/advantage\.(gif|jpg)', r'/*.*/amazon([a-zA-Z0-9]+)\.(gif|jpg)', r'/*.*/ana2ad\.gif', r'/*.*/anzei(gen)?/?', r'/*.*/ban[-_]cgi/', r'/*.*/banner_?ads/', r'/*.*/banner_?anzeigen', r'/*.*/bannerimage/', r'/*.*/banners?/',
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?