filters.py

来自「Harvestman-最新版本」· Python 代码 · 共 602 行 · 第 1/2 页

PY
602
字号
# -- coding: utf-8
"""filters.py - Module which holds class definitions for
classes which define filters for filtering out URLs
and web pages based on regular expression and other kinds
of filters.

 Author: Anand B Pillai <abpillai at gmail dot com>

 Modification History
 --------------------

 Jul 23 2008 Anand   Creation

 Copyright (C) 2003-2008 Anand B Pillai.

"""

import re


class HarvestManBaseFilter(object):
    """ Base class for all HarvestMan filter classes """

    def filter(self, url):
        """ Filtering hook for subclasses. The base implementation
        always raises NotImplementedError. """
        raise NotImplementedError

    def make_regex(self, pattern, casing, flags):
        """ Compile `pattern` and return the compiled regular expression.

        pattern -- regular expression source string
        casing  -- if false, re.IGNORECASE is added to the compile flags
        flags   -- if non-empty, a string holding a Python expression
                   which is evaluated and OR-ed into the compile flags
                   (for example 're.DOTALL')
        """
        flag = 0
        if not casing:
            flag |= re.IGNORECASE
        if flags:
            # SECURITY NOTE(review): eval() executes arbitrary Python code.
            # This is only acceptable while `flags` comes from a trusted
            # source; if it can ever carry untrusted input, replace this
            # with an explicit lookup of flag names on the re module.
            flag |= eval(flags)
        return re.compile(pattern, flag)


class HarvestManUrlFilter(HarvestManBaseFilter):
    """ Filter class for filtering out web pages based on the URL path string """

    def __init__(self, regexfilters=None, pathfilters=None, extnfilters=None):
        """ Store the filter pattern lists and compile them.

        regexfilters and pathfilters are iterated as sequences of
        (pattern, casing, flags) tuples; extnfilters is only stored.
        Each argument defaults to a fresh empty list.
        """
        # Filter pattern strings. None (not []) is the default so that
        # instances built without arguments never share one list object.
        self.regexfilterpatterns = [] if regexfilters is None else regexfilters
        self.pathfilterpatterns = [] if pathfilters is None else pathfilters
        self.extnfilterpatterns = [] if extnfilters is None else extnfilters
        # Actual filters
        self.regexfilters = []
        self.pathfilters = []
        self.extnfilters = []
        self._compile_filters()

    def _make_path_filter(self, filterstring):
        """ Split a path filter string into include and exclude patterns.

        Single quotes are stripped first. The string is then cut at every
        '+'; inside each piece the text before the first '-' is an include
        pattern and every following '-' separated item is an exclude
        pattern. Empty items are ignored. For example "a-b+c" gives
        include ['a', 'c'] and exclude ['b'].

        The two lists are currently only printed; nothing is returned.
        """
        # First remove any single quotes
        fstr = filterstring.replace("'", '')

        # patterns to include
        include = []
        # patterns to exclude
        exclude = []

        for piece in fstr.split('+'):
            for position, item in enumerate(piece.split('-')):
                if not item:
                    continue
                if position == 0:
                    include.append(item)
                else:
                    exclude.append(item)

        # Debug output, kept in a form that prints the same text on
        # Python 2 and Python 3.
        print('Exclude=> %s' % (exclude,))
        print('Include=> %s' % (include,))

        # TODO: build the inclusion, exclusion and "all" filters from these
        # lists and return them as a 3-tuple
        # (inclusionfilter, exclusionfilter, allfilter). The earlier draft
        # called self._create_filter(<list>, servers) for this; neither that
        # helper nor `servers` is available here yet. The "all" list is
        # every non-empty item obtained by splitting fstr on '+' and '-'.

    def _make_extn_filter(self, filterstring):
        """ Placeholder for extension filters; does nothing yet. """
        pass

    def _compile_filters(self):
        """ Compile the regular expression filters and process the
        path filters. """
        # Regular expression filters
        for pattern, casing, flags in self.regexfilterpatterns:
            self.regexfilters.append(self.make_regex(pattern, casing, flags))

        # Debug output
        print(self.regexfilters)

        # URL path filters (casing and flags are not used for these)
        for pattern, casing, flags in self.pathfilterpatterns:
            self._make_path_filter(pattern)


class HarvestManJunkFilter(object):
    """ Junk filter class. Filter out junk urls such
    as ads, banners, flash files etc """

    # Domain specific blocking - List courtesy
    # junkbuster proxy.
block_domains =[ '1ad.prolinks.de',                     '1st-fuss.com',                     '247media.com',                     'admaximize.com',                     'adbureau.net',                     'adsolution.de',                     'adwisdom.com',                     'advertising.com',                     'atwola.com',                     'aladin.de',                     'annonce.insite.dk',                     'a.tribalfusion.com',                                                'avenuea.com',                     'bannercommunity.de',                     'banerswap.com',                     'bizad.nikkeibp.co.jp',                     'bluestreak.com',                     'bs.gsanet.com',                     'cash-for-clicks.de',                     'cashformel.com',                                                'cash4banner.de',                     'cgi.tietovalta.fi',                     'cgicounter.puretec.de',                     'click-fr.com',                     'click.egroups.com',                     'commonwealth.riddler.com',                     'comtrack.comclick.com',                     'customad.cnn.com',                     'cybereps.com:8000',                     'cyberclick.net',                     'dino.mainz.ibm.de',                     'dinoadserver1.roka.net',                     'disneystoreaffiliates.com',                     'dn.adzerver.com',                     'doubleclick.net',                     'ds.austriaonline.at',                     'einets.com',                     'emap.admedia.net',                     'eu-adcenter.net',                     'eurosponser.de',                     'fastcounter.linkexchange.com',                     'findcommerce.com',                     'flycast.com',                     'focalink.com',                     'fp.buy.com',                     'globaltrack.com',                     'globaltrak.net',                     'gsanet.com',                                                
'hitbox.com',                     'hurra.de',                     'hyperbanner.net',                     'iadnet.com',                     'image.click2net.com',                     'image.linkexchange.com',                     'imageserv.adtech.de',                     'imagine-inc.com',                     'img.getstats.com',                     'img.web.de',                     'imgis.com',                     'james.adbutler.de',                     'jmcms.cydoor.com',                     'leader.linkexchange.com',                     'linkexchange.com',                     'link4ads.com',                     'link4link.com',                     'linktrader.com',                     'media.fastclick.net',                     'media.interadnet.com',                     'media.priceline.com',                     'mediaplex.com',                     'members.sexroulette.com',                     'newsads.cmpnet.com',                     'ngadcenter.net',                     'nol.at:81',                     'nrsite.com',                     'offers.egroups.com',                     'omdispatch.co.uk',                     'orientserve.com',                     'pagecount.com',                     'preferences.com',                     'promotions.yahoo.com',                     'pub.chez.com',                     'pub.nomade.fr',                     'qa.ecoupons.com',                     'qkimg.net',                     'resource-marketing.com',                     'revenue.infi.net',                     'sam.songline.com',                     'sally.songline.com',                     'sextracker.com',                     'smartage.com',                     'smartclicks.com',                     'spinbox1.filez.com',                     'spinbox.versiontracker.com',                     'stat.onestat.com',                     'stats.surfaid.ihost.com',                     'stats.webtrendslive.com',                     'swiftad.com',                     
'tm.intervu.net',                     'tracker.tradedoubler.com',                     'ultra.multimania.com',                     'ultra1.socomm.net',                     'uproar.com',                     'usads.imdb.com',                     'valueclick.com',                     'valueclick.net',                     'victory.cnn.com',                     'videoserver.kpix.com',                     'view.atdmt.com',                     'webcounter.goweb.de',                     'websitesponser.de',                     'werbung.guj.de',                     'wvolante.com',                     'www.ad-up.com',                     'www.adclub.net',                     'www.americanpassage.com',                     'www.bannerland.de',                     'www.bannermania.nom.pl',                     'www.bizlink.ru',                     'www.cash4banner.com',                                                'www.clickagents.com',                     'www.clickthrough.ca',                     'www.commision-junction.com',                     'www.eads.com',                     'www.flashbanner.no',                                                'www.mediashower.com',                     'www.popupad.net',                                                'www.smartadserver.com',                                                'www.smartclicks.com:81',                     'www.spinbox.com',                     'www.sponsorpool.net',                     'www.ugo.net',                     'www.valueclick.com',                     'www.virtual-hideout.net',                     'www.web-stat.com',                     'www.webpeep.com',                     'www.zserver.com',                     'www3.exn.net:80',                     'xb.xoom.com',                     'yimg.com' ]    # Common block patterns. These are created    # in the Python regular expression syntax.    # Original list courtesy junkbuster proxy.    
block_patterns = [ r'/*.*/(.*[-_.])?ads?[0-9]?(/|[-_.].*|\.(gif|jpe?g))',                       r'/*.*/(.*[-_.])?count(er)?(\.cgi|\.dll|\.exe|[?/])',                       r'/*.*/(.*[-_.].*)?maino(kset|nta|s).*(/|\.(gif|html?|jpe?g|png))',                       r'/*.*/(ilm(oitus)?|kampanja)(hallinta|kuvat?)(/|\.(gif|html?|jpe?g|png))',                       r'/*.*/(ng)?adclient\.cgi',                       r'/*.*/(plain|live|rotate)[-_.]?ads?/',                       r'/*.*/(sponsor|banner)s?[0-9]?/',                       r'/*.*/*preferences.com*',                       r'/*.*/.*banner([-_]?[a-z0-9]+)?\.(gif|jpg)',                       r'/*.*/.*bannr\.gif',                       r'/*.*/.*counter\.pl',                       r'/*.*/.*pb_ihtml\.gif',                       r'/*.*/Advertenties/',                       r'/*.*/Image/BannerAdvertising/',                       r'/*.*/[?]adserv',                       r'/*.*/_?(plain|live)?ads?(-banners)?/',                       r'/*.*/abanners/',                       r'/*.*/ad(sdna_image|gifs?)/',                       r'/*.*/ad(server|stream|juggler)\.(cgi|pl|dll|exe)',                       r'/*.*/adbanner*',                       r'/*.*/adfinity',                       r'/*.*/adgraphic*',                       r'/*.*/adimg/',                       r'/*.*/adjuggler',                       r'/*.*/adlib/server\.cgi',                       r'/*.*/ads\\',                       r'/*.*/adserver',                       r'/*.*/adstream\.cgi',                       r'/*.*/adv((er)?ts?|ertis(ing|ements?))?/',                       r'/*.*/advanbar\.(gif|jpg)',                       r'/*.*/advanbtn\.(gif|jpg)',                       r'/*.*/advantage\.(gif|jpg)',                       r'/*.*/amazon([a-zA-Z0-9]+)\.(gif|jpg)',                       r'/*.*/ana2ad\.gif',                       r'/*.*/anzei(gen)?/?',                       r'/*.*/ban[-_]cgi/',                       r'/*.*/banner_?ads/',                       
r'/*.*/banner_?anzeigen',                       r'/*.*/bannerimage/',                       r'/*.*/banners?/',

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?