filters.py

来自「Harvestman-最新版本」· Python 代码 · 共 602 行 · 第 1/2 页

PY
602
字号
                       r'/*.*/banners?\.cgi/',                       r'/*.*/bizgrphx/',                       r'/*.*/biznetsmall\.(gif|jpg)',                       r'/*.*/bnlogo.(gif|jpg)',                       r'/*.*/buynow([a-zA-Z0-9]+)\.(gif|jpg)',                       r'/*.*/cgi-bin/centralad/getimage',                       r'/*.*/drwebster.gif',                       r'/*.*/epipo\.(gif|jpg)',                       r'/*.*/gsa_bs/gsa_bs.cmdl',                       r'/*.*/images/addver\.gif',                       r'/*.*/images/advert\.gif',                       r'/*.*/images/marketing/.*\.(gif|jpe?g)',                       r'/*.*/images/na/us/brand/',                       r'/*.*/images/topics/topicgimp\.gif',                       r'/*.*/phpAds/phpads.php',                       r'/*.*/phpAds/viewbanner.php',                       r'/*.*/place-ads',                       r'/*.*/popupads/',                       r'/*.*/promobar.*',                       r'/*.*/publicite/',                       r'/*.*/randomads/.*\.(gif|jpe?g)',                       r'/*.*/reklaam/.*\.(gif|jpe?g)',                       r'/*.*/reklama/.*\.(gif|jpe?g)',                       r'/*.*/reklame/.*\.(gif|jpe?g)',                       r'/*.*/servfu.pl',                       r'/*.*/siteads/',                       r'/*.*/smallad2\.gif',                       r'/*.*/spin_html/',                       r'/*.*/sponsor.*\.gif',                       r'/*.*/sponsors?[0-9]?/',                       r'/*.*/ucbandeimg/',                       r'/*.*/utopiad\.(gif|jpg)',                       r'/*.*/werb\..*',                       r'/*.*/werbebanner/',                       r'/*.*/werbung/.*\.(gif|jpe?g)',                       r'/*ad.*.doubleclick.net',                       r'/.*(ms)?backoff(ice)?.*\.(gif|jpe?g)',                       r'/.*./Adverteerders/',                       r'/.*/?FPCreated\.gif',                       r'/.*/?va_banner.html',                       r'/.*/adv\.',                       r'/.*/advert[0-9]+\.jpg',                       r'/.*/favicon\.ico',                       r'/.*/ie_?(buttonlogo|static?|anim.*)?\.(gif|jpe?g)',                       r'/.*/ie_horiz\.gif',                       r'/.*/ie_logo\.gif',                       r'/.*/ns4\.gif',                       r'/.*/opera13\.gif',                       r'/.*/opera35\.gif',                       r'/.*/opera_b\.gif',                       r'/.*/v3sban\.gif',                       r'/.*Ad00\.gif',                       r'/.*activex.*(gif|jpe?g)',                       r'/.*add_active\.gif',                       r'/.*addchannel\.gif',                       r'/.*adddesktop\.gif',                       r'/.*bann\.gif',                       r'/.*barnes_logo\.gif',                       r'/.*book.search\.gif',                       r'/.*by/main\.gif',                       r'/.*cnnpostopinionhome.\.gif',                       r'/.*cnnstore\.gif',                       r'/.*custom_feature\.gif',                       r'/.*exc_ms\.gif',                       r'/.*explore.anim.*gif',                       r'/.*explorer?.(gif|jpe?g)',                       r'/.*freeie\.(gif|jpe?g)',                       r'/.*gutter117\.gif',                       r'/.*ie4_animated\.gif',                       r'/.*ie4get_animated\.gif',                       r'/.*ie_sm\.(gif|jpe?g)',                       r'/.*ieget\.gif',                       r'/.*images/cnnfn_infoseek\.gif',                       r'/.*images/pathfinder_btn2\.gif',                       r'/.*img/gen/fosz_front_em_abc\.gif',                       r'/.*img/promos/bnsearch\.gif',                       r'/.*infoseek\.gif',                       r'/.*logo_msnhm_*',                       r'/.*mcsp2\.gif',                       r'/.*microdell\.gif',                       r'/.*msie(30)?\.(gif|jpe?g)',                       r'/.*msn2\.gif',                       r'/.*msnlogo\.(gif|jpe?g)',                       r'/.*n_iemap\.gif',                       r'/.*n_msnmap\.gif',                       r'/.*navbars/nav_partner_logos\.gif',                       r'/.*nbclogo\.gif',                       r'/.*office97_ad1\.(gif|jpe?g)',                       r'/.*pathnet.warner\.gif',                       r'/.*pbbobansm\.(gif|jpe?g)',                       r'/.*powrbybo\.(gif|jpe?g)',                       r'/.*s_msn\.gif',                       r'/.*secureit\.gif',                       r'/.*sqlbans\.(gif|jpe?g)',                       r'/BannerImages/'                       r'/BarnesandNoble/images/bn.recommend.box.*',                       r'/Media/Images/Adds/',                       r'/SmartBanner/',                       r'/US/AD/',                       r'/_banner/',                       r'/ad[-_]container/',                       r'/adcycle.cgi',                       r'/adcycle/',                       r'/adgenius/',                       r'/adimages/',                       r'/adproof/',                       r'/adserve/',                       r'/affiliate_banners/',                       r'/annonser?/',                       r'/anz/pics/',                       r'/autoads/',                       r'/av/gifs/av_logo\.gif',                       r'/av/gifs/av_map\.gif',                       r'/av/gifs/new/ns\.gif',                       r'/bando/',                       r'/bannerad/',                       r'/bannerfarm/',                       r'/bin/getimage.cgi/...\?AD',                       r'/cgi-bin/centralad/',                       r'/cgi-bin/getimage.cgi/....\?GROUP=',                       r'/cgi-bin/nph-adclick.exe/',                       r'/cgi-bin/nph-load',                       r'/cgi-bin/webad.dll/ad',                       r'/cgi/banners.cgi',                       r'/cwmail/acc\.gif',                       r'/cwmail/amzn-bm1\.gif',                       r'/db_area/banrgifs/',                       r'/digitaljam/images/digital_ban\.gif',                       r'/free2try/',                       r'/gfx/bannerdir/',                       r'/gif/buttons/banner_.*',                       r'/gif/buttons/cd_shop_.*',                       r'/gif/cd_shop/cd_shop_ani_.*',                       r'/gif/teasere/',                       r'/grafikk/annonse/',                       r'/graphics/advert',                       r'/graphics/defaultAd/',                       r'/grf/annonif',                       r'/hotstories/companies/images/companies_banner\.gif',                       r'/htmlad/',                       r'/image\.ng/AdType',                       r'/image\.ng/transactionID',                       r'/images/.*/.*_anim\.gif',                       r'/images/adds/',                       r'/images/getareal2\.gif',                       r'/images/locallogo.gif',                       r'/img/special/chatpromo\.gif',                       r'/include/watermark/v2/',                       r'/ip_img/.*\.(gif|jpe?g)',                       r'/ltbs/cgi-bin/click.cgi',                       r'/marketpl*/',                       r'/markets/images/markets_banner\.gif',                       r'/minibanners/',                       r'/ows-img/bnoble\.gif',                       r'/ows-img/nb_Infoseek\.gif',                       r'/p/d/publicid',                       r'/pics/amzn-b5\.gif',                       r'/pics/getareal1\.gif',                       r'/pics/gotlx1\.gif',                       r'/promotions/',                       r'/rotads/',                       r'/rotations/',                       r'/torget/jobline/.*\.gif'                       r'/viewad/'                       r'/we_ba/',                       r'/werbung/',                       r'/world-banners/',                       r'/worldnet/ad\.cgi',                       r'/zhp/auktion/img/' ]                                def __init__(self):        self.msg = '<No Error>'        self.match = ''        # Compile pattern list for performance        self.patterns = map(re.compile, self.block_patterns)        # Create base domains list from domains list        self.base_domains = map(self.base_domain, self.block_domains)    def reset_msg(self):        self.msg = '<No Error>'    def reset_match(self):        self.msg = ''                    def check(self, url_obj):        """ Check whether the url is junk. Return        True if the url is O.K (not junk) and False        otherwise """        self.reset_msg()        self.reset_match()                # Check domain first        ret = self._check_domain(url_obj)        if not ret:            return ret        # Check pattern next        return self._check_pattern(url_obj)    def base_domain(self, domain):        if domain.count(".") > 1:            strings = domain.split(".")            return "".join((strings[-2], strings[-1]))        else:            return domain                def _check_domain(self, url_obj):        """ Check whether the url belongs to a junk        domain. Return true if url is O.K (NOT a junk        domain) and False otherwise """        # Get base server of the domain with port        base_domain_port = url_obj.get_base_domain_with_port()        # Get domain with port        domain_port = url_obj.get_domain_with_port()        # First check for domain        if domain_port in self.block_domains:            self.msg = '<Found domain match>'            return False        # Then check for base domain        else:            if base_domain_port in self.base_domains:                self.msg = '<Found base-domain match>'                                return False        return True    def _check_pattern(self, url_obj):        """ Check whether the url matches a junk pattern.        Return true if url is O.K (not a junk pattern) and        false otherwise """        url = url_obj.get_full_url()        indx=0        for p in self.patterns:            # Do a search, not match            if p.search(url):                self.msg = '<Found pattern match>'                self.match = self.block_patterns[indx]                return False                        indx += 1                    return True                def get_error_msg(self):        return self.msg    def get_match(self):        return self.match    if __name__=="__main__":    # Test filter class    filter = HarvestManJunkFilter()        # Violates, should return False    # The first two are direct domain matches, the    # next two are base domain matches.    u = urlparser.HarvestManUrl("http://a.tribalfusion.com/images/1.gif")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    u = urlparser.HarvestManUrl("http://stats.webtrendslive.com/cgi-bin/stats.pl")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    u = urlparser.HarvestManUrl("http://stats.cyberclick.net/cgi-bin/stats.pl")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()        u = urlparser.HarvestManUrl("http://m.doubleclick.net/images/anim.gif")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()        # The next are pattern matches    u = urlparser.HarvestManUrl("http://www.foo.com/popupads/ad.gif")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    print '\tMatch=>',filter.get_match()    u = urlparser.HarvestManUrl("http://www.foo.com/htmlad/1.html")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    print '\tMatch=>',filter.get_match()        u = urlparser.HarvestManUrl("http://www.foo.com/logos/nbclogo.gif")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    print '\tMatch=>',filter.get_match()        u = urlparser.HarvestManUrl("http://www.foo.com/bar/siteads/1.ad")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    print '\tMatch=>',filter.get_match()        u = urlparser.HarvestManUrl("http://www.foo.com/banners/world-banners/banner.gif")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    print '\tMatch=>',filter.get_match()    u = urlparser.HarvestManUrl("http://ads.foo.com/")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    print '\tMatch=>',filter.get_match()                # This one should not match    u = urlparser.HarvestManUrl("http://www.foo.com/doc/logo.gif")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    # This also...    u = urlparser.HarvestManUrl("http://www.foo.org/bar/vodka/pattern.html")    print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()    

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?