filters.py
来自「Harvestman-最新版本」· Python 代码 · 共 602 行 · 第 1/2 页
PY
602 行
r'/*.*/banners?\.cgi/', r'/*.*/bizgrphx/', r'/*.*/biznetsmall\.(gif|jpg)', r'/*.*/bnlogo.(gif|jpg)', r'/*.*/buynow([a-zA-Z0-9]+)\.(gif|jpg)', r'/*.*/cgi-bin/centralad/getimage', r'/*.*/drwebster.gif', r'/*.*/epipo\.(gif|jpg)', r'/*.*/gsa_bs/gsa_bs.cmdl', r'/*.*/images/addver\.gif', r'/*.*/images/advert\.gif', r'/*.*/images/marketing/.*\.(gif|jpe?g)', r'/*.*/images/na/us/brand/', r'/*.*/images/topics/topicgimp\.gif', r'/*.*/phpAds/phpads.php', r'/*.*/phpAds/viewbanner.php', r'/*.*/place-ads', r'/*.*/popupads/', r'/*.*/promobar.*', r'/*.*/publicite/', r'/*.*/randomads/.*\.(gif|jpe?g)', r'/*.*/reklaam/.*\.(gif|jpe?g)', r'/*.*/reklama/.*\.(gif|jpe?g)', r'/*.*/reklame/.*\.(gif|jpe?g)', r'/*.*/servfu.pl', r'/*.*/siteads/', r'/*.*/smallad2\.gif', r'/*.*/spin_html/', r'/*.*/sponsor.*\.gif', r'/*.*/sponsors?[0-9]?/', r'/*.*/ucbandeimg/', r'/*.*/utopiad\.(gif|jpg)', r'/*.*/werb\..*', r'/*.*/werbebanner/', r'/*.*/werbung/.*\.(gif|jpe?g)', r'/*ad.*.doubleclick.net', r'/.*(ms)?backoff(ice)?.*\.(gif|jpe?g)', r'/.*./Adverteerders/', r'/.*/?FPCreated\.gif', r'/.*/?va_banner.html', r'/.*/adv\.', r'/.*/advert[0-9]+\.jpg', r'/.*/favicon\.ico', r'/.*/ie_?(buttonlogo|static?|anim.*)?\.(gif|jpe?g)', r'/.*/ie_horiz\.gif', r'/.*/ie_logo\.gif', r'/.*/ns4\.gif', r'/.*/opera13\.gif', r'/.*/opera35\.gif', r'/.*/opera_b\.gif', r'/.*/v3sban\.gif', r'/.*Ad00\.gif', r'/.*activex.*(gif|jpe?g)', r'/.*add_active\.gif', r'/.*addchannel\.gif', r'/.*adddesktop\.gif', r'/.*bann\.gif', r'/.*barnes_logo\.gif', r'/.*book.search\.gif', r'/.*by/main\.gif', r'/.*cnnpostopinionhome.\.gif', r'/.*cnnstore\.gif', r'/.*custom_feature\.gif', r'/.*exc_ms\.gif', r'/.*explore.anim.*gif', r'/.*explorer?.(gif|jpe?g)', r'/.*freeie\.(gif|jpe?g)', r'/.*gutter117\.gif', r'/.*ie4_animated\.gif', r'/.*ie4get_animated\.gif', r'/.*ie_sm\.(gif|jpe?g)', r'/.*ieget\.gif', r'/.*images/cnnfn_infoseek\.gif', r'/.*images/pathfinder_btn2\.gif', r'/.*img/gen/fosz_front_em_abc\.gif', r'/.*img/promos/bnsearch\.gif', r'/.*infoseek\.gif', 
# NOTE: every pattern must be its own list element. Adjacent string literals
# without a comma between them are concatenated into one regex, which is why
# '/BannerImages/' and the BarnesandNoble pattern are now separated.
r'/.*logo_msnhm_*',
r'/.*mcsp2\.gif',
r'/.*microdell\.gif',
r'/.*msie(30)?\.(gif|jpe?g)',
r'/.*msn2\.gif',
r'/.*msnlogo\.(gif|jpe?g)',
r'/.*n_iemap\.gif',
r'/.*n_msnmap\.gif',
r'/.*navbars/nav_partner_logos\.gif',
r'/.*nbclogo\.gif',
r'/.*office97_ad1\.(gif|jpe?g)',
r'/.*pathnet.warner\.gif',
r'/.*pbbobansm\.(gif|jpe?g)',
r'/.*powrbybo\.(gif|jpe?g)',
r'/.*s_msn\.gif',
r'/.*secureit\.gif',
r'/.*sqlbans\.(gif|jpe?g)',
r'/BannerImages/',
r'/BarnesandNoble/images/bn.recommend.box.*',
r'/Media/Images/Adds/',
r'/SmartBanner/',
r'/US/AD/',
r'/_banner/',
r'/ad[-_]container/',
r'/adcycle.cgi',
r'/adcycle/',
r'/adgenius/',
r'/adimages/',
r'/adproof/',
r'/adserve/',
r'/affiliate_banners/',
r'/annonser?/',
r'/anz/pics/',
r'/autoads/',
r'/av/gifs/av_logo\.gif',
r'/av/gifs/av_map\.gif',
r'/av/gifs/new/ns\.gif',
r'/bando/',
r'/bannerad/',
r'/bannerfarm/',
r'/bin/getimage.cgi/...\?AD',
r'/cgi-bin/centralad/',
r'/cgi-bin/getimage.cgi/....\?GROUP=',
r'/cgi-bin/nph-adclick.exe/',
r'/cgi-bin/nph-load',
r'/cgi-bin/webad.dll/ad',
r'/cgi/banners.cgi',
r'/cwmail/acc\.gif',
r'/cwmail/amzn-bm1\.gif',
r'/db_area/banrgifs/',
r'/digitaljam/images/digital_ban\.gif',
r'/free2try/',
r'/gfx/bannerdir/',
r'/gif/buttons/banner_.*',
r'/gif/buttons/cd_shop_.*',
r'/gif/cd_shop/cd_shop_ani_.*',
r'/gif/teasere/',
r'/grafikk/annonse/',
r'/graphics/advert',
r'/graphics/defaultAd/',
r'/grf/annonif',
r'/hotstories/companies/images/companies_banner\.gif',
r'/htmlad/',
r'/image\.ng/AdType',
r'/image\.ng/transactionID',
r'/images/.*/.*_anim\.gif',
r'/images/adds/',
r'/images/getareal2\.gif',
r'/images/locallogo.gif',
r'/img/special/chatpromo\.gif',
r'/include/watermark/v2/',
r'/ip_img/.*\.(gif|jpe?g)',
r'/ltbs/cgi-bin/click.cgi',
r'/marketpl*/',
r'/markets/images/markets_banner\.gif',
r'/minibanners/',
r'/ows-img/bnoble\.gif',
r'/ows-img/nb_Infoseek\.gif',
r'/p/d/publicid',
r'/pics/amzn-b5\.gif',
r'/pics/getareal1\.gif',
r'/pics/gotlx1\.gif',
r'/promotions/',
r'/rotads/',
r'/rotations/',
r'/torget/jobline/.*\.gif' r'/viewad/' r'/we_ba/', r'/werbung/', r'/world-banners/', r'/worldnet/ad\.cgi', r'/zhp/auktion/img/' ] def __init__(self): self.msg = '<No Error>' self.match = '' # Compile pattern list for performance self.patterns = map(re.compile, self.block_patterns) # Create base domains list from domains list self.base_domains = map(self.base_domain, self.block_domains) def reset_msg(self): self.msg = '<No Error>' def reset_match(self): self.msg = '' def check(self, url_obj): """ Check whether the url is junk. Return True if the url is O.K (not junk) and False otherwise """ self.reset_msg() self.reset_match() # Check domain first ret = self._check_domain(url_obj) if not ret: return ret # Check pattern next return self._check_pattern(url_obj) def base_domain(self, domain): if domain.count(".") > 1: strings = domain.split(".") return "".join((strings[-2], strings[-1])) else: return domain def _check_domain(self, url_obj): """ Check whether the url belongs to a junk domain. Return true if url is O.K (NOT a junk domain) and False otherwise """ # Get base server of the domain with port base_domain_port = url_obj.get_base_domain_with_port() # Get domain with port domain_port = url_obj.get_domain_with_port() # First check for domain if domain_port in self.block_domains: self.msg = '<Found domain match>' return False # Then check for base domain else: if base_domain_port in self.base_domains: self.msg = '<Found base-domain match>' return False return True def _check_pattern(self, url_obj): """ Check whether the url matches a junk pattern. 
Return true if url is O.K (not a junk pattern) and false otherwise """ url = url_obj.get_full_url() indx=0 for p in self.patterns: # Do a search, not match if p.search(url): self.msg = '<Found pattern match>' self.match = self.block_patterns[indx] return False indx += 1 return True def get_error_msg(self): return self.msg def get_match(self): return self.match if __name__=="__main__": # Test filter class filter = HarvestManJunkFilter() # Violates, should return False # The first two are direct domain matches, the # next two are base domain matches. u = urlparser.HarvestManUrl("http://a.tribalfusion.com/images/1.gif") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() u = urlparser.HarvestManUrl("http://stats.webtrendslive.com/cgi-bin/stats.pl") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() u = urlparser.HarvestManUrl("http://stats.cyberclick.net/cgi-bin/stats.pl") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() u = urlparser.HarvestManUrl("http://m.doubleclick.net/images/anim.gif") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() # The next are pattern matches u = urlparser.HarvestManUrl("http://www.foo.com/popupads/ad.gif") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() print '\tMatch=>',filter.get_match() u = urlparser.HarvestManUrl("http://www.foo.com/htmlad/1.html") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() print '\tMatch=>',filter.get_match() u = urlparser.HarvestManUrl("http://www.foo.com/logos/nbclogo.gif") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() print '\tMatch=>',filter.get_match() u = urlparser.HarvestManUrl("http://www.foo.com/bar/siteads/1.ad") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() print '\tMatch=>',filter.get_match() u = urlparser.HarvestManUrl("http://www.foo.com/banners/world-banners/banner.gif") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() print 
'\tMatch=>',filter.get_match() u = urlparser.HarvestManUrl("http://ads.foo.com/") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() print '\tMatch=>',filter.get_match() # This one should not match u = urlparser.HarvestManUrl("http://www.foo.com/doc/logo.gif") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url() # This also... u = urlparser.HarvestManUrl("http://www.foo.org/bar/vodka/pattern.html") print filter.check(u),filter.get_error_msg(),'=>',u.get_full_url()
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?