⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 deciderulesequencetest.java

📁 这是个爬虫，和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        // NOTE(review): the statements down to the first closing brace are the
        // tail of a test method whose signature lies above this chunk. They are
        // reproduced unchanged; only the line breaks have been restored.
        addDecideRule(new TooManyPathSegmentsDecideRule("SEGMENTS"));
        final int max =
            TooManyPathSegmentsDecideRule.DEFAULT_MAX_PATH_DEPTH.intValue();
        StringBuffer baseUri = new StringBuffer("http://archive.org");
        // Build a URI with exactly 'max' path segments: /1/2/.../max
        for (int i = 0; i < max; i++) {
            baseUri.append('/');
            baseUri.append(Integer.toString(i + 1));
        }
        UURI uuri = UURIFactory.getInstance(baseUri.toString());
        CandidateURI candidate = new CandidateURI(uuri);
        Object decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
            decision == DecideRule.PASS);
        // One more segment than 'max' is expected to be rejected.
        baseUri.append("/x");
        uuri = UURIFactory.getInstance(baseUri.toString());
        candidate = new CandidateURI(uuri);
        decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + DecideRule.REJECT + " but got " + decision,
            decision == DecideRule.REJECT);
    }

    /**
     * Adds a {@code MatchesFilePatternDecideRule} to the rule sequence and
     * checks that {@code http://archive.org/ms.doc} is decided ACCEPT while
     * {@code http://archive.org/index.html} is decided PASS.
     *
     * @throws InvalidAttributeValueException declared by {@code addDecideRule}
     * @throws URIException if a test URI cannot be built by {@code UURIFactory}
     */
    public void testMatchesFilePattern()
    throws InvalidAttributeValueException, URIException {
        addDecideRule(new MatchesFilePatternDecideRule("FILE_PATTERN"));
        StringBuffer baseUri = new StringBuffer("http://archive.org/");
        UURI uuri = UURIFactory.getInstance(baseUri.toString() + "ms.doc");
        CandidateURI candidate = new CandidateURI(uuri);
        Object decision = this.rule.decisionFor(candidate);
        // Decisions are compared by reference (==) throughout this class.
        assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
            decision == DecideRule.ACCEPT);
        uuri = UURIFactory.getInstance(baseUri.toString() + "index.html");
        candidate = new CandidateURI(uuri);
        decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
            decision == DecideRule.PASS);
    }

    /**
     * Mirror image of {@link #testMatchesFilePattern()}: with a
     * {@code NotMatchesFilePatternDecideRule} installed,
     * {@code ms.doc} is expected to PASS and {@code index.html} to be ACCEPTed.
     *
     * @throws InvalidAttributeValueException declared by {@code addDecideRule}
     * @throws URIException if a test URI cannot be built by {@code UURIFactory}
     */
    public void testNotMatchesFilePattern()
    throws InvalidAttributeValueException, URIException {
        addDecideRule(new NotMatchesFilePatternDecideRule("NOT_FILE_PATTERN"));
        StringBuffer baseUri = new StringBuffer("http://archive.org/");
        UURI uuri = UURIFactory.getInstance(baseUri.toString() + "ms.doc");
        CandidateURI candidate = new CandidateURI(uuri);
        Object decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
            decision == DecideRule.PASS);
        uuri = UURIFactory.getInstance(baseUri.toString() + "index.html");
        candidate = new CandidateURI(uuri);
        decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
            decision == DecideRule.ACCEPT);
    }

    /**
     * Shared helper that exercises whatever rule is currently installed with
     * candidates carrying progressively longer path strings.
     *
     * <p>Four candidates for {@code http://archive.org} are checked: one with
     * no path string, then ones whose path string consists of
     * {@code max - 1}, {@code max} and {@code max + 1} copies of
     * {@code pathExpansion}. The first three must yield
     * {@code defaultDecision}; the last must yield {@code overLimitDecision}.
     *
     * @param max the limit being tested; also used as the initial capacity of
     *        the path buffer
     * @param pathExpansion character repeated to build the path string passed
     *        as the second {@code CandidateURI} constructor argument
     *        (presumably the hop type -- confirm against CandidateURI)
     * @param defaultDecision decision expected at or below the limit; compared
     *        by reference, so it must be the same instance the rule returns
     * @param overLimitDecision decision expected once the limit is exceeded;
     *        also compared by reference
     * @throws URIException if the test URI cannot be built by
     *         {@code UURIFactory}
     */
    protected void testHopLimit(final int max, final char pathExpansion,
        final String defaultDecision, final String overLimitDecision)
    throws URIException {
        UURI uuri = UURIFactory.getInstance("http://archive.org");
        CandidateURI candidate = new CandidateURI(uuri);
        Object decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + defaultDecision + " but got " + decision,
            decision == defaultDecision);

        // Path of length max - 1: still within the limit.
        StringBuffer path = new StringBuffer(max);
        for (int i = 0; i < (max - 1); i++) {
            path.append(pathExpansion);
        }
        candidate = new CandidateURI(uuri, path.toString(), null, null);
        decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + defaultDecision + " but got " + decision,
            decision == defaultDecision);

        // Path of length max: exactly at the limit, still the default.
        path.append(pathExpansion);
        candidate = new CandidateURI(uuri, path.toString(), null, null);
        decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + defaultDecision + " but got " + decision,
            decision == defaultDecision);

        // Path of length max + 1: over the limit.
        path.append(pathExpansion);
        candidate = new CandidateURI(uuri, path.toString(), null, null);
        decision = this.rule.decisionFor(candidate);
        assertTrue("Expect " + overLimitDecision + " but got " + decision,
            decision == overLimitDecision);
    }

    /**
     * Exercises {@code ScopePlusOneDecideRule} first in host scope and then,
     * after a second rule instance is added to the same sequence, in domain
     * scope.
     *
     * <p>Candidates are chained through {@code setVia}: the in-scope URI,
     * then "plus one" (reached via the in-scope URI), "plus two" (reached via
     * plus one), and so on. The assertions record which of these are
     * expected to be ACCEPTed and which merely PASS.
     *
     * @throws URIException if a test URI cannot be built by {@code UURIFactory}
     * @throws InvalidAttributeValueException declared by {@code addDecideRule}
     *         and {@code setAttribute}
     * @throws AttributeNotFoundException declared by {@code setAttribute}
     * @throws MBeanException declared by {@code setAttribute}
     * @throws ReflectionException declared by {@code setAttribute}
     */
    public void testScopePlusOne()
    throws URIException, InvalidAttributeValueException,
        AttributeNotFoundException, MBeanException,
        ReflectionException {
        // first test host scope
        ScopePlusOneDecideRule t = new ScopePlusOneDecideRule("host");
        SurtPrefixSet mSet = new SurtPrefixSet();
        mSet.add(SurtPrefixSet.prefixFromPlain("http://audio.archive.org"));
        mSet.convertAllPrefixesToHosts();
        t.surtPrefixes = mSet;
        DecideRule s = addDecideRule(t);
        s.setAttribute(new Attribute(ScopePlusOneDecideRule.ATTR_SCOPE,
            ScopePlusOneDecideRule.HOST));

        UURI uuri =
            UURIFactory.getInstance("http://audio.archive.org/examples");
        CandidateURI candidate = new CandidateURI(uuri);
        Object decision = this.rule.decisionFor(candidate);
        assertTrue("URI Expect " + DecideRule.ACCEPT + " for " + candidate +
            " but got " + decision, decision == DecideRule.ACCEPT);

        UURI uuriOne = UURIFactory.getInstance("http://movies.archive.org");
        CandidateURI plusOne = new CandidateURI(uuriOne);
        plusOne.setVia(uuri);
        decision = this.rule.decisionFor(plusOne);
        assertTrue("PlusOne Expect " + DecideRule.ACCEPT + " for " + plusOne +
            " with via " + plusOne.flattenVia() + " but got " + decision,
            decision == DecideRule.ACCEPT);

        UURI uuriTwo = UURIFactory.getInstance("http://sloan.archive.org");
        CandidateURI plusTwo = new CandidateURI(uuriTwo);
        plusTwo.setVia(uuriOne);
        decision = this.rule.decisionFor(plusTwo);
        assertTrue("PlusTwo Expect " + DecideRule.PASS + " for " + plusTwo +
            " with via " + plusTwo.flattenVia() + " but got " + decision,
            decision == DecideRule.PASS);

        //now test domain scope
        // The domain-scoped rule is added to the same sequence, so the
        // host-scoped rule installed above is still present.
        ScopePlusOneDecideRule u = new ScopePlusOneDecideRule("domain");
        SurtPrefixSet mSet1 = new SurtPrefixSet();
        mSet1.add(SurtPrefixSet.prefixFromPlain("archive.org"));
        mSet1.convertAllPrefixesToDomains();
        u.surtPrefixes = mSet1;
        DecideRule v = addDecideRule(u);
        v.setAttribute(new Attribute(ScopePlusOneDecideRule.ATTR_SCOPE,
            ScopePlusOneDecideRule.DOMAIN));

        decision = this.rule.decisionFor(candidate);
        assertTrue("Domain: URI Expect " + DecideRule.ACCEPT + " for " +
            candidate + " but got " + decision, decision == DecideRule.ACCEPT);

        decision = this.rule.decisionFor(plusOne);
        assertTrue("Domain: PlusOne Expect " + DecideRule.ACCEPT + " for " +
            plusOne + " with via "  + plusOne.flattenVia() + " but got " +
            decision, decision == DecideRule.ACCEPT);

        decision = this.rule.decisionFor(plusTwo);
        assertTrue("Domain: PlusTwo Expect " + DecideRule.ACCEPT + " for " +
            plusTwo + " with via " + plusTwo.flattenVia() + " but got " +
            decision, decision == DecideRule.ACCEPT);

        UURI uuriThree = UURIFactory.getInstance("http://sloan.org");
        CandidateURI plusThree = new CandidateURI(uuriThree);
        plusThree.setVia(uuriTwo);
        decision = this.rule.decisionFor(plusThree);
        assertTrue("Domain: PlusThree Expect " + DecideRule.ACCEPT + " for " +
            plusThree + " with via " + plusThree.flattenVia() + " but got " +
            decision, decision == DecideRule.ACCEPT);

        UURI uuriFour = UURIFactory.getInstance("http://example.com");
        CandidateURI plusFour = new CandidateURI(uuriFour);
        plusFour.setVia(uuriThree);
        decision = this.rule.decisionFor(plusFour);
        assertTrue("Domain: PlusFour Expect " + DecideRule.PASS + " for " +
            plusFour + " with via " + plusFour.flattenVia() + " but got " +
            decision, decision == DecideRule.PASS);
    }

public void testFilter()    throws InvalidAttributeValueException, URIException, AttributeNotFoundException, MBeanException, ReflectionException {    	FilterDecideRule dr = new FilterDecideRule(			"FilterDecideRule(ContentTypeRegExpFilter)");        addDecideRule(dr);        StringBuffer baseUri = new StringBuffer();        UURI uuri = UURIFactory.getInstance("http://example.com/foo");        CrawlURI curi = new CrawlURI(uuri);        curi.setContentType("text/html");        Object decision = this.rule.decisionFor(curi);        // default for unconfigured FilterDecideRule is true from (empty)        // filters, then ACCEPT because of true        assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,            decision == DecideRule.ACCEPT);        ContentTypeRegExpFilter filt =         	new ContentTypeRegExpFilter("ContentTypeRegExpFilter","app.*");        dr.filters.addElement(null,filt);        decision = this.rule.decisionFor(curi);        // filter should now return false, making decision REJECT        assertTrue("Expect " + DecideRule.REJECT + " but got " + decision,            decision == DecideRule.REJECT);        curi.setContentType("application/octet-stream");        decision = this.rule.decisionFor(curi);        // filter should now return true, making decision ACCEPT        assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,                decision == DecideRule.ACCEPT);        // change true answer to "PASS"; use String to simulate settings non-identity        dr.setAttribute(new Attribute(FilterDecideRule.ATTR_TRUE_DECISION,"PASS"));        decision = this.rule.decisionFor(curi);        assertTrue("Expect " + DecideRule.PASS + " but got " + decision,                decision == DecideRule.PASS);           }        protected DecideRule addDecideRule(DecideRule dr)    throws InvalidAttributeValueException {        MapType rules = this.rule.getRules(null);        rules.addElement(null, dr);        return dr;    }        public 
void testContentTypeMatchesRegexpDecideRule() throws Exception{        ContentTypeMatchesRegExpDecideRule dr = new ContentTypeMatchesRegExpDecideRule("CTMREDRtest");        DecideRule v = addDecideRule(dr);                v.setAttribute(new Attribute(MatchesRegExpDecideRule.ATTR_REGEXP,"text/html"));        UURI uuri = UURIFactory.getInstance("http://www.archive.org");        CrawlURI crawlUri = new CrawlURI(uuri);        // no content type - let curi pass        Object decision = this.rule.decisionFor(crawlUri);        assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +                " but got " + decision, decision == DecideRule.PASS);                // non-matching content type - let curi pass        crawlUri.setContentType("application/pdf");        decision = this.rule.decisionFor(crawlUri);        assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +                " but got " + decision, decision == DecideRule.PASS);                  // matching content type - accept curi        crawlUri.setContentType("text/html");        decision = this.rule.decisionFor(crawlUri);        assertTrue("URI Expect " + DecideRule.ACCEPT + " for " + crawlUri +                " but got " + decision, decision == DecideRule.ACCEPT);    }        public void testContentTypeNotMatchesRegexpDecideRule() throws Exception{        ContentTypeNotMatchesRegExpDecideRule dr = new ContentTypeNotMatchesRegExpDecideRule("CTNMREDRtest");        DecideRule v = addDecideRule(dr);                v.setAttribute(new Attribute(MatchesRegExpDecideRule.ATTR_REGEXP,"text/html"));        UURI uuri = UURIFactory.getInstance("http://www.archive.org");        CrawlURI crawlUri = new CrawlURI(uuri);        // no content type - let curi pass        Object decision = this.rule.decisionFor(crawlUri);        assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +                " but got " + decision, decision == DecideRule.PASS);                // matching content type - let 
curi pass        crawlUri.setContentType("text/html");        decision = this.rule.decisionFor(crawlUri);        assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +                " but got " + decision, decision == DecideRule.PASS);                // non-matching content type - accept curi        crawlUri.setContentType("application/pdf");        decision = this.rule.decisionFor(crawlUri);        assertTrue("URI Expect " + DecideRule.ACCEPT + " for " + crawlUri +                " but got " + decision, decision == DecideRule.ACCEPT);    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -