📄 extractorhtmltest.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                Link link = (Link) i.next();
                // Count and print only links whose hop type is none of
                // SPECULATIVE_HOP, NAVLINK_HOP or EMBED_HOP.
                if (link.getHopType() != Link.SPECULATIVE_HOP
                        && link.getHopType() != Link.NAVLINK_HOP
                        && link.getHopType() != Link.EMBED_HOP) {
                    count++;
                    System.out.println(link.getHopType() + " "
                            + link.getDestination());
                }
            }
        }
        System.out.println("TOTAL URIS EXTRACTED: "+count);
    }

    /**
     * Test a particular {@code <embed src=...>} construct that was suspicious
     * in the No10GovUk crawl.
     *
     * Passes when at least one extracted outlink's destination contains the
     * path of the embedded movie.
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testEmbedSrc() throws URIException {
        CrawlURI curi =
            new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
        // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
        CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/" +
            "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
            "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
            "quicktime/download/\" /> ";
        this.extractor.extract(curi, cs);
        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((Link) object).getDestination().toString().contains(
                        "/documents/prem/18/1/graphics/qtvr/hall.mov");
            }
        }));
    }

    /**
     * Test a whitespace issue found in href: the attribute value below ends
     * with several newline characters before its closing quote.
     *
     * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
     * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
     *
     * Passes when at least one extracted outlink's destination contains
     * "http://www.carsound.dk/".
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testHrefWhitespace() throws URIException {
        CrawlURI curi =
            new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
        CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n" +
            "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";
        this.extractor.extract(curi, cs);
        assertTrue("Not stripping new lines", CollectionUtils.exists(
                curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((Link) object).getDestination().toString().contains(
                        "http://www.carsound.dk/");
            }
        }));
    }

    /**
     * Test a missing whitespace issue: the FRAME tag below has no whitespace
     * between the quoted NAME value and the SRC attribute.
     *
     * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without
     * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128
     *
     * Expects exactly one outlink, whose destination equals
     * "http://www.example.com/".
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testNoWhitespaceBeforeValidAttribute() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs = "<frame name=\"main\"src=\"http://www.example.com/\"> ";
        this.extractor.extract(curi, cs);
        Link[] links = curi.getOutLinks().toArray(new Link[0]);
        assertTrue("no links found", links.length == 1);
        assertTrue("expected link not found",
                links[0].getDestination().toString().equals("http://www.example.com/"));
    }

    /**
     * Test only extract FORM ACTIONS with METHOD GET.
     *
     * [HER-1280] do not by default GET form action URLs declared as POST,
     * because it can cause problems/complaints
     * http://webteam.archive.org/jira/browse/HER-1280
     *
     * The input holds four forms: two with method "get", one with method
     * "post" and one with no method attribute. Exactly three outlinks are
     * expected.
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testOnlyExtractFormGets() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs =
            "<form method=\"get\" action=\"http://www.example.com/ok1\"> " +
            "<form action=\"http://www.example.com/ok2\" method=\"get\"> " +
            "<form method=\"post\" action=\"http://www.example.com/notok\"> " +
            "<form action=\"http://www.example.com/ok3\"> ";
        this.extractor.extract(curi, cs);
        Link[] links = curi.getOutLinks().toArray(new Link[0]);
        assertTrue("incorrect number of links found", links.length == 3);
    }

    /**
     * Test that relative URIs with late colons aren't misinterpreted
     * as absolute URIs with long, illegal scheme components.
     *
     * See http://webteam.archive.org/jira/browse/HER-1268
     *
     * Both hrefs below contain ':' after the path; each must show up
     * (as a path under the base) among the extracted outlinks.
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testBadRelativeLinks() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs = "<a href=\"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value\"/>"
                + "<a href=\"example.html?parameter=this:value\"/>";
        this.extractor.extract(curi, cs);

        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((Link) object).getDestination().toString().contains(
                        "/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value");
            }
        }));

        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((Link) object).getDestination().toString().contains(
                        "/example.html?parameter=this:value");
            }
        }));
    }

    /**
     * Test if scheme is maintained by speculative hops onto exact
     * same host.
     *
     * [HER-1524] speculativeFixup in ExtractorJS should maintain URL scheme
     *
     * The base URI is https. The string naming a different host is expected
     * as an http link, while the string naming the base host is expected to
     * keep the https scheme.
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testSpeculativeLinkExtraction() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("https://www.example.com"));
        CharSequence cs =
            "<script type=\"text/javascript\">_parameter=\"www.anotherexample.com\";"
                + "_anotherparameter=\"www.example.com/index.html\""
                + ";</script>";
        this.extractor.extract(curi, cs);

        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((Link) object).getDestination().toString().equals(
                        "http://www.anotherexample.com/");
            }
        }));
        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((Link) object).getDestination().toString().equals(
                        "https://www.example.com/index.html");
            }
        }));
    }

    /**
     * Test to see if embedded {@code <SCRIPT/>} which writes script TYPE
     * creates any outlinks, e.g. "type='text/javascript'".
     *
     * [HER-1526] SCRIPT writing script TYPE common trigger of bogus links
     *   (eg. 'text/javascript')
     *
     * Expects no outlinks at all for this input.
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void testScriptTagWritingScriptType() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com/en/fiche/dossier/322/"));
        CharSequence cs =
            "<script type=\"text/javascript\">"
            + "var gaJsHost = ((\"https:\" == document.location.protocol) "
            + "? \"https://ssl.\" : \"http://www.\");"
            + "document.write(unescape(\"%3Cscript src='\" + gaJsHost + "
            + "\"google-analytics.com/ga.js' "
            + "type='text/javascript'%3E%3C/script%3E\"));"
            + "</script>";
        this.extractor.extract(curi, cs);
        assertTrue("outlinks should be empty", curi.getOutLinks().isEmpty());
    }

    /**
     * False test: tries to verify extractor ignores a 'longDesc'
     * attribute. In fact, HTML spec says longDesc is a URI, so
     * crawler should find 2 links here.
     * See [HER-206]
     *
     * NOTE(review): the method name starts with "xest" rather than "test",
     * presumably so that name-based test discovery skips it -- confirm.
     * The assertion below still expects exactly 1 link, which contradicts
     * the statement above that 2 links should be found.
     *
     * @throws URIException declared because the URI set-up/extraction calls
     *         used here may throw it
     */
    public void xestAvoidBadSpec() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        // NOTE(review): the characters around "Overlord" in the caption row
        // look mis-encoded in this copy of the file; kept as found.
        CharSequence cs =
            "<TBODY>\r\n" +
            "<TR>\r\n" +
            "<TD><IMG height=259 alt=\"Operation Overlord Commanders\"\r\n" +
            "src=\"/img/aboutus/history/dday60/commanders.jpg\"\r\n" +
            "width=500 longDesc=\"Overlord Commanders, Back row, left\r\n" +
            "to right:<BR>Lieutenant General Bradley, Admiral\r\n" +
            "Ramsay, Air Chief Marshal Leigh-Mallory, General Bedell\r\n" +
            "Smith.<BR>Front row, left to right: Air Chief Marshal\r\n" +
            "Tedder, General Eisenhower, General Montgomery.\"></TD></TR>\r\n" +
            "<TR>\r\n" +
            "<TD class=caption>�Overlord� Commanders, Back row, left\r\n" +
            "to right:<BR>Lieutenant General Bradley, Admiral\r\n" +
            "Ramsay, Air Chief Marshal Leigh-Mallory, General Bedell\r\n" +
            "Smith.<BR>Front row, left to right: Air Chief Marshal\r\n" +
            "Tedder, General Eisenhower, General\r\n" +
            "Montgomery.</TD></TR></TBODY></TABLE>\r\n" +
            "<P>\r\n" +
            "<TABLE id=imageinset width=\"35%\" align=right\r\n" +
            "summary=\"Key Facts About the Allied Forces Deployed on\r\n" +
            "D-Day\" border=0>\r\n" +
            "<TBODY>";
        this.extractor.extract(curi, cs);
        Link[] links = curi.getOutLinks().toArray(new Link[0]);
        assertTrue("incorrect number of links found", links.length == 1);
    }

    /**
     * Command-line entry point: runs the extractor against one URL or path.
     *
     * Prints a usage message to stderr and exits with status 1 unless one or
     * two arguments are given. Otherwise it creates a test instance, calls
     * {@code setUp()}, runs {@code runExtractor} on the URI obtained from
     * {@code getUURI(args[0])}, and always calls {@code tearDown()} afterwards.
     *
     * @param args {@code URL|PATH [ENCODING]}; when ENCODING is absent,
     *        {@code null} is passed to {@code runExtractor}
     * @throws Exception anything thrown by set-up, extraction or tear-down
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1 && args.length != 2) {
            System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
                " URL|PATH [ENCODING]");
            System.exit(1);
        }
        ExtractorHTMLTest testCase = new ExtractorHTMLTest();
        testCase.setUp();
        try {
            testCase.runExtractor(testCase.getUURI(args[0]),
                (args.length == 2) ? args[1] : null);
        } finally {
            // Tear down even when extraction throws.
            testCase.tearDown();
        }
    }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -