📄 parsertest.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 2 3
        NodeClassFilter filter = new NodeClassFilter (LinkTag.class);
        for (NodeIterator e = parser.elements();e.hasMoreNodes();)
            e.nextNode().collectInto(collectionList,filter);
        assertEquals("Size of collection vector should be 11",11,collectionList.size());
        // All items in collection vector should be links
        for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) {
            Node node = e.nextNode();
            assertTrue("Only links should have been parsed",node instanceof LinkTag);
        }
    }

    /**
     * Collects every node matching the tag name "IMG" from a sample page and
     * checks that exactly five nodes are found and that each is an ImageTag.
     */
    public void testImageCollection() throws ParserException {
        createParser(
        "<html>\n"+
        "<head>\n"+
            "<meta name=\"generator\" content=\"Created Using Yahoo! PageBuilder 2.60.24\">\n"+
        "</head>\n"+
        "<body bgcolor=\"#FFFFFF\" link=\"#0000FF\" vlink=\"#FF0000\" text=\"#000000\"\n"+
        " onLoad=\"window.onresize=new Function('if (navigator.appVersion==\'Netscape\') history.go(0);');\">\n"+
        "<div id=\"layer0\" style=\"position:absolute;left:218;top:40;width:240;height:26;\">\n"+
        "<table width=240 height=26 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        "<td><b><font size=\"+2\"><span style=\"font-size:24\">NISHI-HONGWAN-JI</span></font></b></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer1\" style=\"position:absolute;left:75;top:88;width:542;height:83;\">\n"+
        "<table width=542 height=83 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        // One long paragraph, split into adjacent literals purely for readability.
        "<td><span style=\"font-size:14\">The Nihi Hongwanj-ji temple is very traditional, very old, and very beautiful. "+
        "This is the place that we stayed on our first night in Kyoto. "+
        "We then attended the morning prayer ceremony, at 6:30 am. "+
        "Staying here costed us 7,500 yen, which was inclusive of dinner and breakfast, and usage of the o-furo (public bath). "+
        "Felt more like a luxury hotel than a temple.</span></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer2\" style=\"position:absolute;left:144;top:287;width:128;height:96;\">\n"+
        "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        "<td><a href=\"nishi-hongwanji1.html\"><img height=96 width=128 src=\"nishi-hongwanji1-thumb.jpg\" border=0 ></a></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer3\" style=\"position:absolute;left:415;top:285;width:128;height:96;\">\n"+
        "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        "<td><a href=\"nishi-hongwanji3.html\"><img height=96 width=128 src=\"nishi-hongwanji2-thumb.jpg\" border=0 ></a></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer4\" style=\"position:absolute;left:414;top:182;width:128;height:96;\">\n"+
        "<table width=128 height=96 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        "<td><a href=\"higashi-hongwanji.html\"><img height=96 width=128 src=\"higashi-hongwanji-thumb.jpg\" border=0 ></a></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer5\" style=\"position:absolute;left:78;top:396;width:530;height:49;\">\n"+
        "<table width=530 height=49 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        // One long paragraph, split into adjacent literals purely for readability.
        "<td><span style=\"font-size:14\">Click on the pictures to see the full-sized versions. "+
        "The picture at the top right corner is taken in Higashi-Hongwanji. "+
        "Nishi means west, and Higashi means east. "+
        "These two temples are adjacent to each other and represent two different Buddhist sects.</span></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer6\" style=\"position:absolute;left:143;top:180;width:128;height:102;\">\n"+
        "<table width=128 height=102 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        "<td><a href=\"nishi-hongwanji4.html\"><img height=102 width=128 src=\"nishi-hongwanji4-thumb.jpg\" border=0 ></a></td>\n"+
        "</tr></table></div>\n"+
        "<div id=\"layer7\" style=\"position:absolute;left:280;top:235;width:124;height:99;\">\n"+
        "<table width=124 height=99 border=0 cellpadding=0 cellspacing=0><tr valign=\"top\">\n"+
        "<td><a href=\"nishi-hongwanji-lodging.html\"><img height=99 width=124 src=\"nishi-hongwanji-lodging-thumb.jpg\" border=0 ></a></td>\n"+
        "</tr></table></div>\n"+
        "</body>\n"+
        "</html>");
        NodeList collectionList = new NodeList();
        TagNameFilter filter = new TagNameFilter ("IMG");
        for (NodeIterator e = parser.elements();e.hasMoreNodes();)
            e.nextNode().collectInto(collectionList,filter);
        assertEquals("Size of collection vector should be 5",5,collectionList.size());
        // All items in collection vector should be images
        for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) {
            Node node = e.nextNode();
            assertTrue("Only images should have been parsed",node instanceof ImageTag);
        }
    }

    /**
     * See bug #728241 OutOfMemory error/ Infinite loop
     * <p>
     * Iterates over every node of a page containing an embedded quote inside an
     * attribute value followed by a tag split across lines. There are no
     * assertions; the test passes when the iteration finishes without throwing.
     */
    public void testOutOfMemory () throws Exception
    {
        createParser (
            "<html><head></head>\n"
            + "<body>\n"
            + "<table>\n"
            + "<tr>\n"
            + "      <td><img src=\"foo.jpg\" alt=\"f's b\"><font\n"
            + " size=1>blah</font>\n"
            + "</td>\n"
            + "</tr>\n"
            + "</table>\n"
            + "</body></html>\n");
        for (NodeIterator e = parser.elements();e.hasMoreNodes();) {
            e.nextNode();
        }
    }

    /**
     * See bug #729368 Embedded quote and split tag
     * <p>
     * Parses with a PrototypicalNodeFactory constructed with {@code true} and
     * checks that the node at index 10 is a Tag whose ALT attribute is
     * {@code f's b}, and that 21 nodes are returned in total.
     */
    public void testEmbeddedQuoteSplit () throws Exception
    {
        createParser (
            "<html><head></head>\n"
            + "<body>\n"
            + "<table>\n"
            + "<tr><td><img src=\"x\" alt=\"f's b\"><font\n"
            + "size=1>blah</font></td></tr>\n"
            + "</table>\n"
            + "</body></html>");
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        int i = 0;
        for (NodeIterator e = parser.elements();e.hasMoreNodes();)
        {
            Node node = e.nextNode();
            if (10 == i)
            {
                assertTrue ("not a tag", node instanceof Tag);
                // assertEquals reports a missing attribute as a failure instead of
                // throwing a NullPointerException from a call on a null value.
                assertEquals ("ALT attribute incorrect", "f's b", ((Tag)node).getAttribute ("ALT"));
            }
            i++;
        }
        assertEquals("Expected nodes",21,i);
    }

    /**
     * See bug #826764 ParserException occurs only when using setInputHTML() instea
     * (the bug title is truncated in the original comment; presumably "instead of ...").
     * <p>
     * Writes a small page with three links to a temporary file in the working
     * directory, reads it back into a string, feeds that string to the parser
     * through setInputHTML() and checks that three LinkTag nodes are extracted.
     * Any exception, including a ParserException, fails the test with the
     * exception's string form. The temporary file is always deleted.
     */
    public void testSetInputHTML () throws Exception
    {
        String html;
        String path;
        File file;
        PrintWriter out;
        DataInputStream stream;
        byte[] buffer;
        Node[] nodes;

        html = "<html></html>";
        createParser (html);
        path = System.getProperty ("user.dir");
        if (!path.endsWith (File.separator))
            path += File.separator;
        file = new File (path + "delete_me.html");
        try
        {
            out = new PrintWriter (new FileWriter (file));
            try
            {
                out.print ("<html>\r\n");
                out.print ("<head>\r\n");
                out.print ("<!-- BEGIN TYPE -->\r\n");
                out.print ("<!-- NAVIGATION -->\r\n");
                out.print ("<!-- END TYPE -->\r\n");
                out.print ("<!-- BEGIN TITLE -->\r\n");
                out.print ("<title>Einstiegsseite</title>\r\n");
                out.print ("<!-- END TITLE -->\r\n");
                out.print ("</head>\r\n");
                out.print ("<body>\r\n");
                out.print ("<ul>\r\n");
                // Three list items that differ only in the chapter number.
                for (int chapter = 1; chapter <= 3; chapter++)
                {
                    out.print ("<li>\r\n");
                    out.print ("<!-- BEGIN ITEM -->\r\n");
                    out.print ("<!-- BEGIN REF -->\r\n");
                    out.print ("<a href=\"kapitel" + chapter + "/index.html\">\r\n");
                    out.print ("<!-- END REF -->\r\n");
                    out.print ("<!-- BEGIN REFTITLE -->\r\n");
                    out.print ("Kapitel " + chapter + "\r\n");
                    out.print ("<!-- END REFTITLE -->\r\n");
                    out.print ("</a>\r\n");
                    out.print ("<!-- END ITEM -->\r\n");
                    out.print ("</li>\r\n");
                }
                out.print ("</ul>\r\n");
                out.print ("</body>\r\n");
                out.print ("</html>");
            }
            finally
            {
                // Close even if a write fails so the file handle is not leaked.
                out.close ();
            }
            stream = new DataInputStream (
                new BufferedInputStream (new FileInputStream (file)));
            try
            {
                buffer = new byte[(int)file.length ()];
                stream.readFully (buffer);
            }
            finally
            {
                // Release the handle before the file is deleted below.
                stream.close ();
            }
            html = new String (buffer);
            // A ParserException here is the failure this test guards against, so it
            // is left to reach the catch below instead of being replaced by an
            // empty result that would only show up as a wrong node count.
            parser.setInputHTML (html);
            nodes = parser.extractAllNodesThatAre (LinkTag.class);
            assertEquals ("node count", 3, nodes.length);
        }
        catch (Exception e)
        {
            fail (e.toString ());
        }
        finally
        {
            file.delete ();
        }
    }

    /**
     * Test reproducing a java.lang.StackOverflowError.
     * <p>
     * Parses a single XML-style empty tag and checks that toString() on the
     * resulting node returns a string containing the tag contents.
     */
    public void testXMLTypeToString () throws Exception
    {
        String guts;
        String output;

        guts = "TD width=\"69\"/";
        createParser ("<" + guts + ">");
        parseAndAssertNodeCount (1);
        output = node[0].toString (); // this was where StackOverflow was thrown
        assertTrue ("bad toString()", -1 != output.indexOf (guts));
    }

    /**
     * See bug #883664 toUpperCase on tag names and attributes depends on locale
     * <p>
     * Switches the default locale to Turkish, parses a title tag and checks that
     * toHtml() reproduces the input. The original default locale is restored
     * afterwards.
     */
    public void testDifferentLocale () throws Exception
    {
        String html;
        Locale original;

        html = "<title>This is supposedly Turkish.</title>";
        original = Locale.getDefault ();
        try
        {
            Locale.setDefault (new Locale ("tr")); // turkish
            createParser (html);
            parseAndAssertNodeCount (1);
            assertStringEquals ("html", html, node[0].toHtml ());
        }
        finally
        {
            Locale.setDefault (original);
        }
    }

    /**
     * See bug #900128 RemarkNode.setText() does not set Text
     * <p>
     * Checks the single Text child of a body tag before and after setText():
     * toHtml() and getText() must equal the current text, toString() must end
     * with it, and the parent's toHtml() must reflect the change.
     */
    public void testSetStringText () throws Exception
    {
        String text;
        String html;
        String newtext;
        String newhtml;
        Node txt;

        text = "This is just text.";
        html = "<body>" + text + "</body>";
        newtext = "This is different text.";
        newhtml = "<body>" + newtext + "</body>";
        createParser (html);
        parseAndAssertNodeCount (1);
        assertStringEquals ("html wrong", html, node[0].toHtml ());
        assertEquals ("wrong number of children", 1, node[0].getChildren ().size ());
        assertTrue ("string node expected", node[0].getChildren ().elementAt (0) instanceof Text);
        txt = node[0].getChildren ().elementAt (0);
        assertStringEquals ("string html wrong", text, txt.toHtml ());
        assertStringEquals ("string contents wrong", text, txt.getText ());
        assertTrue ("toString wrong", txt.toString ().endsWith (text));
        txt.setText (newtext);
        assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
        assertStringEquals ("new string html wrong", newtext, txt.toHtml ());
        assertStringEquals ("new string contents wrong", newtext, txt.getText ());
        assertTrue ("toString wrong", txt.toString ().endsWith (newtext));
    }

    /**
     * See bug #900128 RemarkNode.setText() does not set Text
     * <p>
     * Checks the single Remark child of a body tag before and after setText().
     * setText() is called first with the bare remark contents and then with the
     * contents wrapped in comment delimiters; the same results are expected in
     * both cases.
     */
    public void testSetRemarkText () throws Exception
    {
        String text;
        String remark;
        String html;
        String newtext;
        String newremark;
        String newhtml;
        Node rem;

        text = " This is a remark. ";
        remark = "<!--" + text + "-->";
        html = "<body>" + remark + "</body>";
        newtext = " This is a different remark. ";
        newremark = "<!--" + newtext + "-->";
        newhtml = "<body>" + newremark + "</body>";
        createParser (html);
        parseAndAssertNodeCount (1);
        assertStringEquals ("html wrong", html, node[0].toHtml ());
        assertEquals ("wrong number of children", 1, node[0].getChildren ().size ());
        assertTrue ("remark node expected", node[0].getChildren ().elementAt (0) instanceof Remark);
        rem = node[0].getChildren ().elementAt (0);
        assertStringEquals ("remark html wrong", remark, rem.toHtml ());
        assertStringEquals ("remark contents wrong", text, rem.getText ());
        assertTrue ("toString wrong", rem.toString ().endsWith (text));
        // First with the bare contents ...
        rem.setText (newtext);
        assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
        assertStringEquals ("new remark html wrong", newremark, rem.toHtml ());
        assertStringEquals ("new remark contents wrong", newtext, rem.getText ());
        assertTrue ("toString wrong", rem.toString ().endsWith (newtext));
        // ... then with the comment delimiters included; results must be the same.
        rem.setText (newremark);
        assertStringEquals ("html wrong", newhtml, node[0].toHtml ());
        assertStringEquals ("new remark html wrong", newremark, rem.toHtml ());
        assertStringEquals ("new remark contents wrong", newtext, rem.getText ());
        assertTrue ("toString wrong", rem.toString ().endsWith (newtext));
    }

    /**
     * Checks that a Parser constructed from a URL containing spaces reports the
     * URL with each space replaced by %20.
     */
    public void testFixSpaces () throws ParserException
    {
        String url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html";
        parser = new Parser (url);
        assertEquals("Expected","http://htmlparser.sourceforge.net/test/This%20is%20a%20Test%20Page.html", parser.getURL ());
    }
}
上一页 1 2 3
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -