📄 lexertests.java

📁 html to xml convertor
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 2 3
     * encoding. We detect failure by looking for weird tag names which were     * not correctly handled as string nodes.     * <p>     * Here is a partial dump of the page with escape sequences:     * <pre>     * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43     * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61     * ..     * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a     * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c     * ..     * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25     * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a     * ..     * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25     * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f     * ..     * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43     * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d     * ..     * 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21     * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28     * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22     * </pre>     * <p>     * The fix proposed by j_s_nightingale is implemented to swallow JIS     * escape sequences in the string parser.     * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may     * still be a problem.     * It's theoretically possible that JIS encoding, or another one,     * could be used as attribute names or values within tags as well,     * but this is considered improbable and is therefore not handled in     * the tag parser state machine.     
*/    public void testJIS ()        throws ParserException    {        Parser parser;        NodeIterator iterator;                parser = new Parser ("http://www.009.com/");        try        {            iterator = parser.elements ();            while (iterator.hasMoreNodes ())                checkTagNames (iterator.nextNode ());        }        catch (EncodingChangeException ece)        {            parser.reset ();            iterator = parser.elements ();            while (iterator.hasMoreNodes ())                checkTagNames (iterator.nextNode ());        }    }    /**     * Check the tag name for one of the ones expected on the page.     * Recursively check the children.     */    public void checkTagNames (Node node)    {        Tag tag;        String name;        NodeList children;                if (node instanceof Tag)        {            tag = (Tag)node;            name = tag.getTagName ();            if (!mAcceptable.contains (name))                fail ("unrecognized tag name \"" + name + "\"");            children = tag.getChildren ();            if (null != children)                for (int i = 0; i < children.size (); i++)                    checkTagNames (children.elementAt (i));        }    }    /**     * See bug #825820 Words conjoined     */    public void testConjoined ()        throws            ParserException    {        StringBuffer buffer;        NodeIterator iterator;        Node node;        String expected;        expected = "The Title\nThis is the body.";        String html1 = "<html><title>The Title\n</title>" +            "<body>This is <a href=\"foo.html\">the body</a>.</body></html>";        createParser (html1);        buffer = new StringBuffer ();        for (iterator = parser.elements (); iterator.hasMoreNodes (); )        {            node = iterator.nextNode ();            String text = node.toPlainTextString ();            buffer.append (text);        }        assertStringEquals ("conjoined text", expected, buffer.toString 
());        String html2 = "<html><title>The Title</title>\n" +            "<body>This is <a href=\"foo.html\">the body</a>.</body></html>";        createParser (html2);        buffer = new StringBuffer ();        for (iterator = parser.elements (); iterator.hasMoreNodes (); )        {            node = iterator.nextNode ();            String text = node.toPlainTextString ();            buffer.append (text);        }        assertStringEquals ("conjoined text", expected, buffer.toString ());                String html3 = "<html><title>The Title</title>" +            "<body>\nThis is <a href=\"foo.html\">the body</a>.</body></html>";        createParser (html3);        buffer = new StringBuffer ();        for (iterator = parser.elements (); iterator.hasMoreNodes (); )        {            node = iterator.nextNode ();            String text = node.toPlainTextString ();            buffer.append (text);        }        assertStringEquals ("conjoined text", expected, buffer.toString ());    }    /**     * Check for StackOverflow error.     
*/
    public void testStackOverflow() throws ParserException {
        // Three spellings of the same self-closing anchor: with and without
        // spaces around '=', and with and without a space before "/>".
        final String[] anchors = {
            "<a href = \"http://test.com\" />",
            "<a href=\"http://test.com\"/>",
            "<a href = \"http://test.com\"/>"
        };

        for (int which = 0; which < anchors.length; which++) {
            final String markup = anchors[which];
            createParser(markup);
            NodeIterator nodes = parser.elements();
            while (nodes.hasMoreNodes()) {
                Node current = nodes.nextNode();
                // Every node produced must render back to the input markup.
                assertStringEquals("no overflow", markup, current.toHtml());
            }
        }
    }

    /**
     * See bug #880283 Character "&gt;" erroneously inserted by Lexer.
     * A JSP expression containing a quoted tag must come back from the
     * lexer as exactly one node whose HTML equals the input.
     * @exception ParserException If the lexer fails.
     */
    public void testJsp() throws ParserException {
        final String source =
            "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>";
        Lexer jspLexer = new Lexer(source);

        Node first = jspLexer.nextNode();
        if (first != null) {
            assertStringEquals("bad html", source, first.toHtml());
        } else {
            fail("too few nodes");
        }
        assertNull("too many nodes", jspLexer.nextNode());
    }

    /**
     * Unit test for new PI parsing code.
*/
    public void testPI() throws ParserException {
        final String source = "<?php print(\"<p>Hello World!</p>\"); ?>";
        Lexer piLexer = new Lexer(source);

        // The whole processing instruction must come back as a single node.
        Node first = piLexer.nextNode();
        if (first != null) {
            assertStringEquals("bad html", source, first.toHtml());
        } else {
            fail("too few nodes");
        }
        assertNull("too many nodes", piLexer.nextNode());
    }

    /**
     * See bug #899413 bug in javascript end detection.
     * Script text containing a backslash-escaped single quote inside a
     * single-quoted string is followed by a closing script tag. The first
     * node must reproduce the script text, and exactly one more node must
     * follow it.
     * @exception ParserException If the lexer fails.
     */
    public void testEscapedQuote() throws ParserException {
        final String script = "\na='\\'';\n";
        final String source = script + "</script>";
        Lexer scriptLexer = new Lexer(source);

        // NOTE(review): the 'true' argument presumably turns on quote-aware
        // scanning in the lexer -- confirm against Lexer.nextNode(boolean).
        Node first = scriptLexer.nextNode(true);
        if (first != null) {
            assertStringEquals("bad string", script, first.toHtml());
        } else {
            fail("too few nodes");
        }
        assertNotNull("too few nodes", scriptLexer.nextNode(true));
        assertNull("too many nodes", scriptLexer.nextNode(true));
    }

    /**
     * See bug #1227213 Particular SCRIPT tags close too late.
*/
    public void testCommentInScript() throws ParserException {
        final String open = "<script>";
        final String code = "<!--document.write(\"en\");// -->";
        final String close = "</script>";
        final String page = open + code + close;

        Parser scriptParser = new Parser();
        scriptParser.setInputHTML(page);
        NodeIterator nodes = scriptParser.elements();

        Node first = nodes.nextNode();
        if (first != null) {
            assertStringEquals("bad parse", page, first.toHtml());
        } else {
            fail("too few nodes");
        }
        // The single node must be a script tag whose code is the commented text.
        assertTrue(first instanceof ScriptTag);
        ScriptTag script = (ScriptTag) first;
        assertStringEquals("bad cdata", code, script.getScriptCode());
        assertNull("too many nodes", nodes.nextNode());
    }

    /**
     * See bug #1227213 Particular SCRIPT tags close too late.
     * This was actually working prior to the patch, since the
     * ScriptScanner didn't use smartquote processing.
     * I'm not sure why jwilsonsprings1 said the patch worked
     * for him. I can only assume he was mistaken in thinking
     * it was the URL that caused the failure.
*/
    public void testUrlInStyle() throws ParserException {
        final String open = "<style>";
        final String rules = ".eSDot {background-image:" +
            "url(http://di.image.eshop.msn.com/img/sys/dot.gif)}";
        final String close = "</style>";
        final String page = open + rules + close;

        Parser styleParser = new Parser();
        styleParser.setInputHTML(page);
        NodeIterator nodes = styleParser.elements();

        Node first = nodes.nextNode();
        if (first != null) {
            assertStringEquals("bad parse", page, first.toHtml());
        } else {
            fail("too few nodes");
        }
        // The single node must be a style tag whose code is the rule text.
        assertTrue(first instanceof StyleTag);
        StyleTag style = (StyleTag) first;
        assertStringEquals("bad cdata", rules, style.getStyleCode());
        assertNull("too many nodes", nodes.nextNode());
    }

    /**
     * See bug #1493884 Lexer returns a TagNode with a 'null' name.
     * The input puts a DOS line ending (CR LF) directly after the
     * opening "&lt;!"; the first node must be a tag whose name is "!".
     * @exception ParserException If the parse fails.
     */
    public void testDosLineEndingInName() throws ParserException {
        final String page = "<!\r\nMSIE->";

        // 'parser' is not declared in this method, so this assignment
        // targets the member defined outside it rather than a local.
        parser = new Parser();
        parser.setInputHTML(page);
        NodeIterator nodes = parser.elements();

        Node first = nodes.nextNode();
        if (first != null) {
            assertNotNull("null node", first);
            assertTrue(first instanceof Tag);
            Tag element = (Tag) first;
            assertNotNull("null name", element.getTagName());
            assertStringEquals("bad parse", "!", element.getTagName());
        } else {
            fail("too few nodes");
        }
    }
}
上一页 1 2 3
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -