📄 parsertest.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
        String path;        File file;        PrintWriter out;        Parser parser;        Node nodes[];        int i;        NodeIterator enumeration;        path = System.getProperty ("user.dir");        if (!path.endsWith (File.separator))            path += File.separator;        file = new File (path + "delete_me.html");        try        {            out = new PrintWriter (new FileWriter (file));            out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");            out.println ("<html>");            out.println ("<head>");            out.println ("<title>test</title>");            out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");            out.println ("</head>");            out.println ("<body>");            out.println ("This is a test page ");            out.println ("</body>");            out.println ("</html>");            out.close ();            parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET));            parser.setNodeFactory (new PrototypicalNodeFactory (true));            nodes = new Node[30];            i = 0;            for (enumeration = parser.elements (); enumeration.hasMoreNodes ();)            {                nodes[i] = enumeration.nextNode ();                i++;            }            assertEquals("Expected nodes",20,i);        }        catch (Exception e)        {            fail (e.toString ());        }        finally        {            file.delete ();        }    }    /**     * Tests deleting a file held open by the parser.     * See bug #1005409 Input file not free by parser     */    public void testFileDelete ()    {        String path;        File file;        PrintWriter out;        Parser parser;        NodeIterator enumeration;        path = System.getProperty ("user.dir");        if (!path.endsWith (File.separator))            path += File.separator;        file = new File (path + "delete_me.html");        try        {            out = new PrintWriter (new FileWriter (file));            out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");            out.println ("<html>");            out.println ("<head>");            out.println ("<title>test</title>");            out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");            out.println ("</head>");            out.println ("<body>");            out.println ("This is a test page ");            out.println ("</body>");            out.println ("</html>");            // fill our 16K buffer on read            for (int i = 0; i < InputStreamSource.BUFFER_SIZE; i++)                out.println ();            out.close ();            parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET));            parser.setNodeFactory (new PrototypicalNodeFactory (true));            enumeration = parser.elements ();            enumeration.nextNode ();            if (-1 != System.getProperty ("os.name").indexOf("Windows"))                // linux/unix lets you delete a file even when it's open                assertTrue ("file deleted with more available", !file.delete ());            // parser.getLexer ().getPage ().close ();            parser = null;            enumeration = null;            System.gc ();            System.runFinalization ();            assertTrue ("file not deleted after destroy", file.delete ());        }        catch (Exception e)        {            fail (e.toString ());        }        finally        {            file.delete ();        }    }    /**     * Test with a HTTP header with a valid charset parameter.     * Here, ibm.co.jp is an example of a HTTP server that correctly sets the     * charset in the header to match the content encoding.     */    public void testHTTPCharset ()    {        Parser parser;        try        {            parser = new Parser("http://www.ibm.com/jp/", Parser.DEVNULL);            assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS"));        }        catch (ParserException e)        {            fail ("could not open http://www.ibm.com/jp/");        }    }    /**     * Test with a HTML header with a charset parameter not matching the HTTP header.     * Here, www.sony.co.jp is an example of a HTTP server that does not set the     * charset in the header to match the content encoding. We check that after     * the enumeration is created, that the charset has changed to the correct value.     */    public void testHTMLCharset ()    {        Parser parser;        NodeIterator enumeration;        try        {            parser = new Parser("http://www.sony.co.jp", Parser.DEVNULL);            assertEquals("Character set by default is ISO-8859-1", "ISO-8859-1", parser.getEncoding ());            enumeration = parser.elements();            // search for the <BODY> tag            while (enumeration.hasMoreNodes ())                if (enumeration.nextNode () instanceof BodyTag)                    break;            assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS"));        }        catch (ParserException e)        {            fail ("could not open http://www.sony.co.jp");        }    }    /**     * Test the case of a charset directive different than the HTTP header.     * See bug #707447 META TAG - CHARSET     * and bug #699886 can't parse website other than iso-8859-1     */    public void testSwitchCharset () throws ParserException    {        Parser parser;        String url = "http://htmlparser.sourceforge.net/test/gb2312Charset.html";        int i;        Node[] nodes;        parser = new Parser(url);        parser.setNodeFactory (new PrototypicalNodeFactory (new MetaTag ()));        i = 0;        nodes = new Node[30];        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)            nodes[i++] = e.nextNode();        assertEquals ("Expected nodes", 23, i);    }    /**     * Test the case of a double quoted charset directive.     * See bug #694477.     * Technically, this format does not meet the HTTP/1.1     * specification in RFC 2068. In this case that I believe     * that the quotes are being inproperly generated in the     * header by a server-side web application.     * Nonetheless, it would be nice to handle this case.     */    public void testDoubleQuotedCharset () throws ParserException    {        Parser parser;        String url = "http://htmlparser.sourceforge.net/test/DoublequotedCharset.html";        parser = new Parser(url);        for (NodeIterator e = parser.elements();e.hasMoreNodes();)            e.nextNode();        assertTrue ("Wrong encoding", parser.getEncoding ().equals ("UTF-8"));    }    /**     * Test the case of a single quoted charset directive.     * See bug #694477.     * Technically, this format does not meet the HTTP/1.1     * specification in RFC 2068. In this case that I believe     * that the quotes are being inproperly generated in the     * header by a server-side web application.     * Nonetheless, it would be nice to handle this case.     */    public void testSingleQuotedCharset () throws ParserException    {        Parser parser;        String url = "http://htmlparser.sourceforge.net/test/SinglequotedCharset.html";        parser = new Parser(url);        for (NodeIterator e = parser.elements();e.hasMoreNodes();)            e.nextNode();        assertTrue ("Wrong encoding", parser.getEncoding ().equals ("UTF-8"));    }    // This test is commented out because the URL no longer has a comma delimited character set.    // Reinstate when a suitable URL is discovered, or the unit tests set up their own HTTP server.//    /**//     * Test a bogus comma delimited charset specification in the HTTP header.//     * See bug #722941.//     * A comma delimted charset in the HTTP header does not meet the HTTP/1.1//     * specification in RFC 2068. In this case that I believe//     * that some idiot has misconfigured the HTTP server, but since it's//     * AOL it would be nice to handle this case.//     *///    public void testCommaListCharset () throws ParserException//    {//        URL url;//        URLConnection connection;//        Parser parser;//        String bogus = "http://users.aol.com/geinster/rej.htm";////        try//        {//            url = new URL (bogus);//            connection = url.openConnection ();//            parser = new Parser (new Lexer (new Page (connection)));//            // must be the default//            assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1"));//            for (NodeIterator e = parser.elements();e.hasMoreNodes();)//                e.nextNode();//            assertTrue ("Wrong encoding", parser.getEncoding ().equals ("windows-1252"));//        }//        catch (Exception e)//        {//            fail (e.getMessage ());//        }//    }    public void testNullUrl() {        try        {            new Parser("http://none.existant.url.org", Parser.DEVNULL);            assertTrue("Should have thrown an exception!",false);        }        catch (ParserException e)        {            // expected outcome        }    }    public void testURLWithSpaces() throws ParserException{        Parser parser;        String url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html";        parser = new Parser(url);        parser.setNodeFactory (new PrototypicalNodeFactory (true));        Node node [] = new Node[30];        int i = 0;        for (NodeIterator e = parser.elements();e.hasMoreNodes();) {            node[i] = e.nextNode();            i++;        }        assertEquals("Expected nodes",20,i);    }    public void testLinkCollection() throws ParserException {        createParser(        "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"><title>Google</title><style><!--\n"+        "body,td,a,p,.h{font-family:arial,sans-serif;} .h{font-size: 20px;} .h{color:} .q{text-decoration:none; color:#0000cc;}\n"+        "//--></style>\n"+        "<script>\n"+        "<!--\n"+        "function sf(){document.f.q.focus();}\n"+        "function c(p){var f=document.f;if (f.action) {f.action = 'http://'+p;f.submit();return false;}return true;}\n"+        "// -->\n"+        "</script>\n"+        "</head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onLoad=sf()><center><table border=0 cellspacing=0 cellpadding=0><tr><td><img src=\"images/logo.gif\" width=276 height=110 alt=\"Google\"></td></tr></table><br>\n"+        "<table border=0 cellspacing=0 cellpadding=0>" +            "<tr>" +            "<td width=15>&nbsp;</td>" +            "<td id=0 bgcolor=#3366cc align=center width=95 nowrap>" +                "<font color=#ffffff size=-1><b>Web</b></font>" +            "</td>" +            "<td width=15>&nbsp;</td>" +            "<td id=1 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/imghp');\" style=cursor:pointer;cursor:hand;><a id=1a class=q href=\"/imghp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/imghp');\"><font size=-1>Images</font></a></td><td width=15>&nbsp;</td><td id=2 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/grphp');\" style=cursor:pointer;cursor:hand;><a id=2a class=q href=\"/grphp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/grphp');\"><font size=-1>Groups</font></a></td><td width=15>&nbsp;</td><td id=3 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/dirhp');\" style=cursor:pointer;cursor:hand;><a id=3a class=q href=\"/dirhp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/dirhp');\"><font size=-1>Directory</font></a></td><td width=15>&nbsp;</td><td id=4 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/nwshp');\" style=cursor:pointer;cursor:hand;><a id=4a class=q href=\"/nwshp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/nwshp');\"><font size=-1><nobr>News-<font  color=red>New!</font></nobr></font></a></td><td width=15>&nbsp;</td></tr><tr><td colspan=12 bgcolor=#3366cc><img width=1 height=1 alt=\"\">" +            "</td>" +            "</tr>" +        "</table>" +        "<br>" +        "<form action=\"/search\" name=f>" +            "<table cellspacing=0 cellpadding=0>" +            "<tr>" +                "<td width=75>&nbsp;</td>" +                "<td align=center>" +                    "<input type=hidden name=hl value=en>" +                    "<input type=hidden name=ie value=\"UTF-8\">" +                    "<input type=hidden name=oe value=\"UTF-8\">" +                    "<input maxLength=256 size=55 name=q value=\"\"><br>" +                    "<input type=submit value=\"Google Search\" name=btnG>" +                    "<input type=submit value=\"I'm Feeling Lucky\" name=btnI>" +                "</td>" +                "<td valign=top nowrap>" +                    "<font size=-2>&nbsp;&#8226;&nbsp;<a href=/advanced_search?hl=en>Advanced&nbsp;Search</a>" +                    "<br>&nbsp;&#8226;&nbsp;<a href=/preferences?hl=en>Preferences</a>" +                    "<br>&nbsp;&#8226;&nbsp;<a href=/language_tools?hl=en>Language Tools</a>" +                    "</font>" +                "</td>" +            "</tr>" +            "</table>" +        "</form><br>\n"+        "<br><font size=-1><a href=\"/ads/\">Advertise&nbsp;with&nbsp;Us</a> - <a href=\"/services/\">Search&nbsp;Solutions</a> - <a href=\"/options/\">Services&nbsp;&amp;&nbsp;Tools</a> - <a href=/about.html>Jobs,&nbsp;Press,&nbsp;&amp;&nbsp;Help</a><span id=hp style=\"behavior:url(#default#homepage)\"></span>\n"+        "<script>\n"+        "if (!hp.isHomePage('http://www.google.com/')) {document.write(\"<p><a href=\"/mgyhp.html\" onClick=\"style.behavior='url(#default#homepage)';setHomePage('http://www.google.com/');\">Make Google Your Homepage!</a>\");}\n"+        "</script></font>\n"+        "<p><font size=-2>&copy;2002 Google</font><font size=-2> - Searching 3,083,324,652 web pages</font></center></body></html>\n"        );        NodeList collectionList = new NodeList();
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -