📄 parsertest.java
字号:
String path; File file; PrintWriter out; Parser parser; Node nodes[]; int i; NodeIterator enumeration; path = System.getProperty ("user.dir"); if (!path.endsWith (File.separator)) path += File.separator; file = new File (path + "delete_me.html"); try { out = new PrintWriter (new FileWriter (file)); out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"); out.println ("<html>"); out.println ("<head>"); out.println ("<title>test</title>"); out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">"); out.println ("</head>"); out.println ("<body>"); out.println ("This is a test page "); out.println ("</body>"); out.println ("</html>"); out.close (); parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET)); parser.setNodeFactory (new PrototypicalNodeFactory (true)); nodes = new Node[30]; i = 0; for (enumeration = parser.elements (); enumeration.hasMoreNodes ();) { nodes[i] = enumeration.nextNode (); i++; } assertEquals("Expected nodes",20,i); } catch (Exception e) { fail (e.toString ()); } finally { file.delete (); } } /** * Tests deleting a file held open by the parser. 
* See bug #1005409 Input file not free by parser */ public void testFileDelete () { String path; File file; PrintWriter out; Parser parser; NodeIterator enumeration; path = System.getProperty ("user.dir"); if (!path.endsWith (File.separator)) path += File.separator; file = new File (path + "delete_me.html"); try { out = new PrintWriter (new FileWriter (file)); out.println ("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"); out.println ("<html>"); out.println ("<head>"); out.println ("<title>test</title>"); out.println ("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">"); out.println ("</head>"); out.println ("<body>"); out.println ("This is a test page "); out.println ("</body>"); out.println ("</html>"); // fill our 16K buffer on read for (int i = 0; i < InputStreamSource.BUFFER_SIZE; i++) out.println (); out.close (); parser = new Parser (file.getAbsolutePath (), new DefaultParserFeedback(DefaultParserFeedback.QUIET)); parser.setNodeFactory (new PrototypicalNodeFactory (true)); enumeration = parser.elements (); enumeration.nextNode (); if (-1 != System.getProperty ("os.name").indexOf("Windows")) // linux/unix lets you delete a file even when it's open assertTrue ("file deleted with more available", !file.delete ()); // parser.getLexer ().getPage ().close (); parser = null; enumeration = null; System.gc (); System.runFinalization (); assertTrue ("file not deleted after destroy", file.delete ()); } catch (Exception e) { fail (e.toString ()); } finally { file.delete (); } } /** * Test with a HTTP header with a valid charset parameter. * Here, ibm.co.jp is an example of a HTTP server that correctly sets the * charset in the header to match the content encoding. 
*/ public void testHTTPCharset () { Parser parser; try { parser = new Parser("http://www.ibm.com/jp/", Parser.DEVNULL); assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS")); } catch (ParserException e) { fail ("could not open http://www.ibm.com/jp/"); } } /** * Test with a HTML header with a charset parameter not matching the HTTP header. * Here, www.sony.co.jp is an example of a HTTP server that does not set the * charset in the header to match the content encoding. We check that after * the enumeration is created, that the charset has changed to the correct value. */ public void testHTMLCharset () { Parser parser; NodeIterator enumeration; try { parser = new Parser("http://www.sony.co.jp", Parser.DEVNULL); assertEquals("Character set by default is ISO-8859-1", "ISO-8859-1", parser.getEncoding ()); enumeration = parser.elements(); // search for the <BODY> tag while (enumeration.hasMoreNodes ()) if (enumeration.nextNode () instanceof BodyTag) break; assertTrue("Character set should be Shift_JIS", parser.getEncoding ().equalsIgnoreCase ("Shift_JIS")); } catch (ParserException e) { fail ("could not open http://www.sony.co.jp"); } } /** * Test the case of a charset directive different than the HTTP header. * See bug #707447 META TAG - CHARSET * and bug #699886 can't parse website other than iso-8859-1 */ public void testSwitchCharset () throws ParserException { Parser parser; String url = "http://htmlparser.sourceforge.net/test/gb2312Charset.html"; int i; Node[] nodes; parser = new Parser(url); parser.setNodeFactory (new PrototypicalNodeFactory (new MetaTag ())); i = 0; nodes = new Node[30]; for (NodeIterator e = parser.elements(); e.hasMoreNodes();) nodes[i++] = e.nextNode(); assertEquals ("Expected nodes", 23, i); } /** * Test the case of a double quoted charset directive. * See bug #694477. * Technically, this format does not meet the HTTP/1.1 * specification in RFC 2068. 
* In this case I believe
* that the quotes are being improperly generated in the
* header by a server-side web application.
* Nonetheless, it would be nice to handle this case.
* Needs network access: the parser is opened on a live URL.
*/
public void testDoubleQuotedCharset () throws ParserException
{
    String url = "http://htmlparser.sourceforge.net/test/DoublequotedCharset.html";
    Parser parser = new Parser (url);

    // read the whole page before the encoding is checked
    for (NodeIterator e = parser.elements (); e.hasMoreNodes ();)
        e.nextNode ();
    // assertEquals reports the actual encoding on failure and copes with null
    assertEquals ("Wrong encoding", "UTF-8", parser.getEncoding ());
}

/**
 * Test the case of a single quoted charset directive.
 * See bug #694477.
 * Technically, this format does not meet the HTTP/1.1
 * specification in RFC 2068. In this case I believe
 * that the quotes are being improperly generated in the
 * header by a server-side web application.
 * Nonetheless, it would be nice to handle this case.
 * Needs network access: the parser is opened on a live URL.
 * @exception ParserException If the page cannot be opened or parsed.
 */
public void testSingleQuotedCharset () throws ParserException
{
    String url = "http://htmlparser.sourceforge.net/test/SinglequotedCharset.html";
    Parser parser = new Parser (url);

    // read the whole page before the encoding is checked
    for (NodeIterator e = parser.elements (); e.hasMoreNodes ();)
        e.nextNode ();
    // assertEquals reports the actual encoding on failure and copes with null
    assertEquals ("Wrong encoding", "UTF-8", parser.getEncoding ());
}

// This test is commented out because the URL no longer has a comma delimited character set.
// Reinstate when a suitable URL is discovered, or the unit tests set up their own HTTP server.
//    /**
//     * Test a bogus comma delimited charset specification in the HTTP header.
//     * See bug #722941.
//     * A comma delimited charset in the HTTP header does not meet the HTTP/1.1
//     * specification in RFC 2068.
In this case that I believe// * that some idiot has misconfigured the HTTP server, but since it's// * AOL it would be nice to handle this case.// */// public void testCommaListCharset () throws ParserException// {// URL url;// URLConnection connection;// Parser parser;// String bogus = "http://users.aol.com/geinster/rej.htm";//// try// {// url = new URL (bogus);// connection = url.openConnection ();// parser = new Parser (new Lexer (new Page (connection)));// // must be the default// assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1"));// for (NodeIterator e = parser.elements();e.hasMoreNodes();)// e.nextNode();// assertTrue ("Wrong encoding", parser.getEncoding ().equals ("windows-1252"));// }// catch (Exception e)// {// fail (e.getMessage ());// }// } public void testNullUrl() { try { new Parser("http://none.existant.url.org", Parser.DEVNULL); assertTrue("Should have thrown an exception!",false); } catch (ParserException e) { // expected outcome } } public void testURLWithSpaces() throws ParserException{ Parser parser; String url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html"; parser = new Parser(url); parser.setNodeFactory (new PrototypicalNodeFactory (true)); Node node [] = new Node[30]; int i = 0; for (NodeIterator e = parser.elements();e.hasMoreNodes();) { node[i] = e.nextNode(); i++; } assertEquals("Expected nodes",20,i); } public void testLinkCollection() throws ParserException { createParser( "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"><title>Google</title><style><!--\n"+ "body,td,a,p,.h{font-family:arial,sans-serif;} .h{font-size: 20px;} .h{color:} .q{text-decoration:none; color:#0000cc;}\n"+ "//--></style>\n"+ "<script>\n"+ "<!--\n"+ "function sf(){document.f.q.focus();}\n"+ "function c(p){var f=document.f;if (f.action) {f.action = 'http://'+p;f.submit();return false;}return true;}\n"+ "// -->\n"+ "</script>\n"+ "</head><body bgcolor=#ffffff text=#000000 link=#0000cc 
vlink=#551a8b alink=#ff0000 onLoad=sf()><center><table border=0 cellspacing=0 cellpadding=0><tr><td><img src=\"images/logo.gif\" width=276 height=110 alt=\"Google\"></td></tr></table><br>\n"+ "<table border=0 cellspacing=0 cellpadding=0>" + "<tr>" + "<td width=15> </td>" + "<td id=0 bgcolor=#3366cc align=center width=95 nowrap>" + "<font color=#ffffff size=-1><b>Web</b></font>" + "</td>" + "<td width=15> </td>" + "<td id=1 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/imghp');\" style=cursor:pointer;cursor:hand;><a id=1a class=q href=\"/imghp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/imghp');\"><font size=-1>Images</font></a></td><td width=15> </td><td id=2 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/grphp');\" style=cursor:pointer;cursor:hand;><a id=2a class=q href=\"/grphp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/grphp');\"><font size=-1>Groups</font></a></td><td width=15> </td><td id=3 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/dirhp');\" style=cursor:pointer;cursor:hand;><a id=3a class=q href=\"/dirhp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/dirhp');\"><font size=-1>Directory</font></a></td><td width=15> </td><td id=4 bgcolor=#efefef align=center width=95 nowrap onClick=\"return c('www.google.com/nwshp');\" style=cursor:pointer;cursor:hand;><a id=4a class=q href=\"/nwshp?hl=en&ie=UTF-8&oe=UTF-8\" onClick=\"return c('www.google.com/nwshp');\"><font size=-1><nobr>News-<font color=red>New!</font></nobr></font></a></td><td width=15> </td></tr><tr><td colspan=12 bgcolor=#3366cc><img width=1 height=1 alt=\"\">" + "</td>" + "</tr>" + "</table>" + "<br>" + "<form action=\"/search\" name=f>" + "<table cellspacing=0 cellpadding=0>" + "<tr>" + "<td width=75> </td>" + "<td align=center>" + "<input type=hidden name=hl value=en>" + "<input type=hidden name=ie value=\"UTF-8\">" + "<input type=hidden name=oe 
value=\"UTF-8\">" + "<input maxLength=256 size=55 name=q value=\"\"><br>" + "<input type=submit value=\"Google Search\" name=btnG>" + "<input type=submit value=\"I'm Feeling Lucky\" name=btnI>" + "</td>" + "<td valign=top nowrap>" + "<font size=-2> • <a href=/advanced_search?hl=en>Advanced Search</a>" + "<br> • <a href=/preferences?hl=en>Preferences</a>" + "<br> • <a href=/language_tools?hl=en>Language Tools</a>" + "</font>" + "</td>" + "</tr>" + "</table>" + "</form><br>\n"+ "<br><font size=-1><a href=\"/ads/\">Advertise with Us</a> - <a href=\"/services/\">Search Solutions</a> - <a href=\"/options/\">Services & Tools</a> - <a href=/about.html>Jobs, Press, & Help</a><span id=hp style=\"behavior:url(#default#homepage)\"></span>\n"+ "<script>\n"+ "if (!hp.isHomePage('http://www.google.com/')) {document.write(\"<p><a href=\"/mgyhp.html\" onClick=\"style.behavior='url(#default#homepage)';setHomePage('http://www.google.com/');\">Make Google Your Homepage!</a>\");}\n"+ "</script></font>\n"+ "<p><font size=-2>©2002 Google</font><font size=-2> - Searching 3,083,324,652 web pages</font></center></body></html>\n" ); NodeList collectionList = new NodeList();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -