📄 uurifactorytest.java
字号:
/* UURIFactoryTest * * $Id: UURIFactoryTest.java 5106 2007-05-01 00:07:29Z gojomo $ * * Created on Apr 2, 2004 * * Copyright (C) 2004 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.net;import java.util.Iterator;import java.util.TreeMap;import junit.framework.TestCase;import org.apache.commons.httpclient.URIException;/** * Test UURIFactory for proper UURI creation across variety of * important/tricky cases. * * Be careful writing this file. Make sure you write it with UTF-8 encoding. * * @author igor stack gojomo */public class UURIFactoryTest extends TestCase { public final void testEscaping() throws URIException { // Note: single quote is not being escaped by URI class. final String ESCAPED_URISTR = "http://archive.org/" + UURIFactory.ESCAPED_SPACE + UURIFactory.ESCAPED_SPACE + UURIFactory.ESCAPED_CIRCUMFLEX + UURIFactory.ESCAPED_QUOT + UURIFactory.SQUOT + UURIFactory.ESCAPED_APOSTROPH + UURIFactory.ESCAPED_LSQRBRACKET + UURIFactory.ESCAPED_RSQRBRACKET + UURIFactory.ESCAPED_LCURBRACKET + UURIFactory.ESCAPED_RCURBRACKET + UURIFactory.SLASH + "a.gif"; // NBSP and SPACE should be trimmed; final String URISTR = "http://archive.org/.././" + "\u00A0" + UURIFactory.SPACE + UURIFactory.CIRCUMFLEX + UURIFactory.QUOT + UURIFactory.SQUOT + UURIFactory.APOSTROPH + UURIFactory.LSQRBRACKET + UURIFactory.RSQRBRACKET + UURIFactory.LCURBRACKET + UURIFactory.RCURBRACKET + UURIFactory.BACKSLASH + "test/../a.gif" + "\u00A0" + UURIFactory.SPACE; UURI uuri = UURIFactory.getInstance(URISTR); final String uuriStr = uuri.toString(); assertEquals("expected escaping", ESCAPED_URISTR, uuriStr); } public final void testUnderscoreMakesPortParseFail() throws URIException { UURI uuri = UURIFactory.getInstance("http://one-two_three:8080/index.html"); int port = uuri.getPort(); assertTrue("Failed find of port " + uuri, port == 8080); } public final void testRelativeURIWithTwoSlashes() throws URIException { UURI base = UURIFactory.getInstance("http://www.archive.org"); UURI uuri = UURIFactory.getInstance(base, "one//index.html"); assertTrue("Doesn't do right thing with two slashes " + uuri, uuri.toString().equals( "http://www.archive.org/one//index.html")); } public final void testTrailingEncodedSpace() throws URIException { UURI uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20"); assertTrue("Doesn't strip trailing encoded space 1 " + uuri, uuri.toString().equals("http://www.nps-shoes.co.uk/")); uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20"); assertTrue("Doesn't strip trailing encoded space 2 " + uuri, uuri.toString().equals("http://www.nps-shoes.co.uk/")); } public final void testPort0080is80() throws URIException { UURI uuri = UURIFactory.getInstance("http://archive.org:0080"); assertTrue("Doesn't strip leading zeros " + uuri, uuri.toString().equals("http://archive.org/")); } // DISABLING TEST AS PRECURSOR TO ELIMINATION// the problematic input given -- specifically the "%6s" incomplete uri-escape,// shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least, // will attempt to fetch such an URL (getting, in this case against that ad // server, a bad-request error). Ideally, we'd generate exactly the same // request against the server as they do. However, with the most recent // fixup for stray '%' signs, we come close, but not exactly. That's enough// to cause this test to fail (it's not getting the expected exception) but// our almost-URI, which might be what was intended, is better than trying // nothing.// public final void testBadPath() {// String message = null;// try {// UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +// "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +// "generic&Params.richmedia=yes%26city%3Dseattle%26" +// "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +// "%6state%3DWA");// } catch (URIException e) {// message = e.getMessage();// }// assertNotNull("Didn't get expected exception.", message);// } public final void testEscapeEncoding() throws URIException { UURI uuri = UURIFactory.getInstance("http://www.y1y1.com/" + "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg", "windows-1256"); uuri.getPath(); } public final void testTooLongAfterEscaping() { StringBuffer buffer = new StringBuffer("http://www.archive.org/a/"); // Append bunch of spaces. When escaped, they'll triple in size. for (int i = 0; i < 1024; i++) { buffer.append(" "); } buffer.append("/index.html"); String message = null; try { UURIFactory.getInstance(buffer.toString()); } catch (URIException e) { message = e.getMessage(); } assertTrue("Wrong or no exception: " + message, (message != null) && message.startsWith("Created (escaped) uuri >")); } public final void testFtpUris() throws URIException { final String FTP = "ftp"; final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn"; final String PATH = "/clzreceive/"; final String uri = FTP + "://" + AUTHORITY + PATH; UURI uuri = UURIFactory.getInstance(uri); assertTrue("Failed to get matching scheme: " + uuri.getScheme(), (uuri.getScheme()).equals(FTP)); assertTrue("Failed to get matching authority: " + uuri.getAuthority(), (uuri.getAuthority()).equals(AUTHORITY)); assertTrue("Failed to get matching path: " + uuri.getPath(), (uuri.getPath()).equals(PATH)); } public final void testWhitespaceEscaped() throws URIException { // Test that we get all whitespace even if the uri is // already escaped. String uri = "http://archive.org/index%25 .html"; String tgtUri = "http://archive.org/index%25%20.html"; UURI uuri = UURIFactory.getInstance(uri); assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); uri = "http://archive.org/index%25\u001D.html"; tgtUri = "http://archive.org/index%25%1D.html".toLowerCase(); uuri = UURIFactory.getInstance(uri); assertEquals("whitespace escaping", tgtUri, uuri.toString()); uri = "http://gemini.info.usaid.gov/directory/" + "pbResults.cfm?&urlNameLast=Rumplestiltskin"; tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" + "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006"; uuri = UURIFactory.getInstance(UURIFactory.getInstance(uri), "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" + "RRB%20%20%20%205%2E08%2D006"); assertEquals("whitespace escaping", tgtUri, uuri.toString()); } // public final void testFailedGetPath() throws URIException {// final String path = "/RealMedia/ads/" +// "click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";// // decoding in getPath will interpret %CA as 8-bit escaped char,// // possibly incomplete// final String uri = "http://ads.nandomedia.com" + path;// final UURI uuri = UURIFactory.getInstance(uri);// String foundPath = uuri.getPath();// assertEquals("unexpected path", path, foundPath);// } public final void testDnsHost() throws URIException { String uri = "dns://ads.nandomedia.com:81/one.html"; UURI uuri = UURIFactory.getInstance(uri); String host = uuri.getReferencedHost(); assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com")); uri = "dns:ads.nandomedia.com"; uuri = UURIFactory.getInstance(uri); host = uuri.getReferencedHost(); assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com")); uri = "dns:ads.nandomedia.com?a=b"; uuri = UURIFactory.getInstance(uri); host = uuri.getReferencedHost(); assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com")); } public final void testPercentEscaping() throws URIException { final String uri = "http://archive.org/%a%%%%%.html"; // tests indicate firefox (1.0.6) does not encode '%' at all final String tgtUri = "http://archive.org/%a%%%%%.html"; UURI uuri = UURIFactory.getInstance(uri); assertEquals("Not equal",tgtUri, uuri.toString()); } public final void testRelativeDblPathSlashes() throws URIException { UURI base = UURIFactory.getInstance("http://www.archive.org/index.html"); UURI uuri = UURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM"); assertTrue("Double slash not working " + uuri.toString(), uuri.getPath().equals("/JIGOU//KYC//INDEX.HTM")); } public final void testRelativeWithScheme() throws URIException { UURI base = UURIFactory.getInstance("http://www.example.com/some/page"); UURI uuri = UURIFactory.getInstance(base, "http:boo"); assertTrue("Relative with scheme not working " + uuri.toString(), uuri.toString().equals("http://www.example.com/some/boo")); } public final void testBadBaseResolve() throws URIException { UURI base = UURIFactory.getInstance("http://license.joins.com/board/" + "etc_board_list.asp?board_name=new_main&b_type=&nPage=" + "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" + "notice&gate=02"); UURIFactory.getInstance(base, "http://www.changeup.com/...</a"); } public final void testTilde() throws URIException { noChangeExpected("http://license.joins.com/~igor"); } public final void testCurlies() throws URIException { // Firefox allows curlies in the query string portion of a URL only // (converts curlies if they are in the path portion ahead of the // query string). UURI uuri = noChangeExpected("http://license.joins.com/igor?one={curly}"); assertEquals(uuri.getQuery(), "one={curly}"); assertEquals(UURIFactory. getInstance("http://license.joins.com/igor{curly}.html"). toString(), "http://license.joins.com/igor%7Bcurly%7D.html"); boolean exception = false; try { UURIFactory.getInstance("http://license.{curly}.com/igor.html"); } catch (URIException u) { exception = true; } assertTrue("Did not get exception.", exception); } protected UURI noChangeExpected(final String original) throws URIException { UURI uuri = UURIFactory.getInstance(original); assertEquals(original, uuri.toString()); return uuri; } public final void testTrimSpaceNBSP() throws URIException { final String uri = " http://archive.org/DIR WITH SPACES/" + UURIFactory.NBSP + "home.html " + UURIFactory.NBSP + " "; final String tgtUri = "http://archive.org/DIR%20WITH%20SPACES/%20home.html"; UURI uuri = UURIFactory.getInstance(uri); assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); } /** * Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them). * See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>. * @throws URIException */ public final void testSpaceDoubleEncoding() throws URIException { final String uri = "http://www.brook.edu/i.html? %20taxonomy=Politics"; final String encodedUri = "http://www.brook.edu/i.html?%20%20taxonomy=Politics"; UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1"); assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(encodedUri)); } /** * Test for doubly-encoded sequences. * See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>. * @throws URIException */ public final void testDoubleEncoding() throws URIException { final char ae = '\u00E6'; final String uri = "http://archive.org/DIR WITH SPACES/home" + ae + ".html"; final String encodedUri = "http://archive.org/DIR%20WITH%20SPACES/home%E6.html"; UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1"); assertEquals("single encoding", encodedUri, uuri.toString()); // Dbl-encodes. uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1"); uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1"); assertEquals("double encoding", encodedUri, uuri.toString()); // Do default utf-8 test. uuri = UURIFactory.getInstance(uri); final String encodedUtf8Uri = "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html"; assertEquals("Not equal utf8", encodedUtf8Uri, uuri.toString()); // Now dbl-encode.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -