⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uurifactorytest.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/* UURIFactoryTest * * $Id: UURIFactoryTest.java,v 1.12 2006/07/18 00:40:16 gojomo Exp $ * * Created on Apr 2, 2004 * * Copyright (C) 2004 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.net;import java.util.Iterator;import java.util.TreeMap;import junit.framework.TestCase;import org.apache.commons.httpclient.URIException;/** * Test UURIFactory for proper UURI creation across variety of * important/tricky cases. *  * Be careful writing this file.  Make sure you write it with UTF-8 encoding. * * @author igor stack gojomo */public class UURIFactoryTest extends TestCase {		public final void testEscaping() throws URIException {		// Note: single quote is not being escaped by URI class.		final String ESCAPED_URISTR = "http://archive.org/" +		    UURIFactory.ESCAPED_SPACE +			UURIFactory.ESCAPED_SPACE +			UURIFactory.ESCAPED_CIRCUMFLEX +			UURIFactory.ESCAPED_QUOT +			UURIFactory.SQUOT +			UURIFactory.ESCAPED_APOSTROPH +			UURIFactory.ESCAPED_LSQRBRACKET +			UURIFactory.ESCAPED_RSQRBRACKET +			UURIFactory.ESCAPED_LCURBRACKET +			UURIFactory.ESCAPED_RCURBRACKET +			UURIFactory.SLASH + "a.gif"; // NBSP and SPACE should be trimmed;				final String URISTR = "http://archive.org/.././" + "\u00A0" +		    UURIFactory.SPACE + UURIFactory.CIRCUMFLEX +			UURIFactory.QUOT + UURIFactory.SQUOT +			UURIFactory.APOSTROPH + UURIFactory.LSQRBRACKET +			UURIFactory.RSQRBRACKET + UURIFactory.LCURBRACKET +			UURIFactory.RCURBRACKET + UURIFactory.BACKSLASH +			"test/../a.gif" + "\u00A0" + UURIFactory.SPACE;				UURI uuri = UURIFactory.getInstance(URISTR);		final String uuriStr = uuri.toString();		assertEquals("expected escaping", ESCAPED_URISTR, uuriStr);	}    public final void testUnderscoreMakesPortParseFail() throws URIException {        UURI uuri = UURIFactory.getInstance("http://one-two_three:8080/index.html");        int port = uuri.getPort();        assertTrue("Failed find of port " + uuri, port == 8080);    }        public final void testRelativeURIWithTwoSlashes() throws URIException {        UURI base = UURIFactory.getInstance("http://www.archive.org");        UURI uuri = UURIFactory.getInstance(base, "one//index.html");        assertTrue("Doesn't do right thing with two slashes " + uuri,            uuri.toString().equals(                "http://www.archive.org/one//index.html"));    }        public final void testTrailingEncodedSpace() throws URIException {        UURI uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20");        assertTrue("Doesn't strip trailing encoded space 1 " + uuri,            uuri.toString().equals("http://www.nps-shoes.co.uk/"));        uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20");        assertTrue("Doesn't strip trailing encoded space 2 " + uuri,            uuri.toString().equals("http://www.nps-shoes.co.uk/"));    }        public final void testPort0080is80() throws URIException {        UURI uuri = UURIFactory.getInstance("http://archive.org:0080");        assertTrue("Doesn't strip leading zeros " + uuri,            uuri.toString().equals("http://archive.org/"));    }    // DISABLING TEST AS PRECURSOR TO ELIMINATION// the problematic input given -- specifically the "%6s" incomplete uri-escape,// shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least, // will  attempt to fetch such an URL (getting, in this case against that ad // server, a bad-request error). Ideally, we'd generate exactly the same // request against the server as they do. However, with the most recent // fixup for stray '%' signs, we come close, but not exactly. That's enough// to cause this test to fail (it's not getting the expected exception) but// our almost-URI, which might be what was intended, is better than trying // nothing.//    public final void testBadPath() {//        String message = null;//        try {//            UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +//                "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +//                "generic&Params.richmedia=yes%26city%3Dseattle%26" +//                "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +//                "%6state%3DWA");//        } catch (URIException e) {//            message = e.getMessage();//        }//        assertNotNull("Didn't get expected exception.", message);//    }           public final void testEscapeEncoding() throws URIException {        UURI uuri = UURIFactory.getInstance("http://www.y1y1.com/" +            "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg", "windows-1256");        uuri.getPath();    }           public final void testTooLongAfterEscaping() {        StringBuffer buffer = new StringBuffer("http://www.archive.org/a/");        // Append bunch of spaces.  When escaped, they'll triple in size.        for (int i = 0; i < 1024; i++) {        	buffer.append(" ");        }        buffer.append("/index.html");        String message = null;        try {        	UURIFactory.getInstance(buffer.toString());        } catch (URIException e) {            message = e.getMessage();        }        assertTrue("Wrong or no exception: " + message, (message != null) &&            message.startsWith("Created (escaped) uuri >"));    }		public final void testFtpUris() throws URIException {		final String FTP = "ftp";		final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn";		final String PATH = "/clzreceive/";		final String uri = FTP + "://" + AUTHORITY + PATH;		UURI uuri = UURIFactory.getInstance(uri);		assertTrue("Failed to get matching scheme: " + uuri.getScheme(),				(uuri.getScheme()).equals(FTP));		assertTrue("Failed to get matching authority: " +				uuri.getAuthority(), (uuri.getAuthority()).equals(AUTHORITY));		assertTrue("Failed to get matching path: " +				uuri.getPath(), (uuri.getPath()).equals(PATH));       	}        public final void testWhitespaceEscaped() throws URIException {        // Test that we get all whitespace even if the uri is        // already escaped.        String uri = "http://archive.org/index%25 .html";        String tgtUri = "http://archive.org/index%25%20.html";        UURI uuri = UURIFactory.getInstance(uri);        assertTrue("Not equal " + uuri.toString(),                uuri.toString().equals(tgtUri));        uri = "http://archive.org/index%25\t.html";        tgtUri = "http://archive.org/index%25%09.html";        uuri = UURIFactory.getInstance(uri);        assertEquals("whitespace escaping", tgtUri, uuri.toString());               uri = "http://archive.org/index%25\u001D.html";        tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();        uuri = UURIFactory.getInstance(uri);        assertEquals("whitespace escaping", tgtUri, uuri.toString());        uri = "http://gemini.info.usaid.gov/directory/" +            "pbResults.cfm?&urlNameLast=Adamson";        tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" +            "name=Charisse%20+Adamson,&location=RRB%20%20%20%205%2E08%2D006";        uuri = UURIFactory.getInstance(UURIFactory.getInstance(uri),            "faxResults.cfm?name=Charisse +Adamson,&location=" +            "RRB%20%20%20%205%2E08%2D006");        assertEquals("whitespace escaping", tgtUri, uuri.toString());    }    //	public final void testFailedGetPath() throws URIException {//		final String path = "/RealMedia/ads/" +//		"click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";//        // decoding in getPath will interpret %CA as 8-bit escaped char,//        // possibly incomplete//		final String uri = "http://ads.nandomedia.com" + path;//		final UURI uuri = UURIFactory.getInstance(uri);//		String foundPath = uuri.getPath();//		assertEquals("unexpected path", path, foundPath);//	}        public final void testDnsHost() throws URIException {        String uri = "dns://ads.nandomedia.com:81/one.html";        UURI uuri = UURIFactory.getInstance(uri);        String host = uuri.getReferencedHost();        assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));        uri = "dns:ads.nandomedia.com";        uuri = UURIFactory.getInstance(uri);        host = uuri.getReferencedHost();        assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));        uri = "dns:ads.nandomedia.com?a=b";        uuri = UURIFactory.getInstance(uri);        host = uuri.getReferencedHost();        assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));    }		public final void testPercentEscaping() throws URIException {		final String uri = "http://archive.org/%a%%%%%.html";        // tests indicate firefox (1.0.6) does not encode '%' at all        final String tgtUri = "http://archive.org/%a%%%%%.html";		UURI uuri = UURIFactory.getInstance(uri);		assertEquals("Not equal",tgtUri, uuri.toString());	}    	public final void testRelativeDblPathSlashes() throws URIException {		UURI base = UURIFactory.getInstance("http://www.archive.org/index.html");		UURI uuri = UURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM");        assertTrue("Double slash not working " + uuri.toString(),                uuri.getPath().equals("/JIGOU//KYC//INDEX.HTM"));	}        public final void testRelativeWithScheme() throws URIException {        UURI base = UURIFactory.getInstance("http://www.example.com/some/page");        UURI uuri = UURIFactory.getInstance(base, "http:boo");        assertTrue("Relative with scheme not working " + uuri.toString(),                uuri.toString().equals("http://www.example.com/some/boo"));    }        public final void testBadBaseResolve() throws URIException {        UURI base = UURIFactory.getInstance("http://license.joins.com/board/" +            "etc_board_list.asp?board_name=new_main&b_type=&nPage=" +            "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" +            "notice&gate=02");        UURIFactory.getInstance(base, "http://www.changeup.com/...</a");    }        public final void testTilde() throws URIException {        noChangeExpected("http://license.joins.com/~igor");    }        public final void testCurlies() throws URIException {        // Firefox allows curlies in the query string portion of a URL only        // (converts curlies if they are in the path portion ahead of the        // query string).        UURI uuri =            noChangeExpected("http://license.joins.com/igor?one={curly}");        assertEquals(uuri.getQuery(), "one={curly}");        assertEquals(UURIFactory.                getInstance("http://license.joins.com/igor{curly}.html").                    toString(),            "http://license.joins.com/igor%7Bcurly%7D.html");        boolean exception = false;        try {            UURIFactory.getInstance("http://license.{curly}.com/igor.html");        } catch (URIException u) {            exception = true;        }        assertTrue("Did not get exception.", exception);    }        protected UURI noChangeExpected(final String original)    throws URIException {        UURI uuri = UURIFactory.getInstance(original);        assertEquals(original, uuri.toString());        return uuri;    }    	public final void testTrimSpaceNBSP() throws URIException {		final String uri = "   http://archive.org/DIR WITH SPACES/" +		UURIFactory.NBSP + "home.html    " + UURIFactory.NBSP + "   ";		final String tgtUri =			"http://archive.org/DIR%20WITH%20SPACES/%20home.html";		UURI uuri = UURIFactory.getInstance(uri);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -