📄 uurifactory.java
字号:
public static UURI getInstance(String uri, String charset) throws URIException { return UURIFactory.factory.create(uri, charset); } /** * @param base Base uri to use resolving passed relative uri. * @param relative URI as string. * @return An instance of UURI * @throws URIException */ public static UURI getInstance(UURI base, String relative) throws URIException { return UURIFactory.factory.create(base, relative); } /** * Test of whether passed String has an allowed URI scheme. * First tests if likely scheme suffix. If so, we then test if its one of * the supported schemes. * @param possibleUrl URL string to examine. * @return True if passed string looks like it could be an URL. */ public static boolean hasSupportedScheme(String possibleUrl) { boolean hasScheme = UURI.hasScheme(possibleUrl); if (!hasScheme || UURIFactory.factory.schemes == null) { return hasScheme; } String tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':')); return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0; } /** * @param uri URI as string. * @return Instance of UURI. * @throws URIException */ private UURI create(String uri) throws URIException { return create(uri, UURI.getDefaultProtocolCharset()); } /** * @param uri URI as string. * @param charset Original encoding of the string. * @return Instance of UURI. * @throws URIException */ private UURI create(String uri, String charset) throws URIException { UURI uuri = new UURI(fixup(uri, null, charset), true, charset); if (logger.isLoggable(Level.FINE)) { logger.fine("URI " + uri + " PRODUCT " + uuri.toString() + " CHARSET " + charset); } return validityCheck(uuri); } /** * @param base UURI to use as a base resolving <code>relative</code>. * @param relative Relative URI. * @return Instance of UURI. * @throws URIException */ private UURI create(UURI base, String relative) throws URIException { UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()), true, base.getProtocolCharset())); if (logger.isLoggable(Level.FINE)) { logger.fine(" URI " + relative + " PRODUCT " + uuri.toString() + " CHARSET " + base.getProtocolCharset() + " BASE " + base); } return validityCheck(uuri); } /** * Check the generated UURI. * * At the least look at length of uuri string. We were seeing case * where before escaping, string was < MAX_URL_LENGTH but after was * >. Letting out a too-big message was causing us troubles later * down the processing chain. * @param uuri Created uuri to check. * @return The passed <code>uuri</code> so can easily inline this check. * @throws URIException */ protected UURI validityCheck(UURI uuri) throws URIException { if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) { throw new URIException("Created (escaped) uuri > " + UURI.MAX_URL_LENGTH +": "+uuri.toString()); } return uuri; } /** * Do heritrix fix-up on passed uri string. * * Does heritrix escaping; usually escaping done to make our behavior align * with IEs. This method codifies our experience pulling URIs from the * wilds. Its does all the escaping we want; its output can always be * assumed to be 'escaped' (though perhaps to a laxer standard than the * vanilla HttpClient URI class or official specs might suggest). * * @param uri URI as string. * @param base May be null. * @param e True if the uri is already escaped. * @return A fixed up URI string. * @throws URIException */ private String fixup(String uri, final URI base, final String charset) throws URIException { if (uri == null) { throw new NullPointerException(); } else if (uri.length() == 0 && base == null) { throw new URIException("URI length is zero (and not relative)."); } if (uri.length() > UURI.MAX_URL_LENGTH) { // We check length here and again later after all convertions. throw new URIException("URI length > " + UURI.MAX_URL_LENGTH + ": " + uri); } // Replace nbsp with normal spaces (so that they get stripped if at // ends, or encoded if in middle) if (uri.indexOf(NBSP) >= 0) { uri = TextUtils.replaceAll(NBSP, uri, SPACE); } // Get rid of any trailing spaces or new-lines. uri = uri.trim(); // IE actually converts backslashes to slashes rather than to %5C. // Since URIs that have backslashes usually work only with IE, we will // convert backslashes to slashes as well. // TODO: Maybe we can first convert backslashes by specs and than by IE // so that we fetch both versions. if (uri.indexOf(BACKSLASH) >= 0) { uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH); } // Kill newlines etc uri = TextUtils.replaceAll(NEWLINE, uri, EMPTY_STRING); // Test for the case of more than two slashes after the http(s) scheme. // Replace with two slashes as mozilla does if found. // See [ 788219 ] URI Syntax Errors stop page parsing. Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri); if (matcher.matches()) { uri = matcher.group(1) + matcher.group(2); } // now, minimally escape any whitespace uri = escapeWhitespace(uri); // For further processing, get uri elements. See the RFC2396REGEX // comment above for explaination of group indices used in the below. matcher = RFC2396REGEX.matcher(uri); if (!matcher.matches()) { throw new URIException("Failed parse of " + uri); } String uriScheme = checkUriElementAndLowerCase(matcher.group(2)); String uriSchemeSpecificPart = checkUriElement(matcher.group(3)); String uriAuthority = checkUriElement(matcher.group(5)); String uriPath = checkUriElement(matcher.group(6)); String uriQuery = checkUriElement(matcher.group(8)); // UNUSED String uriFragment = checkUriElement(matcher.group(10)); // If a scheme, is it a supported scheme? if (uriScheme != null && uriScheme.length() > 0 && this.schemes != null) { if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) { // unsupported; see if silently ignored if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) { throw new URIException( IGNORED_SCHEME, "Ignored scheme: " + uriScheme); } else { throw new URIException("Unsupported scheme: " + uriScheme); } } } // Test if relative URI. If so, need a base to resolve against. if (uriScheme == null || uriScheme.length() <= 0) { if (base == null) { throw new URIException("Relative URI but no base: " + uri); } } // fixup authority portion: lowercase/IDN-punycode any domain; // remove stray trailing spaces uriAuthority = fixupAuthority(uriAuthority); // Do some checks if absolute path. if (uriSchemeSpecificPart != null && uriSchemeSpecificPart.startsWith(SLASH)) { if (uriPath != null) { // Eliminate '..' if its first thing in the path. IE does this. uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath, SLASH); } // Ensure root URLs end with '/': browsers always send "/" // on the request-line, so we should consider "http://host" // to be "http://host/". if (uriPath == null || EMPTY_STRING.equals(uriPath)) { uriPath = SLASH; } } if (uriAuthority != null) { if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTP)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTP_PORT); } else if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTPS)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTPS_PORT); } // Strip any prefix dot or tail dots from the authority. uriAuthority = stripTail(uriAuthority, DOT); uriAuthority = stripPrefix(uriAuthority, DOT); } else { // no authority; may be relative. consider stripping scheme // to work-around org.apache.commons.httpclient.URI bug // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 ) if (uriScheme != null && base != null && uriScheme.equals(base.getScheme())) { // uriScheme redundant and will only confound httpclient.URI uriScheme = null; } } // Ensure minimal escaping. Use of 'lax' URI and URLCodec // means minimal escaping isn't necessarily complete/consistent. // There is a chance such lax encoding will throw exceptions // later at inconvenient times. // // One reason for these bad escapings -- though not the only -- // is that the page is using an encoding other than the ASCII or the // UTF-8 that is our default URI encoding. In this case the parent // class is burping on the passed URL encoding. If the page encoding // was passed into this factory, the encoding seems to be parsed // correctly (See the testEscapedEncoding unit test). // // This fixup may cause us to miss content. There is the charset case // noted above. TODO: Look out for cases where we fail other than for // the above given reason which will be fixed when we address // '[ 913687 ] Make extractors interrogate for charset'. uriPath = ensureMinimalEscaping(uriPath, charset); uriQuery = ensureMinimalEscaping(uriQuery, charset, LaxURLCodec.QUERY_SAFE); // Preallocate. The '1's and '2's in below are space for ':', // '//', etc. URI characters. MutableString s = new MutableString( ((uriScheme != null)? uriScheme.length(): 0) + 1 // ';' + ((uriAuthority != null)? uriAuthority.length(): 0) + 2 // '//' + ((uriPath != null)? uriPath.length(): 0) + 1 // '?' + ((uriQuery != null)? uriQuery.length(): 0)); appendNonNull(s, uriScheme, ":", true); appendNonNull(s, uriAuthority, "//", false); appendNonNull(s, uriPath, "", false); appendNonNull(s, uriQuery, "?", false); return s.toString(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -