📄 uurifactory.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
    /**
     * Builds a UURI from the passed string by delegating to the shared
     * factory instance.
     *
     * @param uri URI as string.
     * @param charset Original encoding of the string.
     * @return An instance of UURI
     * @throws URIException
     */
    public static UURI getInstance(String uri, String charset)
            throws URIException {
        return UURIFactory.factory.create(uri, charset);
    }

    /**
     * Builds a UURI from a relative reference by delegating to the shared
     * factory instance.
     *
     * @param base Base uri to use resolving passed relative uri.
     * @param relative URI as string.
     * @return An instance of UURI
     * @throws URIException
     */
    public static UURI getInstance(UURI base, String relative)
            throws URIException {
        return UURIFactory.factory.create(base, relative);
    }

    /**
     * Tests whether the passed String carries an allowed URI scheme.
     *
     * The string must first look like it has a scheme at all.  When the
     * factory has no configured scheme list, any scheme is accepted;
     * otherwise the text before the first ':' is looked up in that list.
     *
     * NOTE(review): the lookup is case-sensitive on the raw prefix, whereas
     * fixup lower-cases the scheme before its own lookup -- confirm whether
     * the difference is intended.
     *
     * @param possibleUrl URL string to examine.
     * @return True if passed string looks like it could be an URL.
     */
    public static boolean hasSupportedScheme(String possibleUrl) {
        if (!UURI.hasScheme(possibleUrl)) {
            return false;
        }
        if (UURIFactory.factory.schemes == null) {
            // No restriction configured: having a scheme is enough.
            return true;
        }
        final int colonIndex = possibleUrl.indexOf(':');
        final String candidateScheme = possibleUrl.substring(0, colonIndex);
        return Arrays.binarySearch(UURIFactory.factory.schemes,
            candidateScheme) >= 0;
    }

    /**
     * Creates a UURI using the default protocol charset.
     *
     * @param uri URI as string.
     * @return Instance of UURI.
     * @throws URIException
     */
    private UURI create(String uri) throws URIException {
        final String defaultCharset = UURI.getDefaultProtocolCharset();
        return create(uri, defaultCharset);
    }

    /**
     * @param uri URI as string.
     * @param charset Original encoding of the string.
     * @return Instance of UURI.
* @throws URIException     */    private UURI create(String uri, String charset) throws URIException {        UURI uuri  = new UURI(fixup(uri, null, charset), true, charset);        if (logger.isLoggable(Level.FINE)) {            logger.fine("URI " + uri +                " PRODUCT " + uuri.toString() +                " CHARSET " + charset);        }        return validityCheck(uuri);    }        /**     * @param base UURI to use as a base resolving <code>relative</code>.     * @param relative Relative URI.     * @return Instance of UURI.     * @throws URIException     */    private UURI create(UURI base, String relative) throws URIException {        UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()),            true, base.getProtocolCharset()));        if (logger.isLoggable(Level.FINE)) {            logger.fine(" URI " + relative +                " PRODUCT " + uuri.toString() +                " CHARSET " + base.getProtocolCharset() +                " BASE " + base);        }        return validityCheck(uuri);    }    /**     * Check the generated UURI.     *      * At the least look at length of uuri string.  We were seeing case     * where before escaping, string was &lt; MAX_URL_LENGTH but after was     * &gt;.  Letting out a too-big message was causing us troubles later     * down the processing chain.     * @param uuri Created uuri to check.     * @return The passed <code>uuri</code> so can easily inline this check.     * @throws URIException     */    protected UURI validityCheck(UURI uuri) throws URIException {        if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {           throw new URIException("Created (escaped) uuri > " +              UURI.MAX_URL_LENGTH +": "+uuri.toString());        }        return uuri;    }        /**     * Do heritrix fix-up on passed uri string.     *     * Does heritrix escaping; usually escaping done to make our behavior align     * with IEs.  
This method codifies our experience pulling URIs from the     * wilds.  Its does all the escaping we want; its output can always be     * assumed to be 'escaped' (though perhaps to a laxer standard than the      * vanilla HttpClient URI class or official specs might suggest).      *     * @param uri URI as string.     * @param base May be null.     * @param e True if the uri is already escaped.     * @return A fixed up URI string.     * @throws URIException     */    private String fixup(String uri, final URI base, final String charset)    throws URIException {        if (uri == null) {            throw new NullPointerException();        } else if (uri.length() == 0 && base == null) {            throw new URIException("URI length is zero (and not relative).");        }                if (uri.length() > UURI.MAX_URL_LENGTH) {            // We check length here and again later after all convertions.            throw new URIException("URI length > " + UURI.MAX_URL_LENGTH +                ": " + uri);        }                // Replace nbsp with normal spaces (so that they get stripped if at        // ends, or encoded if in middle)        if (uri.indexOf(NBSP) >= 0) {            uri = TextUtils.replaceAll(NBSP, uri, SPACE);        }                // Get rid of any trailing spaces or new-lines.         uri = uri.trim();                // IE actually converts backslashes to slashes rather than to %5C.        // Since URIs that have backslashes usually work only with IE, we will        // convert backslashes to slashes as well.        // TODO: Maybe we can first convert backslashes by specs and than by IE        // so that we fetch both versions.        if (uri.indexOf(BACKSLASH) >= 0) {            uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);        }                // Kill newlines etc        uri = TextUtils.replaceAll(NEWLINE, uri, EMPTY_STRING);                // Test for the case of more than two slashes after the http(s) scheme.        
// Replace with two slashes as mozilla does if found.        // See [ 788219 ] URI Syntax Errors stop page parsing.        Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri);        if (matcher.matches()) {            uri = matcher.group(1) + matcher.group(2);        }        // now, minimally escape any whitespace        uri = escapeWhitespace(uri);                // For further processing, get uri elements.  See the RFC2396REGEX        // comment above for explaination of group indices used in the below.        matcher = RFC2396REGEX.matcher(uri);        if (!matcher.matches()) {            throw new URIException("Failed parse of " + uri);        }        String uriScheme = checkUriElementAndLowerCase(matcher.group(2));        String uriSchemeSpecificPart = checkUriElement(matcher.group(3));        String uriAuthority = checkUriElement(matcher.group(5));        String uriPath = checkUriElement(matcher.group(6));        String uriQuery = checkUriElement(matcher.group(8));        // UNUSED String uriFragment = checkUriElement(matcher.group(10));                // If a scheme, is it a supported scheme?        if (uriScheme != null && uriScheme.length() > 0 &&                this.schemes != null) {            if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) {                // unsupported; see if silently ignored                if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) {                    throw new URIException(                            IGNORED_SCHEME, "Ignored scheme: " + uriScheme);                } else {                    throw new URIException("Unsupported scheme: " + uriScheme);                }            }        }                // Test if relative URI. If so, need a base to resolve against.        
if (uriScheme == null || uriScheme.length() <= 0) {            if (base == null) {                throw new URIException("Relative URI but no base: " + uri);            }        }                // fixup authority portion: lowercase/IDN-punycode any domain;         // remove stray trailing spaces        uriAuthority = fixupAuthority(uriAuthority);        // Do some checks if absolute path.        if (uriSchemeSpecificPart != null &&                uriSchemeSpecificPart.startsWith(SLASH)) {            if (uriPath != null) {                // Eliminate '..' if its first thing in the path.  IE does this.                uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath,                    SLASH);            }            // Ensure root URLs end with '/': browsers always send "/"            // on the request-line, so we should consider "http://host"            // to be "http://host/".            if (uriPath == null || EMPTY_STRING.equals(uriPath)) {                uriPath = SLASH;            }        }        if (uriAuthority != null) {            if (uriScheme != null && uriScheme.length() > 0 &&                    uriScheme.equals(HTTP)) {                uriAuthority = checkPort(uriAuthority);                uriAuthority = stripTail(uriAuthority, HTTP_PORT);            } else if (uriScheme != null && uriScheme.length() > 0 &&                    uriScheme.equals(HTTPS)) {                uriAuthority = checkPort(uriAuthority);                uriAuthority = stripTail(uriAuthority, HTTPS_PORT);            }            // Strip any prefix dot or tail dots from the authority.            uriAuthority = stripTail(uriAuthority, DOT);            uriAuthority = stripPrefix(uriAuthority, DOT);        } else {            // no authority; may be relative. 
consider stripping scheme            // to work-around org.apache.commons.httpclient.URI bug            // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )            if (uriScheme != null && base != null                    && uriScheme.equals(base.getScheme())) {                // uriScheme redundant and will only confound httpclient.URI                uriScheme = null;             }        }                // Ensure minimal escaping. Use of 'lax' URI and URLCodec         // means minimal escaping isn't necessarily complete/consistent.        // There is a chance such lax encoding will throw exceptions        // later at inconvenient times.         //        // One reason for these bad escapings -- though not the only --        // is that the page is using an encoding other than the ASCII or the        // UTF-8 that is our default URI encoding.  In this case the parent        // class is burping on the passed URL encoding.  If the page encoding        // was passed into this factory, the encoding seems to be parsed        // correctly (See the testEscapedEncoding unit test).        //        // This fixup may cause us to miss content.  There is the charset case        // noted above.  TODO: Look out for cases where we fail other than for        // the above given reason which will be fixed when we address        // '[ 913687 ] Make extractors interrogate for charset'.        uriPath = ensureMinimalEscaping(uriPath, charset);        uriQuery = ensureMinimalEscaping(uriQuery, charset,            LaxURLCodec.QUERY_SAFE);        // Preallocate.  The '1's and '2's in below are space for ':',        // '//', etc. URI characters.        MutableString s = new MutableString(            ((uriScheme != null)? uriScheme.length(): 0)            + 1 // ';'             + ((uriAuthority != null)? uriAuthority.length(): 0)            + 2 // '//'            + ((uriPath != null)? uriPath.length(): 0)            + 1 // '?'            + ((uriQuery != null)? 
uriQuery.length(): 0));        appendNonNull(s, uriScheme, ":", true);        appendNonNull(s, uriAuthority, "//", false);        appendNonNull(s, uriPath, "", false);        appendNonNull(s, uriQuery, "?", false);        return s.toString();    }
💿 文件大小 18588 K
👤 上传用户 bonylee_java
📂 所属分类 Jsp/Servlet
🏷️ 相关标签

#工程
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -