laxuri.java

来自「Heritrix是一个开源、可扩展的web爬虫项目。Heritrix被设计为严格遵循robots.txt文件的排除指示和META robots标签。」· Java 代码 · 共 452 行 · 第 1/2 页

JAVA

452 行

     * <p><blockquote><pre>     *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?     *    12            3  4          5       6  7        8 9     * </pre></blockquote><p>     * For example, matching the above expression to     *   http://jakarta.apache.org/ietf/uri/#Related     * results in the following subexpression matches:     * <p><blockquote><pre>     *               $1 = http:     *  scheme    =  $2 = http     *               $3 = //jakarta.apache.org     *  authority =  $4 = jakarta.apache.org     *  path      =  $5 = /ietf/uri/     *               $6 = <undefined>     *  query     =  $7 = <undefined>     *               $8 = #Related     *  fragment  =  $9 = Related     * </pre></blockquote><p>     *     * @param original the original character sequence     * @param escaped <code>true</code> if <code>original</code> is escaped     * @throws URIException If an error occurs.     */    protected void parseUriReference(String original, boolean escaped)        throws URIException {        // validate and contruct the URI character sequence        if (original == null) {            throw new URIException("URI-Reference required");        }        /* @         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         */        String tmp = original.trim();                /*         * The length of the string sequence of characters.         * It may not be equal to the length of the byte array.         */        int length = tmp.length();        /*         * Remove the delimiters like angle brackets around an URI.         
*/        if (length > 0) {            char[] firstDelimiter = { tmp.charAt(0) };            if (validate(firstDelimiter, delims)) {                if (length >= 2) {                    char[] lastDelimiter = { tmp.charAt(length - 1) };                    if (validate(lastDelimiter, delims)) {                        tmp = tmp.substring(1, length - 1);                        length = length - 2;                    }                }            }        }        /*         * The starting index         */        int from = 0;        /*         * The test flag whether the URI is started from the path component.         */        boolean isStartedFromPath = false;        int atColon = tmp.indexOf(':');        int atSlash = tmp.indexOf('/');        if ((atColon <= 0 && !tmp.startsWith("//"))            || (atSlash >= 0 && atSlash < atColon)) {            isStartedFromPath = true;        }        /*         * <p><blockquote><pre>         *     @@@@@@@@         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         * </pre></blockquote><p>         */        int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);        if (at == -1) {             at = 0;        }        /*         * Parse the scheme.         * <p><blockquote><pre>         *  scheme    =  $2 = http         *              @         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         * </pre></blockquote><p>         */        if (at > 0 && at < length && tmp.charAt(at) == ':') {            char[] target = tmp.substring(0, at).toLowerCase().toCharArray();            if (validate(target, scheme)) {                _scheme = target;            } else {                throw new URIException("incorrect scheme");            }            from = ++at;        }        /*         * Parse the authority component.         
* <p><blockquote><pre>         *  authority =  $4 = jakarta.apache.org         *                  @@         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         * </pre></blockquote><p>         */        // Reset flags        _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;        if (0 <= at && at < length && tmp.charAt(at) == '/') {            // Set flag            _is_hier_part = true;            if (at + 2 < length && tmp.charAt(at + 1) == '/'                 && !isStartedFromPath) {                // the temporary index to start the search from                int next = indexFirstOf(tmp, "/?#", at + 2);                if (next == -1) {                    next = (tmp.substring(at + 2).length() == 0) ? at + 2                         : tmp.length();                }                parseAuthority(tmp.substring(at + 2, next), escaped);                from = at = next;                // Set flag                _is_net_path = true;            }            if (from == at) {                // Set flag                _is_abs_path = true;            }        }        /*         * Parse the path component.         * <p><blockquote><pre>         *  path      =  $5 = /ietf/uri/         *                                @@@@@@         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         
* </pre></blockquote><p>         */        if (from < length) {            // rel_path = rel_segment [ abs_path ]            int next = indexFirstOf(tmp, "?#", from);            if (next == -1) {                next = tmp.length();            }            if (!_is_abs_path) {                if (!escaped                     && prevalidate(tmp.substring(from, next), disallowed_rel_path)                     || escaped                     && validate(tmp.substring(from, next).toCharArray(), rel_path)) {                    // Set flag                    _is_rel_path = true;                } else if (!escaped                     && prevalidate(tmp.substring(from, next), disallowed_opaque_part)                     || escaped                     && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {                    // Set flag                    _is_opaque_part = true;                } else {                    // the path component may be empty                    _path = null;                }            }            String s = tmp.substring(from, next);            if (escaped) {                setRawPath(s.toCharArray());            } else {                setPath(s);            }            at = next;        }        // set the charset to do escape encoding        String charset = getProtocolCharset();        /*         * Parse the query component.         * <p><blockquote><pre>         *  query     =  $7 = <undefined>         *                                        @@@@@@@@@         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         
* </pre></blockquote><p>         */        if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {            int next = tmp.indexOf('#', at + 1);            if (next == -1) {                next = tmp.length();            }            if (escaped) {                _query = tmp.substring(at + 1, next).toCharArray();                if (!validate(_query, query)) {                    throw new URIException("Invalid query");                }            } else {                _query = encode(tmp.substring(at + 1, next), allowed_query, charset);            }            at = next;        }        /*         * Parse the fragment component.         * <p><blockquote><pre>         *  fragment  =  $9 = Related         *                                                   @@@@@@@@         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?         * </pre></blockquote><p>         */        if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {            if (at + 1 == length) { // empty fragment                _fragment = "".toCharArray();            } else {                _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()                     : encode(tmp.substring(at + 1), allowed_fragment, charset);            }        }        // set this URI.        setURI();    }    }

laxuri.java - 源码说明

本页面展示了「Heritrix是一个开源、可扩展的web爬虫项目。Heritrix被设计为严格遵循robots.txt文件的排除指示和META robots标签。」中的 laxuri.java 源码文件，采用 Java 编程语言编写，共 452 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与Heritrix相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?