⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uurifactory.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
    /**     * Fixup 'authority' portion of URI, by removing any stray      * encoded spaces, lowercasing any domain names, and applying     * IDN-punycoding to Unicode domains.      *      * @param uriAuthority the authority string to fix     * @return fixed version     * @throws URIException     */    private String fixupAuthority(String uriAuthority) throws URIException {        // Lowercase the host part of the uriAuthority; don't destroy any        // userinfo capitalizations.  Make sure no illegal characters in        // domainlabel substring of the uri authority.        if (uriAuthority != null) {            // Get rid of any trailing escaped spaces:            // http://www.archive.org%20.  Rare but happens.            // TODO: reevaluate: do IE or firefox do such mid-URI space-removal?            // if not, we shouldn't either.             while(uriAuthority.endsWith(ESCAPED_SPACE)) {                uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);            }            // lowercase & IDN-punycode only the domain portion            int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);            int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);            if(atIndex<0 && portColonIndex<0) {                // most common case: neither userinfo nor port                return fixupDomainlabel(uriAuthority);            } else if (atIndex<0 && portColonIndex>-1) {                // next most common: port but no userinfo                String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));                String port = uriAuthority.substring(portColonIndex);                return domain + port;            } else if (atIndex>-1 && portColonIndex<0) {                // uncommon: userinfo, no port                String userinfo = uriAuthority.substring(0,atIndex+1);                String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));                return userinfo + domain;            } else {                // uncommon: userinfo, port                String userinfo = uriAuthority.substring(0,atIndex+1);                String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));                String port = uriAuthority.substring(portColonIndex);                return userinfo + domain + port;            }        }        return uriAuthority;    }        /**     * Fixup the domain label part of the authority.     *      * We're more lax than the spec. in that we allow underscores.     *      * @param label Domain label to fix.     * @return Return fixed domain label.     * @throws URIException     */    private String fixupDomainlabel(String label)    throws URIException {                // apply IDN-punycoding, as necessary        try {            // TODO: optimize: only apply when necessary, or            // keep cache of recent encodings            label = IDNA.toASCII(label);        } catch (IDNAException e) {            if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {                // domain name has ACE prefix, leading/trailing dash, or                 // underscore -- but is still a name we wish to tolerate;                // simply continue            } else {                // problematic domain: neither ASCII acceptable characters                // nor IDN-punycodable, so throw exception                 // TODO: change to HeritrixURIException so distinguishable                // from URIExceptions in library code                URIException ue = new URIException(e+" "+label);                ue.initCause(e);                throw ue;            }        }        label = label.toLowerCase();        return label;    }        /**     * Ensure that there all characters needing escaping     * in the passed-in String are escaped. Stray '%' characters     * are *not* escaped, as per browser behavior.      *      * @param u String to escape     * @param charset      * @return string with any necessary escaping applied     */    private String ensureMinimalEscaping(String u, final String charset) {        return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);    }        /**     * Ensure that there all characters needing escaping     * in the passed-in String are escaped. Stray '%' characters     * are *not* escaped, as per browser behavior.      *      * @param u String to escape     * @param charset      * @param bitset      * @return string with any necessary escaping applied     */    private String ensureMinimalEscaping(String u, final String charset,            final BitSet bitset) {        if (u == null) {            return null;        }        for (int i = 0; i < u.length(); i++) {            char c = u.charAt(i);            if (!bitset.get(c)) {                try {                    u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);                } catch (UnsupportedEncodingException e) {                    e.printStackTrace();                }                break;            }        }        return u;    }    /**     * Escape any whitespace found.     *      * The parent class takes care of the bulk of escaping.  But if any     * instance of escaping is found in the URI, then we ask for parent     * to do NO escaping.  Here we escape any whitespace found irrespective     * of whether the uri has already been escaped.  We do this for     * case where uri has been judged already-escaped only, its been     * incompletly done and whitespace remains.  Spaces, etc., in the URI are     * a real pain.  Their presence will break log file and ARC parsing.     * @param uri URI string to check.     * @return uri with spaces escaped if any found.     */    protected String escapeWhitespace(String uri) {        // Just write a new string anyways.  The perl '\s' is not        // as inclusive as the Character.isWhitespace so there are        // whitespace characters we could miss.  So, rather than        // write some awkward regex, just go through the string        // a character at a time.  Only create buffer first time        // we find a space.        MutableString buffer = null;        for (int i = 0; i < uri.length(); i++) {            char c = uri.charAt(i);            if (Character.isWhitespace(c)) {                if (buffer == null) {                    buffer = new MutableString(uri.length() +                        2 /*If space, two extra characters (at least)*/);                    buffer.append(uri.substring(0, i));                }                buffer.append("%");                String hexStr = Integer.toHexString(c);                if ((hexStr.length() % 2) > 0) {                    buffer.append("0");                }                buffer.append(hexStr);                            } else {                if (buffer != null) {                    buffer.append(c);                }            }        }        return (buffer !=  null)? buffer.toString(): uri;    }    /**     * Check port on passed http authority.  Make sure the size is not larger     * than allowed: See the 'port' definition on this     * page, http://www.kerio.com/manual/wrp/en/418.htm.     * Also, we've seen port numbers of '0080' whose leading zeros confuse     * the parent class. Strip the leading zeros.     *     * @param uriAuthority     * @return Null or an amended port number.     * @throws URIException     */    private String checkPort(String uriAuthority)    throws URIException {        Matcher m = PORTREGEX.matcher(uriAuthority);        if (m.matches()) {            String no = m.group(2);            if (no != null && no.length() > 0) {                // First check if the port has leading zeros                // as in '0080'.  Strip them if it has and                // then reconstitute the uriAuthority.  Be careful                // of cases where port is '0' or '000'.                while (no.charAt(0) == '0' && no.length() > 1) {                    no = no.substring(1);                }                uriAuthority = m.group(1) + no;                // Now makesure the number is legit.                int portNo = Integer.parseInt(no);                if (portNo <= 0 || portNo > 65535) {                    throw new URIException("Port out of bounds: " +                        uriAuthority);                }            }        }        return uriAuthority;    }    /**     * @param b Buffer to append to.     * @param str String to append if not null.     * @param substr Suffix or prefix to use if <code>str</code> is not null.     * @param suffix True if <code>substr</code> is a suffix.     */    private void appendNonNull(MutableString b, String str, String substr,            boolean suffix) {        if (str != null && str.length() > 0) {            if (!suffix) {                b.append(substr);            }            b.append(str);            if (suffix) {                b.append(substr);            }        }    }    /**     * @param str String to work on.     * @param prefix Prefix to strip if present.     * @return <code>str</code> w/o <code>prefix</code>.     */    private String stripPrefix(String str, String prefix) {        return str.startsWith(prefix)?            str.substring(prefix.length(), str.length()):            str;    }    /**     * @param str String to work on.     * @param tail Tail to strip if present.     * @return <code>str</code> w/o <code>tail</code>.     */    private static String stripTail(String str, String tail) {        return str.endsWith(tail)?            str.substring(0, str.length() - tail.length()):            str;    }    /**     * @param element to examine.     * @return Null if passed null or an empty string otherwise     * <code>element</code>.     */    private String checkUriElement(String element) {        return (element == null || element.length() <= 0)? null: element;    }    /**     * @param element to examine and lowercase if non-null.     * @return Null if passed null or an empty string otherwise     * <code>element</code> lowercased.     */    private String checkUriElementAndLowerCase(String element) {        String tmp = checkUriElement(element);        return (tmp != null)? tmp.toLowerCase(): tmp;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -