📄 uurifactory.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* UURIFactory * * $Id: UURIFactory.java,v 1.12 2006/07/18 00:40:16 gojomo Exp $ * * Created on July 16, 2004 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.net;import gnu.inet.encoding.IDNA;import gnu.inet.encoding.IDNAException;import it.unimi.dsi.mg4j.util.MutableString;import java.io.UnsupportedEncodingException;import java.util.Arrays;import java.util.BitSet;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.httpclient.URI;import org.apache.commons.httpclient.URIException;import org.archive.util.TextUtils;/** * Factory that returns UURIs. *  * Does escaping and fixup on URIs massaging in accordance with RFC2396 * and to match browser practice. For example, it removes any * '..' if first thing in the path as per IE,  converts backslashes to forward * slashes, and discards any 'fragment'/anchor portion of the URI. This * class will also fail URIs if they are longer than IE's allowed maximum * length. *  * <p>TODO: Test logging. *  * @author stack */public class UURIFactory extends URI {        /**     * Logging instance.     
*/
    private static Logger logger =
        Logger.getLogger(UURIFactory.class.getName());

    /**
     * The single instance of this factory.
     *
     * <p>NOTE: this field is declared ahead of the other static fields, so
     * the constructor runs while class initialization is still in progress.
     * Static fields declared after this one whose initializers are not
     * compile-time constants (for example the compiled {@link Pattern}s)
     * have not been assigned yet at that point; the String literal
     * constants are unaffected.
     */
    private static final UURIFactory factory = new UURIFactory();

    /**
     * RFC 2396-inspired regex.
     *
     * From the RFC Appendix B:
     * <pre>
     * URI Generic Syntax                August 1998
     *
     * B. Parsing a URI Reference with a Regular Expression
     *
     * As described in Section 4.3, the generic URI syntax is not sufficient
     * to disambiguate the components of some forms of URI.  Since the
     * "greedy algorithm" described in that section is identical to the
     * disambiguation method used by POSIX regular expressions, it is
     * natural and commonplace to use a regular expression for parsing the
     * potential four components and fragment identifier of a URI reference.
     *
     * The following line is the regular expression for breaking-down a URI
     * reference into its components.
     *
     * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     *  12            3  4          5       6  7        8 9
     *
     * The numbers in the second line above are only to assist readability;
     * they indicate the reference points for each subexpression (i.e., each
     * paired parenthesis).  We refer to the value matched for subexpression
     * <n> as $<n>.  For example, matching the above expression to
     *
     * http://www.ics.uci.edu/pub/ietf/uri/#Related
     *
     * results in the following subexpression matches:
     *
     * $1 = http:
     * $2 = http
     * $3 = //www.ics.uci.edu
     * $4 = www.ics.uci.edu
     * $5 = /pub/ietf/uri/
     * $6 = <undefined>
     * $7 = <undefined>
     * $8 = #Related
     * $9 = Related
     *
     * where <undefined> indicates that the component is not present, as is
     * the case for the query component in the above example.  Therefore, we
     * can determine the value of the four components and fragment as
     *
     * scheme    = $2
     * authority = $4
     * path      = $5
     * query     = $7
     * fragment  = $9
     * </pre>
     *
     * --
     * <p>Below differs from the rfc regex in that it has java escaping of
     * regex characters and we allow a URI made of a fragment only (Added extra
     * group so indexing is off by one after scheme).
     */
    final static Pattern RFC2396REGEX = Pattern.compile(
        "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
    //    12            34  5          6       7   8          9 A
    //              2 1             54        6          87 3      A9
    // First row marks where each group opens, second row where it closes
    // ('A' stands for group 10).
    // 1: scheme:   (scheme including the trailing colon)
    // 2: scheme
    // 3: //authority/path?query   (the extra group added relative to the RFC)
    // 4: //authority
    // 5: authority
    // 6: path
    // 7: ?query
    // 8: query
    // 9: #fragment
    // A: fragment

    // Regex: one or more "/../" sequences at the start of the input.
    public static final String SLASHDOTDOTSLASH = "^(/\\.\\./)+";
    public static final String SLASH = "/";
    public static final String HTTP = "http";
    public static final String HTTP_PORT = ":80";
    public static final String HTTPS = "https";
    public static final String HTTPS_PORT = ":443";
    public static final String DOT = ".";
    public static final String EMPTY_STRING = "";
    // Non-breaking space, written as a unicode escape.
    public static final String NBSP = "\u00A0";
    public static final String SPACE = " ";
    public static final String ESCAPED_SPACE = "%20";
    // Regex: any text followed by one or more "%20" at the end of the input.
    // NOTE(review): "(.*)" is greedy, so group 1 keeps every trailing "%20"
    // except the last one -- confirm the code using this pattern expects that.
    public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";

    // For each character below: the raw character, where present a *_PATTERN
    // variant holding the same character backslash-escaped for use as a
    // literal inside a regular expression, and its percent-encoded form.
    public static final String PIPE = "|";
    public static final String PIPE_PATTERN = "\\|";
    public static final String ESCAPED_PIPE = "%7C";
    public static final String CIRCUMFLEX = "^";
    public static final String CIRCUMFLEX_PATTERN = "\\^";
    public static final String ESCAPED_CIRCUMFLEX = "%5E";
    public static final String QUOT = "\"";
    public static final String ESCAPED_QUOT = "%22";
    public static final String SQUOT = "'";
    public static final String ESCAPED_SQUOT = "%27";
    // Despite the name, this is the backtick (grave accent) character;
    // %60 is its percent-encoding.
    public static final String APOSTROPH = "`";
    public static final String ESCAPED_APOSTROPH = "%60";
    public static final String LSQRBRACKET = "[";
    public static final String LSQRBRACKET_PATTERN = "\\[";
    public static final String ESCAPED_LSQRBRACKET = "%5B";
    public static final String RSQRBRACKET = "]";
    public static final String RSQRBRACKET_PATTERN = "\\]";
    public static final String ESCAPED_RSQRBRACKET = "%5D";
    public static final String LCURBRACKET = "{";
    public static final String LCURBRACKET_PATTERN = "\\{";
    public static final String ESCAPED_LCURBRACKET = "%7B";
    public static final String RCURBRACKET = "}";
    public static final String RCURBRACKET_PATTERN = "\\}";
    public static final String ESCAPED_RCURBRACKET = "%7D";
    public static final String BACKSLASH = "\\";
    public static final String BACKSLASH_PATTERN = "\\\\";
    public static final String ESCAPED_BACKSLASH = "%5C";
    // Alternation of "one or more line feeds" or "one or more carriage
    // returns"; presumably applied as a regex -- usage is not visible here.
    public static final String NEWLINE = "\n+|\r+";
    // Replacement text: a percent-encoded percent sign ("%25") followed by
    // whatever group 1 captured. Presumably paired with IMPROPERESC below.
    public static final String IMPROPERESC_REPLACE = "%25$1";
    // Regex: a '%' that is NOT followed by two hex digits. Group 1 captures
    // what follows the '%': a single non-hex character, any character plus a
    // non-hex character, or nothing at all when the '%' ends the input.
    public static final String IMPROPERESC =
        "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))";
    public static final String COMMERCIAL_AT = "@";
    public static final char PERCENT_SIGN = '%';
    public static final char COLON = ':';

    /**
     * First percent sign in string followed by two hex chars.
     */
    public static final String URI_HEX_ENCODING =
        "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*";

    /**
     * Authority port number regex.
     *
     * Group 1 is everything up to and including the last colon; group 2 is
     * the run of digits that ends the input.
     */
    final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");

    /**
     * Characters we'll accept in the domain label part of a URI
     * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
     * with single intervening '.' characters.
     *
     * (We accept '_' because DNS servers have tolerated for many
     * years counter to spec; we also accept dash patterns and ACE
     * prefixes that will be rejected by IDN-punycoding attempt.)
*/
    static final String ACCEPTABLE_ASCII_DOMAIN =
        "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$";

    /**
     * Matches an http or https URI that carries extra slashes directly
     * after the scheme's "//" (three or more slashes in total).  Per the
     * original author's note, such runs are replaced with two slashes only,
     * as mozilla does.  Group 1 is the scheme plus "//"; group 2 is
     * everything after the surplus slashes.
     */
    static final Pattern HTTP_SCHEME_SLASHES =
        Pattern.compile("^(https?://)/+(.*)");

    /**
     * Matches any run of two or more consecutive slashes, as found in a
     * path.
     */
    static final Pattern MULTIPLE_SLASHES = Pattern.compile("//+");

    /**
     * Suffix appended to this class's name to form the system property key
     * that lists the supported schemes.
     */
    private static final String SCHEMES_KEY = ".schemes";

    /**
     * Suffix appended to this class's name to form the system property key
     * that lists the purposefully-ignored schemes.
     */
    private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes";

    /**
     * Supported schemes in sorted order, or null when the corresponding
     * system property is unset or empty.
     */
    private String[] schemes = null;

    /**
     * Ignored schemes in sorted order, or null when the corresponding
     * system property is unset or empty.
     */
    private String[] ignoredSchemes = null;

    // NOTE(review): presumably a marker/error code reported when a URI's
    // scheme is on the ignored list -- its use is not visible in this part
    // of the file, confirm against the rest of the class.
    public static final int IGNORED_SCHEME = 9999999;

    /**
     * Private constructor; the only instance is the one held in the static
     * {@code factory} field.
     *
     * <p>Reads two optional system properties, each named after this class
     * plus a key suffix, splits their values on runs of commas and/or
     * spaces, and stores the sorted results.  Because this runs during
     * class initialization, it relies only on compile-time String constants.
     */
    private UURIFactory() {
        super();
        final String propertyPrefix = this.getClass().getName();

        final String supported =
            System.getProperty(propertyPrefix + SCHEMES_KEY);
        if (supported != null && supported.length() > 0) {
            final String[] parsed = supported.split("[, ]+");
            Arrays.sort(parsed);
            this.schemes = parsed;
        }

        final String skipped =
            System.getProperty(propertyPrefix + IGNORED_SCHEMES_KEY);
        if (skipped != null && skipped.length() > 0) {
            final String[] parsed = skipped.split("[, ]+");
            Arrays.sort(parsed);
            this.ignoredSchemes = parsed;
        }
    }

    /**
     * Builds a UURI from the given string by delegating to the shared
     * factory instance.
     *
     * @param uri URI as string.
     * @return An instance of UURI
     * @throws URIException
     */
    public static UURI getInstance(String uri) throws URIException {
        final UURIFactory shared = UURIFactory.factory;
        return shared.create(uri);
    }

    /**
     * @param uri URI as string.
     * @param charset Character encoding of the passed uri string.
     * @return An instance of UURI
     * @throws URIException
     */
12 3 下一页
💿 文件大小 18588 K
👤 上传用户 bonylee_java
📂 所属分类 Jsp/Servlet
🏷️ 相关标签

#工程
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -