⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 laxuri.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* LaxURI** $Id: LaxURI.java,v 1.6 2006/07/18 00:40:16 gojomo Exp $** Created on Aug 3, 2005** Copyright (C) 2005 Internet Archive.** This file is part of the Heritrix web crawler (crawler.archive.org).** Heritrix is free software; you can redistribute it and/or modify* it under the terms of the GNU Lesser Public License as published by* the Free Software Foundation; either version 2.1 of the License, or* any later version.** Heritrix is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU Lesser Public License for more details.** You should have received a copy of the GNU Lesser Public License* along with Heritrix; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/ package org.archive.net;import java.util.Arrays;import java.util.BitSet;import org.apache.commons.httpclient.URI;import org.apache.commons.httpclient.URIException;import org.apache.commons.httpclient.util.EncodingUtil;/** * URI subclass which allows partial/inconsistent encoding, matching * the URIs which will be relayed in requests from popular web * browsers (esp. Mozilla Firefox and MS IE). *  * @author gojomo */public class LaxURI extends URI {    final protected static char[] HTTP_SCHEME = {'h','t','t','p'};    final protected static char[] HTTPS_SCHEME = {'h','t','t','p','s'};        protected static final BitSet lax_rel_segment = new BitSet(256);    // Static initializer for lax_rel_segment    static {        lax_rel_segment.or(rel_segment);        lax_rel_segment.set(':'); // allow ':'        // TODO: add additional allowances as need is demonstrated    }    protected static final BitSet lax_abs_path = new BitSet(256);    static {        lax_abs_path.or(abs_path);        lax_abs_path.set('|'); // tests indicate Firefox (1.0.6) doesn't escape.    }        protected static final BitSet lax_query = new BitSet(256);    static {        lax_query.or(query);        lax_query.set('{'); // tests indicate FF doesn't escape { in query        lax_query.set('}'); // tests indicate FF doesn't escape } in query        lax_query.set('|'); // tests indicate FF doesn't escape | in query        lax_query.set('['); // tests indicate FF doesn't escape [ in query        lax_query.set(']'); // tests indicate FF doesn't escape ] in query        lax_query.set('^'); // tests indicate FF doesn't escape ^ in query    }        // passthrough initializers    public LaxURI(String uri, boolean escaped, String charset)    throws URIException {        super(uri,escaped,charset);    }    public LaxURI(URI base, URI relative) throws URIException {        super(base,relative);    }    public LaxURI(String uri, boolean escaped) throws URIException {        super(uri,escaped);    }    public LaxURI() {        super();    }    // overridden to use this class's static decode()    public String getURI() throws URIException {        return (_uri == null) ? null : decode(_uri, getProtocolCharset());    }        // overridden to use this class's static decode()    public String getPath() throws URIException {        char[] p = getRawPath();        return (p == null) ? null : decode(p, getProtocolCharset());    }    // overridden to use this class's static decode()    public String getPathQuery() throws URIException {        char[] rawPathQuery = getRawPathQuery();        return (rawPathQuery == null) ? null : decode(rawPathQuery,                getProtocolCharset());    }    // overridden to use this class's static decode()    protected static String decode(char[] component, String charset)            throws URIException {        if (component == null) {            throw new IllegalArgumentException(                    "Component array of chars may not be null");        }        return decode(new String(component), charset);    }    // overridden to use IA's LaxURLCodec, which never throws DecoderException    protected static String decode(String component, String charset)            throws URIException {        if (component == null) {            throw new IllegalArgumentException(                    "Component array of chars may not be null");        }        byte[] rawdata = null;        //     try {        rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil                .getAsciiBytes(component));        //     } catch (DecoderException e) {        //         throw new URIException(e.getMessage());        //     }        return EncodingUtil.getString(rawdata, charset);    }        // overidden to lax() the acceptable-char BitSet passed in    protected boolean validate(char[] component, BitSet generous) {        return super.validate(component, lax(generous));    }    // overidden to lax() the acceptable-char BitSet passed in    protected boolean validate(char[] component, int soffset, int eoffset,            BitSet generous) {        return super.validate(component, soffset, eoffset, lax(generous));    }        /**     * Given a BitSet -- typically one of the URI superclass's     * predefined static variables -- possibly replace it with     * a more-lax version to better match the character sets     * actually left unencoded in web browser requests     *      * @param generous original BitSet     * @return (possibly more lax) BitSet to use     */    protected BitSet lax(BitSet generous) {        if (generous == rel_segment) {            // Swap in more lax allowable set            return lax_rel_segment;        }        if (generous == abs_path) {            return lax_abs_path;        }        if (generous == query) {            return lax_query;        }        // otherwise, leave as is        return generous;    }        /**      * Coalesce the _host and _authority fields where      * possible.     *      * In the web crawl/http domain, most URIs have an      * identical _host and _authority. (There is no port     * or user info.) However, the superclass always      * creates two separate char[] instances.      *      * Notably, the lengths of these char[] fields are      * equal if and only if their values are identical.     * This method makes use of this fact to reduce the     * two instances to one where possible, slimming      * instances.       *      * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)     */    protected void parseAuthority(String original, boolean escaped)            throws URIException {        super.parseAuthority(original, escaped);        if (_host != null && _authority != null                && _host.length == _authority.length) {            _host = _authority;        }    }            /**      * Coalesce _scheme to existing instances, where appropriate.     *      * In the web-crawl domain, most _schemes are 'http' or 'https',     * but the superclass always creates a new char[] instance. For     * these two cases, we replace the created instance with a      * long-lived instance from a static field, saving 12-14 bytes     * per instance.      *      * @see org.apache.commons.httpclient.URI#setURI()     */    protected void setURI() {        if (_scheme != null) {            if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) {                _scheme = HTTP_SCHEME;            } else if (_scheme.length == 5                    && Arrays.equals(_scheme, HTTP_SCHEME)) {                _scheme = HTTPS_SCHEME;            }        }        super.setURI();    }        /**     * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR      * http://issues.apache.org/jira/browse/HTTPCLIENT-588     *      * In order to avoid any possilbity of conflict with non-ASCII characters,     * Parse a URI reference as a <code>String</code> with the character     * encoding of the local system or the document.     * <p>     * The following line is the regular expression for breaking-down a URI     * reference into its components.     * <p><blockquote><pre>     *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -