⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
            "File to preload cookies from", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,            "When crawl finishes save cookies to this file", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_TRUST,            "SSL certificate trust level.  Range is from the default 'open'"            + " (trust all certs including expired, selfsigned, and those for"            + " which we do not have a CA) through 'loose' (trust all valid"            + " certificates including selfsigned), 'normal' (all valid"            + " certificates not including selfsigned) to 'strict' (Cert is"            + " valid and DN must match servername)",            ConfigurableX509TrustManager.DEFAULT,            ConfigurableX509TrustManager.LEVELS_AS_ARRAY));        e.setOverrideable(false);        e.setExpertSetting(true);        e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,            "Accept Headers to include in each request. Each must be the"            + " complete header, e.g., 'Accept-Language: en'"));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,            "Proxy host IP (set only if needed).", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,            "Proxy port (set only if needed)", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,            "The character encoding to use for files that do not have one" +            " specified in the HTTP response headers.  Default: " +            DEFAULT_CONTENT_CHARSET + ".",            DEFAULT_CONTENT_CHARSET));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SHA1_CONTENT,                "Whether or not to perform an on-the-fly SHA1 hash of" +                "retrieved content-bodies.",                DEFAULT_SHA1_CONTENT));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,            "Send 'Connection: close' header with every request.",             new Boolean(true)));        e.setOverrideable(true);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,             "Send 'Referer' header with every request.\n" +             "The 'Referer' header contans the location the crawler came " +             " from, " +             "the page the current URI was discovered in. The 'Referer' " +             "usually is " +             "logged on the remote server and can be of assistance to " +             "webmasters trying to figure how a crawler got to a " +             "particular area on a site.",             new Boolean(true)));        e.setOverrideable(true);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,              "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +              ") on document size.\n" +              "Be polite to the HTTP servers and send the 'Range' header," +              "stating that you are only interested in the first n bytes. " +              "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +              "Sending the 'Range' header results in a " +              "'206 Partial Content' status response, which is better than " +              "just cutting the response mid-download. On rare occasion, " +              " sending 'Range' will " +              "generate '416 Request Range Not Satisfiable' response.",              new Boolean(false)));           e.setOverrideable(true);           e.setExpertSetting(true);           e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS,               "Local IP address or hostname to use when making connections " +               "(binding sockets). When not specified, uses default local" +               "address(es).", ""));           e.setExpertSetting(true);    }    protected void innerProcess(final CrawlURI curi)    throws InterruptedException {        if (!canFetch(curi)) {            // Cannot fetch this, due to protocol, retries, or other problems            return;        }        this.curisHandled++;        // Note begin time        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());        // Get a reference to the HttpRecorder that is set into this ToeThread.        HttpRecorder rec = HttpRecorder.getHttpRecorder();                // Shall we get a digest on the content downloaded?        boolean sha1Content = ((Boolean)getUncheckedAttribute(curi,            ATTR_SHA1_CONTENT)).booleanValue();        if(sha1Content) {            rec.getRecordedInput().setSha1Digest();        } else {            // clear            rec.getRecordedInput().setDigest(null);        }                // Below we do two inner classes that add check of midfetch        // filters just as we're about to receive the response body.        String curiString = curi.getUURI().toString();        HttpMethodBase method = null;        if (curi.isPost()) {            method = new HttpRecorderPostMethod(curiString, rec) {                protected void readResponseBody(HttpState state,                        HttpConnection conn)                throws IOException, HttpException {                    addResponseContent(this, curi);                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {                        doAbort(curi, this, MIDFETCH_ABORT_LOG);                    } else {                        super.readResponseBody(state, conn);                    }                }            };        } else {            method = new HttpRecorderGetMethod(curiString, rec) {                protected void readResponseBody(HttpState state,                        HttpConnection conn)                throws IOException, HttpException {                    addResponseContent(this, curi);                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,                            conn)) {                        doAbort(curi, this, MIDFETCH_ABORT_LOG);                    } else {                        super.readResponseBody(state, conn);                    }                }            };        }        HostConfiguration customConfigOrNull = configureMethod(curi, method);                // Set httpRecorder into curi. Subsequent code both here and later        // in extractors expects to find the HttpRecorder in the CrawlURI.        curi.setHttpRecorder(rec);                // Populate credentials. Set config so auth. is not automatic.        boolean addedCredentials = populateCredentials(curi, method);        method.setDoAuthentication(addedCredentials);                try {            this.http.executeMethod(customConfigOrNull, method);        } catch (RecorderTooMuchHeaderException ex) {            // when too much header material, abort like other truncations            doAbort(curi, method, HEADER_TRUNC);        } catch (IOException e) {        	failedExecuteCleanup(method, curi, e);        	return;        } catch (ArrayIndexOutOfBoundsException e) {            // For weird windows-only ArrayIndex exceptions in native            // code... see            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356            // treating as if it were an IOException            failedExecuteCleanup(method, curi, e);            return;        }                // set softMax on bytes to get (if implied by content-length)         long softMax = method.getResponseContentLength();                // set hardMax on bytes (if set by operator)        long hardMax = getMaxLength(curi);	// Get max fetch rate (bytes/ms). It comes in in KB/sec, which	// requires nothing to normalize.        int maxFetchRate = getMaxFetchRate(curi);        try {            if (!method.isAborted()) {                // Force read-to-end, so that any socket hangs occur here,                // not in later modules.                rec.getRecordedInput().readFullyOrUntil(softMax,                        hardMax, 1000 * getTimeout(curi), maxFetchRate);            }        } catch (RecorderTimeoutException ex) {            doAbort(curi, method, TIMER_TRUNC);        } catch (RecorderLengthExceededException ex) {            doAbort(curi, method, LENGTH_TRUNC);        } catch (IOException e) {            cleanup(curi, e, "readFully", S_CONNECT_LOST);            return;        } catch (ArrayIndexOutOfBoundsException e) {            // For weird windows-only ArrayIndex exceptions from native code            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356            // treating as if it were an IOException            cleanup(curi, e, "readFully", S_CONNECT_LOST);            return;        } finally {            // ensure recording has stopped            rec.closeRecorders();            if (!method.isAborted()) {                method.releaseConnection();            }            // Note completion time            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());            // Set the response charset into the HttpRecord if available.            setCharacterEncoding(rec, method);            curi.setContentSize(rec.getRecordedInput().getSize());        }                curi.setContentDigest(SHA1, rec.getRecordedInput().getDigestValue());        if (logger.isLoggable(Level.INFO)) {            logger.info((curi.isPost()? "POST": "GET") + " " +                curi.getUURI().toString() + " " + method.getStatusCode() +                " " + rec.getRecordedInput().getSize() + " " +                curi.getContentType());        }        if (curi.isSuccess() && addedCredentials) {            // Promote the credentials from the CrawlURI to the CrawlServer            // so they are available for all subsequent CrawlURIs on this            // server.            promoteCredentials(curi);            if (logger.isLoggable(Level.FINE)) {                // Print out the cookie.  Might help with the debugging.                Header setCookie = method.getResponseHeader("set-cookie");                if (setCookie != null) {                    logger.fine(setCookie.toString().trim());                }            }        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {            // 401 is not 'success'.            handle401(method, curi);        }                if (rec.getRecordedInput().isOpen()) {            logger.severe(curi.toString() + " RIS still open. Should have" +                " been closed by method release: " +                Thread.currentThread().getName());            try {                rec.getRecordedInput().close();            } catch (IOException e) {                logger.log(Level.SEVERE,"second-chance RIS close failed",e);            }        }    }        protected void doAbort(CrawlURI curi, HttpMethod method,            String annotation) {        curi.addAnnotation(annotation);        curi.getHttpRecorder().close();        method.abort();    }        protected boolean checkMidfetchAbort(CrawlURI curi,            HttpRecorderMethod method, HttpConnection conn) {        if (curi.isPrerequisite() || filtersAccept(midfetchfilters, curi)) {            return false;        }        method.markContentBegin(conn);        return true;    }        /**     * This method populates <code>curi</code> with response status and     * content type.     * @param curi CrawlURI to populate.     * @param method Method to get response status and headers from.     */    protected void addResponseContent (HttpMethod method, CrawlURI curi) {        curi.setFetchStatus(method.getStatusCode());        Header ct = method.getResponseHeader("content-type");        curi.setContentType((ct == null)? null: ct.getValue());        // Save method into curi too.  Midfetch filters may want to leverage        // info in here.        curi.putObject(A_HTTP_TRANSACTION, method);    }    /**     * Set the character encoding based on the result headers or default.     *     * The HttpClient returns its own default encoding ("ISO-8859-1") if one     * isn't specified in the Content-Type response header. We give the user     * the option of overriding this, so we need to detect the case where the     * default is returned.     *     * Now, it may well be the case that the default returned by HttpClient     * and the default defined by the user are the same.     *      * @param rec Recorder for this request.     * @param method Method used for the request.     */    private void setCharacterEncoding(final HttpRecorder rec,        final HttpMethod method) {        String encoding = null;        try {            encoding = ((HttpMethodBase) method).getResponseCharSet();            if (encoding == null ||                    encoding.equals(DEFAULT_CONTENT_CHARSET)) {                encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);            }        } catch (Exception e) {            logger.warning("Failed get default encoding: " +                e.getLocalizedMessage());        }        rec.setCharacterEncoding(encoding);    }    /**     * Cleanup after a failed method execute.     * @param curi CrawlURI we failed on.     * @param method Method we failed on.     * @param exception Exception we failed with.     */    private void failedExecuteCleanup(final HttpMethod method,            final CrawlURI curi, final Exception exception) {        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);        method.releaseConnection();    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -