⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
        }    }        /**     * This method populates <code>curi</code> with response status and     * content type.     * @param curi CrawlURI to populate.     * @param method Method to get response status and headers from.     */    protected void addResponseContent (HttpMethod method, CrawlURI curi) {        curi.setFetchStatus(method.getStatusCode());        Header ct = method.getResponseHeader("content-type");        curi.setContentType((ct == null)? null: ct.getValue());        // Save method into curi too.  Midfetch filters may want to leverage        // info in here.        curi.putObject(A_HTTP_TRANSACTION, method);    }    /**     * Set the character encoding based on the result headers or default.     *     * The HttpClient returns its own default encoding ("ISO-8859-1") if one     * isn't specified in the Content-Type response header. We give the user     * the option of overriding this, so we need to detect the case where the     * default is returned.     *     * Now, it may well be the case that the default returned by HttpClient     * and the default defined by the user are the same.     *      * @param rec Recorder for this request.     * @param method Method used for the request.     */    private void setCharacterEncoding(final HttpRecorder rec,        final HttpMethod method) {        String encoding = null;        try {            encoding = ((HttpMethodBase) method).getResponseCharSet();            if (encoding == null ||                    encoding.equals(DEFAULT_CONTENT_CHARSET)) {                encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);            }        } catch (Exception e) {            logger.warning("Failed get default encoding: " +                e.getLocalizedMessage());        }        rec.setCharacterEncoding(encoding);    }    /**     * Cleanup after a failed method execute.     * @param curi CrawlURI we failed on.     * @param method Method we failed on.     * @param exception Exception we failed with.     
*/    private void failedExecuteCleanup(final HttpMethod method,            final CrawlURI curi, final Exception exception) {        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);        method.releaseConnection();    }        /**     * Cleanup after a failed method execute.     * @param curi CrawlURI we failed on.     * @param exception Exception we failed with.     * @param message Message to log with failure.     * @param status Status to set on the fetch.     */    private void cleanup(final CrawlURI curi, final Exception exception,            final String message, final int status) {        curi.addLocalizedError(this.getName(), exception, message);        curi.setFetchStatus(status);        curi.getHttpRecorder().close();    }    /**     * Can this processor fetch the given CrawlURI. May set a fetch     * status if this processor would usually handle the CrawlURI,     * but cannot in this instance.     *     * @param curi     * @return True if processor can fetch.     */    private boolean canFetch(CrawlURI curi) {        if(curi.getFetchStatus()<0) {            // already marked as errored, this pass through            // skip to end            curi.skipToProcessorChain(getController().getPostprocessorChain());            return false;                     }        String scheme = curi.getUURI().getScheme();         if (!(scheme.equals("http") || scheme.equals("https"))) {             // handles only plain http and https             return false;         }         CrawlHost host = getController().getServerCache().getHostFor(curi);         // make sure the dns lookup succeeded         if (host.getIP() == null && host.hasBeenLookedUp()) {             curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);             return false;         }        return true;    }    /**     * Configure the HttpMethod setting options and headers.     *     * @param curi CrawlURI from which we pull configuration.     * @param method The Method to configure.     
*/    protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {        // Don't auto-follow redirects        method.setFollowRedirects(false);        //        // set soTimeout//        method.getParams().setSoTimeout(//                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))//                        .intValue());                // Set cookie policy.        method.getParams().setCookiePolicy(            (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).                booleanValue())?                    CookiePolicy.IGNORE_COOKIES:                CookiePolicy.BROWSER_COMPATIBILITY);        // Use only HTTP/1.0 (to avoid receiving chunked responses)        method.getParams().setVersion(HttpVersion.HTTP_1_0);        CrawlOrder order = getSettingsHandler().getOrder();        String userAgent = curi.getUserAgent();        if (userAgent == null) {            userAgent = order.getUserAgent(curi);        }        method.setRequestHeader("User-Agent", userAgent);        method.setRequestHeader("From", order.getFrom(curi));                // Set retry handler.        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,            new HeritrixHttpMethodRetryHandler());                final long maxLength = getMaxLength(curi);        if(maxLength > 0 &&                ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).                    
booleanValue()) {            method.addRequestHeader(RANGE,                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));        }                if (((Boolean)getUncheckedAttribute(curi,                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);        }                if (((Boolean)getUncheckedAttribute(curi,                ATTR_SEND_REFERER)).booleanValue()) {            // RFC2616 says no referer header if referer is https and the url            // is not            String via = curi.flattenVia();            if (via != null && via.length() > 0 &&                !(via.startsWith(HTTPS_SCHEME) &&                    curi.getUURI().getScheme().equals(HTTP_SCHEME))) {                method.setRequestHeader(REFERER, via);            }        }                if(!curi.isPrerequisite()) {            setConditionalGetHeader(curi, method, ATTR_SEND_IF_MODIFIED_SINCE,                     CoreAttributeConstants.A_LAST_MODIFIED_HEADER, "If-Modified-Since");            setConditionalGetHeader(curi, method, ATTR_SEND_IF_NONE_MATCH,                     CoreAttributeConstants.A_ETAG_HEADER, "If-None-Match");        }                // TODO: What happens if below method adds a header already        // added above: e.g. Connection, Range, or Referer?        setAcceptHeaders(curi, method);                return configureProxy(curi);    }    /**     * Set the given conditional-GET header, if the setting is enabled and     * a suitable value is available in the URI history.      
* @param curi source CrawlURI     * @param method HTTP operation pending     * @param setting true/false enablement setting name to consult     * @param sourceHeader header to consult in URI history     * @param targetHeader header to set if possible     */    protected void setConditionalGetHeader(CrawlURI curi, HttpMethod method,             String setting, String sourceHeader, String targetHeader) {        if(((Boolean)getUncheckedAttribute(curi,setting))) {            try {                String previous = curi.getAList().getAListArray(                        A_FETCH_HISTORY)[0].getString(sourceHeader);                if(previous!=null) {                    method.setRequestHeader(targetHeader, previous);                }            } catch (RuntimeException e) {                // for absent key, bad index, etc. just do nothing            }        }    }    /**     * Setup proxy, based on attributes in CrawlURI and settings,      * for this CrawlURI only.      * @return HostConfiguration customized as necessary, or null if no     * customization required     */    private HostConfiguration configureProxy(CrawlURI curi) {        String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);        int port = -1;         if(proxy.length()==0) {            proxy = null;         } else {            String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);            port = portString.length()>0 ? 
Integer.parseInt(portString) : -1;         }        HostConfiguration config = this.http.getHostConfiguration();        if(config.getProxyHost() == proxy && config.getProxyPort() == port) {            // no change            return null;         }        if (proxy != null && proxy.equals(config.getProxyHost())                 && config.getProxyPort() == port) {            // no change            return null;         }        config = new HostConfiguration(config); // copy of config        config.setProxy(proxy,port);        return config;     }    /**     * Get a value either from inside the CrawlURI instance, or from      * settings (module attributes).      *      * @param curi CrawlURI to consult     * @param key key to lookup     * @return value from either CrawlURI (preferred) or settings     */    protected Object getAttributeEither(CrawlURI curi, String key) {        Object obj = curi!=null ? curi.getObject(key) : null;        if(obj==null) {            obj = getUncheckedAttribute(curi, key);        }        return obj;    }    /**     * Add credentials if any to passed <code>method</code>.     *     * Do credential handling.  Credentials are in two places.  1. Credentials     * that succeeded are added to the CrawlServer (Or rather, avatars for     * credentials are whats added because its not safe to keep around     * references to credentials).  2. Credentials to be tried are in the curi.     * Returns true if found credentials to be tried.     *     * @param curi Current CrawlURI.     * @param method The method to add to.     * @return True if prepopulated <code>method</code> with credentials AND the     * credentials came from the <code>curi</code>, not from the CrawlServer.     * The former is  special in that if the <code>curi</curi> credentials     * succeed, then the caller needs to promote them from the CrawlURI to the     * CrawlServer so they are available for all subsequent CrawlURIs on this     * server.     
*/    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {        // First look at the server avatars. Add any that are to be volunteered        // on every request (e.g. RFC2617 credentials).  Every time creds will        // return true when we call 'isEveryTime().        CrawlServer server =            getController().getServerCache().getServerFor(curi);        if (server.hasCredentialAvatars()) {            Set avatars = server.getCredentialAvatars();            for (Iterator i = avatars.iterator(); i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                Credential c = ca.getCredential(getSettingsHandler(), curi);                if (c.isEveryTime()) {                    c.populate(curi, this.http, method, ca.getPayload());                }            }        }        boolean result = false;        // Now look in the curi.  The Curi will have credentials loaded either        // by the handle401 method if its a rfc2617 or it'll have been set into        // the curi by the preconditionenforcer as this login uri came through.        if (curi.hasCredentialAvatars()) {            Set avatars = curi.getCredentialAvatars();            for (Iterator i = avatars.iterator(); i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                Credential c = ca.getCredential(getSettingsHandler(), curi);                if (c.populate(curi, this.http, method, ca.getPayload())) {                    result = true;                }            }        }        return result;    }    /**     * Promote successful credential to the server.     *     * @param curi CrawlURI whose credentials we are to promote.     
*/    private void promoteCredentials(final CrawlURI curi) {        if (!curi.hasCredentialAvatars()) {            logger.severe("No credentials to promote when there should be " +                curi);        } else {            Set avatars = curi.getCredentialAvatars();            for (Iterator i = avatars.iterator(); i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                curi.removeCredentialAvatar(ca);                // The server to attach too may not be the server that hosts

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -