⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
        /**     * Cleanup after a failed method execute.     * @param curi CrawlURI we failed on.     * @param exception Exception we failed with.     * @param message Message to log with failure.     * @param status Status to set on the fetch.     */    private void cleanup(final CrawlURI curi, final Exception exception,            final String message, final int status) {        curi.addLocalizedError(this.getName(), exception, message);        curi.setFetchStatus(status);        curi.getHttpRecorder().close();    }    /**     * Can this processor fetch the given CrawlURI. May set a fetch     * status if this processor would usually handle the CrawlURI,     * but cannot in this instance.     *     * @param curi     * @return True if processor can fetch.     */    private boolean canFetch(CrawlURI curi) {        if(curi.getFetchStatus()<0) {            // already marked as errored, this pass through            // skip to end            curi.skipToProcessorChain(getController().getPostprocessorChain());            return false;                     }        String scheme = curi.getUURI().getScheme();         if (!(scheme.equals("http") || scheme.equals("https"))) {             // handles only plain http and https             return false;         }         CrawlHost host = getController().getServerCache().getHostFor(curi);         // make sure the dns lookup succeeded         if (host.getIP() == null && host.hasBeenLookedUp()) {             curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);             return false;         }        return true;    }    /**     * Configure the HttpMethod setting options and headers.     *     * @param curi CrawlURI from which we pull configuration.     * @param method The Method to configure.     */    protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {        // Don't auto-follow redirects        method.setFollowRedirects(false);        //        // set soTimeout//        method.getParams().setSoTimeout(//                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))//                        .intValue());                // Set cookie policy.        method.getParams().setCookiePolicy(            (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).                booleanValue())?                    CookiePolicy.IGNORE_COOKIES:                CookiePolicy.BROWSER_COMPATIBILITY);        // Use only HTTP/1.0 (to avoid receiving chunked responses)        method.getParams().setVersion(HttpVersion.HTTP_1_0);        CrawlOrder order = getSettingsHandler().getOrder();        String userAgent = curi.getUserAgent();        if (userAgent == null) {            userAgent = order.getUserAgent(curi);        }        method.setRequestHeader("User-Agent", userAgent);        method.setRequestHeader("From", order.getFrom(curi));                // Set retry handler.        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,            new HeritrixHttpMethodRetryHandler());                final long maxLength = getMaxLength(curi);        if(maxLength > 0 &&                ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).                    booleanValue()) {            method.addRequestHeader(RANGE,                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));        }                if (((Boolean)getUncheckedAttribute(curi,                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);        }                if (((Boolean)getUncheckedAttribute(curi,                ATTR_SEND_REFERER)).booleanValue()) {            // RFC2616 says no referer header if referer is https and the url            // is not            String via = curi.flattenVia();            if (via != null && via.length() > 0 &&                !(via.startsWith(HTTPS_SCHEME) &&                    curi.getUURI().getScheme().equals(HTTP_SCHEME))) {                method.setRequestHeader(REFERER, via);            }        }                // TODO: What happens if below method adds a header already        // added above: e.g. Connection, Range, or Referer?        setAcceptHeaders(curi, method);                return configureProxy(curi);    }    /**     * Setup proxy, based on attributes in CrawlURI and settings,      * for this CrawlURI only.      * @return HostConfiguration customized as necessary, or null if no     * customization required     */    private HostConfiguration configureProxy(CrawlURI curi) {        String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);        int port = -1;         if(proxy.length()==0) {            proxy = null;         } else {            String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);            port = portString.length()>0 ? Integer.parseInt(portString) : -1;         }        HostConfiguration config = this.http.getHostConfiguration();        if(config.getProxyHost() == proxy && config.getProxyPort() == port) {            // no change            return null;         }        if (proxy != null && proxy.equals(config.getProxyHost())                 && config.getProxyPort() == port) {            // no change            return null;         }        config = new HostConfiguration(config); // copy of config        config.setProxy(proxy,port);        return config;     }    /**     * Get a value either from inside the CrawlURI instance, or from      * settings (module attributes).      *      * @param curi CrawlURI to consult     * @param key key to lookup     * @return value from either CrawlURI (preferred) or settings     */    protected Object getAttributeEither(CrawlURI curi, String key) {        Object obj = curi!=null ? curi.getObject(key) : null;        if(obj==null) {            obj = getUncheckedAttribute(curi, key);        }        return obj;    }    /**     * Add credentials if any to passed <code>method</code>.     *     * Do credential handling.  Credentials are in two places.  1. Credentials     * that succeeded are added to the CrawlServer (Or rather, avatars for     * credentials are whats added because its not safe to keep around     * references to credentials).  2. Credentials to be tried are in the curi.     * Returns true if found credentials to be tried.     *     * @param curi Current CrawlURI.     * @param method The method to add to.     * @return True if prepopulated <code>method</code> with credentials AND the     * credentials came from the <code>curi</code>, not from the CrawlServer.     * The former is  special in that if the <code>curi</curi> credentials     * succeed, then the caller needs to promote them from the CrawlURI to the     * CrawlServer so they are available for all subsequent CrawlURIs on this     * server.     */    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {        // First look at the server avatars. Add any that are to be volunteered        // on every request (e.g. RFC2617 credentials).  Every time creds will        // return true when we call 'isEveryTime().        CrawlServer server =            getController().getServerCache().getServerFor(curi);        if (server.hasCredentialAvatars()) {            Set avatars = server.getCredentialAvatars();            for (Iterator i = avatars.iterator(); i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                Credential c = ca.getCredential(getSettingsHandler(), curi);                if (c.isEveryTime()) {                    c.populate(curi, this.http, method, ca.getPayload());                }            }        }        boolean result = false;        // Now look in the curi.  The Curi will have credentials loaded either        // by the handle401 method if its a rfc2617 or it'll have been set into        // the curi by the preconditionenforcer as this login uri came through.        if (curi.hasCredentialAvatars()) {            Set avatars = curi.getCredentialAvatars();            for (Iterator i = avatars.iterator(); i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                Credential c = ca.getCredential(getSettingsHandler(), curi);                if (c.populate(curi, this.http, method, ca.getPayload())) {                    result = true;                }            }        }        return result;    }    /**     * Promote successful credential to the server.     *     * @param curi CrawlURI whose credentials we are to promote.     */    private void promoteCredentials(final CrawlURI curi) {        if (!curi.hasCredentialAvatars()) {            logger.severe("No credentials to promote when there should be " +                curi);        } else {            Set avatars = curi.getCredentialAvatars();            for (Iterator i = avatars.iterator(); i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                curi.removeCredentialAvatar(ca);                // The server to attach too may not be the server that hosts                // this passed curi.  It might be of another subdomain.                // The avatar needs to be added to the server that is dependent                // on this precondition.  Find it by name.  Get the name from                // the credential this avatar represents.                Credential c = ca.getCredential(getSettingsHandler(), curi);                String cd = null;                try {                    cd = c.getCredentialDomain(curi);                }                catch (AttributeNotFoundException e) {                    logger.severe("Failed to get cred domain for " + curi +                        " for " + ca + ": " + e.getMessage());                }                if (cd != null) {                    CrawlServer cs                        = getController().getServerCache().getServerFor(cd);                    if (cs != null) {                        cs.addCredentialAvatar(ca);                    }                }            }        }    }    /**     * Server is looking for basic/digest auth credentials (RFC2617). If we have     * any, put them into the CrawlURI and have it come around again. Presence     * of the credential serves as flag to frontier to requeue promptly. If we     * already tried this domain and still got a 401, then our credentials are     * bad. Remove them and let this curi die.     *     * @param method Method that got a 401.     * @param curi CrawlURI that got a 401.     */    protected void handle401(final HttpMethod method, final CrawlURI curi) {        AuthScheme authscheme = getAuthScheme(method, curi);        if (authscheme == null) {        	return;        }        String realm = authscheme.getRealm();                // Look to see if this curi had rfc2617 avatars loaded.  If so, are        // any of them for this realm?  If so, then the credential failed        // if we got a 401 and it should be let die a natural 401 death.        Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),        		curi, Rfc2617Credential.class);        Rfc2617Credential extant = Rfc2617Credential.		    getByRealm(curiRfc2617Credentials, realm, curi);        if (extant != null) {        	// Then, already tried this credential.  Remove ANY rfc2617        	// credential since presence of a rfc2617 credential serves        	// as flag to frontier to requeue this curi and let the curi        	// die a natural death.        	extant.detachAll(curi);        	logger.warning("Auth failed (401) though supplied realm " +        			realm + " to " + curi.toString());        } else {        	// Look see if we have a credential that corresponds to this        	// realm in credential store.  Filter by type and credential        	// domain.  If not, let this curi die. Else, add it to the        	// curi and let it come around again. Add in the AuthScheme        	// we got too.  Its needed when we go to run the Auth on        	// second time around.        	CredentialStore cs =        		CredentialStore.getCredentialStore(getSettingsHandler());        	if (cs == null) {        		logger.severe("No credential store for " + curi);        	} else {                CrawlServer server = getController().getServerCache().                    getServerFor(curi);        		Set storeRfc2617Credentials = cs.subset(curi,        		    Rfc2617Credential.class, server.getName());        		if (storeRfc2617Credentials == null ||        				storeRfc2617Credentials.size() <= 0) {        			logger.info("No rfc2617 credentials for " + curi);        		} else {        			Rfc2617Credential found = Rfc2617Credential.					    getByRealm(storeRfc2617Credentials, realm, curi);        			if (found == null) {        				logger.info("No rfc2617 credentials for realm " +        						realm + " in " + curi);        			} else {        				found.attach(curi, authscheme.getRealm());        				logger.info("Found credential for realm " + realm +        				    " in store for " + curi.toString());

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -