📄 fetchhttp.java
字号:
/** * Cleanup after a failed method execute. * @param curi CrawlURI we failed on. * @param exception Exception we failed with. * @param message Message to log with failure. * @param status Status to set on the fetch. */ private void cleanup(final CrawlURI curi, final Exception exception, final String message, final int status) { curi.addLocalizedError(this.getName(), exception, message); curi.setFetchStatus(status); curi.getHttpRecorder().close(); } /** * Can this processor fetch the given CrawlURI. May set a fetch * status if this processor would usually handle the CrawlURI, * but cannot in this instance. * * @param curi * @return True if processor can fetch. */ private boolean canFetch(CrawlURI curi) { if(curi.getFetchStatus()<0) { // already marked as errored, this pass through // skip to end curi.skipToProcessorChain(getController().getPostprocessorChain()); return false; } String scheme = curi.getUURI().getScheme(); if (!(scheme.equals("http") || scheme.equals("https"))) { // handles only plain http and https return false; } CrawlHost host = getController().getServerCache().getHostFor(curi); // make sure the dns lookup succeeded if (host.getIP() == null && host.hasBeenLookedUp()) { curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE); return false; } return true; } /** * Configure the HttpMethod setting options and headers. * * @param curi CrawlURI from which we pull configuration. * @param method The Method to configure. */ protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) { // Don't auto-follow redirects method.setFollowRedirects(false); // // set soTimeout// method.getParams().setSoTimeout(// ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))// .intValue()); // Set cookie policy. method.getParams().setCookiePolicy( (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)). booleanValue())? CookiePolicy.IGNORE_COOKIES: CookiePolicy.BROWSER_COMPATIBILITY); // Use only HTTP/1.0 (to avoid receiving chunked responses) method.getParams().setVersion(HttpVersion.HTTP_1_0); CrawlOrder order = getSettingsHandler().getOrder(); String userAgent = curi.getUserAgent(); if (userAgent == null) { userAgent = order.getUserAgent(curi); } method.setRequestHeader("User-Agent", userAgent); method.setRequestHeader("From", order.getFrom(curi)); // Set retry handler. method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new HeritrixHttpMethodRetryHandler()); final long maxLength = getMaxLength(curi); if(maxLength > 0 && ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)). booleanValue()) { method.addRequestHeader(RANGE, RANGE_PREFIX.concat(Long.toString(maxLength - 1))); } if (((Boolean)getUncheckedAttribute(curi, ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) { method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE); } if (((Boolean)getUncheckedAttribute(curi, ATTR_SEND_REFERER)).booleanValue()) { // RFC2616 says no referer header if referer is https and the url // is not String via = curi.flattenVia(); if (via != null && via.length() > 0 && !(via.startsWith(HTTPS_SCHEME) && curi.getUURI().getScheme().equals(HTTP_SCHEME))) { method.setRequestHeader(REFERER, via); } } // TODO: What happens if below method adds a header already // added above: e.g. Connection, Range, or Referer? setAcceptHeaders(curi, method); return configureProxy(curi); } /** * Setup proxy, based on attributes in CrawlURI and settings, * for this CrawlURI only. * @return HostConfiguration customized as necessary, or null if no * customization required */ private HostConfiguration configureProxy(CrawlURI curi) { String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST); int port = -1; if(proxy.length()==0) { proxy = null; } else { String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT); port = portString.length()>0 ? Integer.parseInt(portString) : -1; } HostConfiguration config = this.http.getHostConfiguration(); if(config.getProxyHost() == proxy && config.getProxyPort() == port) { // no change return null; } if (proxy != null && proxy.equals(config.getProxyHost()) && config.getProxyPort() == port) { // no change return null; } config = new HostConfiguration(config); // copy of config config.setProxy(proxy,port); return config; } /** * Get a value either from inside the CrawlURI instance, or from * settings (module attributes). * * @param curi CrawlURI to consult * @param key key to lookup * @return value from either CrawlURI (preferred) or settings */ protected Object getAttributeEither(CrawlURI curi, String key) { Object obj = curi!=null ? curi.getObject(key) : null; if(obj==null) { obj = getUncheckedAttribute(curi, key); } return obj; } /** * Add credentials if any to passed <code>method</code>. * * Do credential handling. Credentials are in two places. 1. Credentials * that succeeded are added to the CrawlServer (Or rather, avatars for * credentials are whats added because its not safe to keep around * references to credentials). 2. Credentials to be tried are in the curi. * Returns true if found credentials to be tried. * * @param curi Current CrawlURI. * @param method The method to add to. * @return True if prepopulated <code>method</code> with credentials AND the * credentials came from the <code>curi</code>, not from the CrawlServer. * The former is special in that if the <code>curi</curi> credentials * succeed, then the caller needs to promote them from the CrawlURI to the * CrawlServer so they are available for all subsequent CrawlURIs on this * server. */ private boolean populateCredentials(CrawlURI curi, HttpMethod method) { // First look at the server avatars. Add any that are to be volunteered // on every request (e.g. RFC2617 credentials). Every time creds will // return true when we call 'isEveryTime(). CrawlServer server = getController().getServerCache().getServerFor(curi); if (server.hasCredentialAvatars()) { Set avatars = server.getCredentialAvatars(); for (Iterator i = avatars.iterator(); i.hasNext();) { CredentialAvatar ca = (CredentialAvatar)i.next(); Credential c = ca.getCredential(getSettingsHandler(), curi); if (c.isEveryTime()) { c.populate(curi, this.http, method, ca.getPayload()); } } } boolean result = false; // Now look in the curi. The Curi will have credentials loaded either // by the handle401 method if its a rfc2617 or it'll have been set into // the curi by the preconditionenforcer as this login uri came through. if (curi.hasCredentialAvatars()) { Set avatars = curi.getCredentialAvatars(); for (Iterator i = avatars.iterator(); i.hasNext();) { CredentialAvatar ca = (CredentialAvatar)i.next(); Credential c = ca.getCredential(getSettingsHandler(), curi); if (c.populate(curi, this.http, method, ca.getPayload())) { result = true; } } } return result; } /** * Promote successful credential to the server. * * @param curi CrawlURI whose credentials we are to promote. */ private void promoteCredentials(final CrawlURI curi) { if (!curi.hasCredentialAvatars()) { logger.severe("No credentials to promote when there should be " + curi); } else { Set avatars = curi.getCredentialAvatars(); for (Iterator i = avatars.iterator(); i.hasNext();) { CredentialAvatar ca = (CredentialAvatar)i.next(); curi.removeCredentialAvatar(ca); // The server to attach too may not be the server that hosts // this passed curi. It might be of another subdomain. // The avatar needs to be added to the server that is dependent // on this precondition. Find it by name. Get the name from // the credential this avatar represents. Credential c = ca.getCredential(getSettingsHandler(), curi); String cd = null; try { cd = c.getCredentialDomain(curi); } catch (AttributeNotFoundException e) { logger.severe("Failed to get cred domain for " + curi + " for " + ca + ": " + e.getMessage()); } if (cd != null) { CrawlServer cs = getController().getServerCache().getServerFor(cd); if (cs != null) { cs.addCredentialAvatar(ca); } } } } } /** * Server is looking for basic/digest auth credentials (RFC2617). If we have * any, put them into the CrawlURI and have it come around again. Presence * of the credential serves as flag to frontier to requeue promptly. If we * already tried this domain and still got a 401, then our credentials are * bad. Remove them and let this curi die. * * @param method Method that got a 401. * @param curi CrawlURI that got a 401. */ protected void handle401(final HttpMethod method, final CrawlURI curi) { AuthScheme authscheme = getAuthScheme(method, curi); if (authscheme == null) { return; } String realm = authscheme.getRealm(); // Look to see if this curi had rfc2617 avatars loaded. If so, are // any of them for this realm? If so, then the credential failed // if we got a 401 and it should be let die a natural 401 death. Set curiRfc2617Credentials = getCredentials(getSettingsHandler(), curi, Rfc2617Credential.class); Rfc2617Credential extant = Rfc2617Credential. getByRealm(curiRfc2617Credentials, realm, curi); if (extant != null) { // Then, already tried this credential. Remove ANY rfc2617 // credential since presence of a rfc2617 credential serves // as flag to frontier to requeue this curi and let the curi // die a natural death. extant.detachAll(curi); logger.warning("Auth failed (401) though supplied realm " + realm + " to " + curi.toString()); } else { // Look see if we have a credential that corresponds to this // realm in credential store. Filter by type and credential // domain. If not, let this curi die. Else, add it to the // curi and let it come around again. Add in the AuthScheme // we got too. Its needed when we go to run the Auth on // second time around. CredentialStore cs = CredentialStore.getCredentialStore(getSettingsHandler()); if (cs == null) { logger.severe("No credential store for " + curi); } else { CrawlServer server = getController().getServerCache(). getServerFor(curi); Set storeRfc2617Credentials = cs.subset(curi, Rfc2617Credential.class, server.getName()); if (storeRfc2617Credentials == null || storeRfc2617Credentials.size() <= 0) { logger.info("No rfc2617 credentials for " + curi); } else { Rfc2617Credential found = Rfc2617Credential. getByRealm(storeRfc2617Credentials, realm, curi); if (found == null) { logger.info("No rfc2617 credentials for realm " + realm + " in " + curi); } else { found.attach(curi, authscheme.getRealm()); logger.info("Found credential for realm " + realm + " in store for " + curi.toString());
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -