📄 fetchhttp.java
字号:
} }

    /**
     * Populates <code>curi</code> with the response status and content type,
     * and stashes the method itself on the <code>curi</code>.
     *
     * @param method Method to get response status and headers from.
     * @param curi CrawlURI to populate.
     */
    protected void addResponseContent(HttpMethod method, CrawlURI curi) {
        curi.setFetchStatus(method.getStatusCode());
        Header ct = method.getResponseHeader("content-type");
        // A missing content-type header is recorded as a null content type.
        curi.setContentType((ct == null) ? null : ct.getValue());
        // Save method into curi too. Midfetch filters may want to leverage
        // info in here.
        curi.putObject(A_HTTP_TRANSACTION, method);
    }

    /**
     * Set the character encoding based on the result headers or default.
     *
     * The HttpClient returns its own default encoding ("ISO-8859-1") if one
     * isn't specified in the Content-Type response header. We give the user
     * the option of overriding this, so we need to detect the case where the
     * default is returned.
     *
     * Now, it may well be the case that the default returned by HttpClient
     * and the default defined by the user are the same.
     *
     * If reading either value fails, a warning is logged and the recorder is
     * still given whatever encoding was obtained up to that point (possibly
     * <code>null</code>).
     *
     * @param rec Recorder for this request.
     * @param method Method used for the request.
     */
    private void setCharacterEncoding(final HttpRecorder rec,
            final HttpMethod method) {
        String encoding = null;
        try {
            encoding = ((HttpMethodBase) method).getResponseCharSet();
            // Treat "no charset" and "HttpClient's own default" alike: both
            // mean the response did not name one, so use the configured
            // default instead.
            if (encoding == null
                    || encoding.equals(DEFAULT_CONTENT_CHARSET)) {
                encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
            }
        } catch (Exception e) {
            logger.warning("Failed get default encoding: " +
                e.getLocalizedMessage());
        }
        rec.setCharacterEncoding(encoding);
    }

    /**
     * Cleanup after a failed method execute: records the failure on the
     * <code>curi</code> with status <code>S_CONNECT_FAILED</code>, then
     * releases the method's connection.
     *
     * @param method Method we failed on.
     * @param curi CrawlURI we failed on.
     * @param exception Exception we failed with.
     */
    private void failedExecuteCleanup(final HttpMethod method,
            final CrawlURI curi, final Exception exception) {
        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
        method.releaseConnection();
    }

    /**
     * Record a failure on the <code>curi</code>: adds a localized error,
     * sets the fetch status and closes the curi's HttpRecorder.
     *
     * @param curi CrawlURI we failed on.
     * @param exception Exception we failed with.
     * @param message Message to log with failure.
     * @param status Status to set on the fetch.
     */
    private void cleanup(final CrawlURI curi, final Exception exception,
            final String message, final int status) {
        curi.addLocalizedError(this.getName(), exception, message);
        curi.setFetchStatus(status);
        curi.getHttpRecorder().close();
    }

    /**
     * Can this processor fetch the given CrawlURI. May set a fetch
     * status if this processor would usually handle the CrawlURI,
     * but cannot in this instance.
     *
     * @param curi CrawlURI to check.
     * @return True if processor can fetch.
     */
    private boolean canFetch(CrawlURI curi) {
        if (curi.getFetchStatus() < 0) {
            // already marked as errored, this pass through
            // skip to end
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return false;
        }
        String scheme = curi.getUURI().getScheme();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            // handles only plain http and https
            return false;
        }
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        // make sure the dns lookup succeeded
        if (host.getIP() == null && host.hasBeenLookedUp()) {
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            return false;
        }
        return true;
    }

    /**
     * Configure the HttpMethod setting options and headers.
     *
     * @param curi CrawlURI from which we pull configuration.
     * @param method The Method to configure.
     * @return the result of {@link #configureProxy(CrawlURI)}: a
     * HostConfiguration customized for this <code>curi</code>, or null if no
     * customization is required.
     */
    protected HostConfiguration configureMethod(CrawlURI curi,
            HttpMethod method) {
        // Don't auto-follow redirects
        method.setFollowRedirects(false);

//        // set soTimeout
//        method.getParams().setSoTimeout(
//                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
//                        .intValue());

        // Set cookie policy.
        method.getParams().setCookiePolicy(
            (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).
                booleanValue())?
                    CookiePolicy.IGNORE_COOKIES:
                CookiePolicy.BROWSER_COMPATIBILITY);

        // Use only HTTP/1.0 (to avoid receiving chunked responses)
        method.getParams().setVersion(HttpVersion.HTTP_1_0);

        CrawlOrder order = getSettingsHandler().getOrder();
        // A user agent set on the curi takes precedence over the order's.
        String userAgent = curi.getUserAgent();
        if (userAgent == null) {
            userAgent = order.getUserAgent(curi);
        }
        method.setRequestHeader("User-Agent", userAgent);
        method.setRequestHeader("From", order.getFrom(curi));

        // Set retry handler.
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
            new HeritrixHttpMethodRetryHandler());

        final long maxLength = getMaxLength(curi);
        if (maxLength > 0
                && ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).
                    booleanValue()) {
            // Range end is inclusive, hence maxLength - 1.
            method.addRequestHeader(RANGE,
                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
        }

        if (((Boolean)getUncheckedAttribute(curi,
                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
        }

        if (((Boolean)getUncheckedAttribute(curi,
                ATTR_SEND_REFERER)).booleanValue()) {
            // RFC2616 says no referer header if referer is https and the url
            // is not
            String via = curi.flattenVia();
            if (via != null && via.length() > 0
                    && !(via.startsWith(HTTPS_SCHEME)
                        && curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
                method.setRequestHeader(REFERER, via);
            }
        }

        if (!curi.isPrerequisite()) {
            setConditionalGetHeader(curi, method, ATTR_SEND_IF_MODIFIED_SINCE,
                CoreAttributeConstants.A_LAST_MODIFIED_HEADER,
                "If-Modified-Since");
            setConditionalGetHeader(curi, method, ATTR_SEND_IF_NONE_MATCH,
                CoreAttributeConstants.A_ETAG_HEADER, "If-None-Match");
        }

        // TODO: What happens if below method adds a header already
        // added above: e.g. Connection, Range, or Referer?
        setAcceptHeaders(curi, method);

        return configureProxy(curi);
    }

    /**
     * Set the given conditional-GET header, if the setting is enabled and
     * a suitable value is available in the URI history.
     *
     * @param curi source CrawlURI
     * @param method HTTP operation pending
     * @param setting true/false enablement setting name to consult
     * @param sourceHeader header to consult in URI history
     * @param targetHeader header to set if possible
     */
    protected void setConditionalGetHeader(CrawlURI curi, HttpMethod method,
            String setting, String sourceHeader, String targetHeader) {
        if (((Boolean)getUncheckedAttribute(curi,setting))) {
            try {
                // Only the first (index 0) fetch-history entry is consulted.
                String previous = curi.getAList().getAListArray(
                    A_FETCH_HISTORY)[0].getString(sourceHeader);
                if (previous != null) {
                    method.setRequestHeader(targetHeader, previous);
                }
            } catch (RuntimeException e) {
                // for absent key, bad index, etc. just do nothing
            }
        }
    }

    /**
     * Setup proxy, based on attributes in CrawlURI and settings,
     * for this CrawlURI only.
     *
     * An empty (or missing) proxy host means "no proxy"; an empty (or
     * missing) proxy port is treated as -1.
     *
     * @param curi CrawlURI to consult for proxy host/port overrides.
     * @return HostConfiguration customized as necessary, or null if no
     * customization required
     * @throws NumberFormatException if a proxy host is given and the proxy
     * port is non-empty but not a parseable integer
     */
    private HostConfiguration configureProxy(CrawlURI curi) {
        String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
        int port = -1;
        if (proxy == null || proxy.length() == 0) {
            proxy = null;
        } else {
            String portString =
                (String) getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
            port = (portString != null && portString.length() > 0)
                ? Integer.parseInt(portString) : -1;
        }
        HostConfiguration config = this.http.getHostConfiguration();
        // Compare hosts by value (null-safe) rather than by reference.
        String currentHost = config.getProxyHost();
        boolean sameHost = (proxy == null)
            ? currentHost == null
            : proxy.equals(currentHost);
        if (sameHost && config.getProxyPort() == port) {
            // no change
            return null;
        }
        // Work on a copy so the shared client configuration is untouched.
        config = new HostConfiguration(config);
        config.setProxy(proxy, port);
        return config;
    }

    /**
     * Get a value either from inside the CrawlURI instance, or from
     * settings (module attributes).
     *
     * @param curi CrawlURI to consult
     * @param key key to lookup
     * @return value from either CrawlURI (preferred) or settings
     */
    protected Object getAttributeEither(CrawlURI curi, String key) {
        Object obj = curi != null ? curi.getObject(key) : null;
        if (obj == null) {
            obj = getUncheckedAttribute(curi, key);
        }
        return obj;
    }

    /**
     * Add credentials if any to passed <code>method</code>.
     *
     * Do credential handling. Credentials are in two places. 1. Credentials
     * that succeeded are added to the CrawlServer (Or rather, avatars for
     * credentials are whats added because its not safe to keep around
     * references to credentials). 2. Credentials to be tried are in the curi.
     * Returns true if found credentials to be tried.
     *
     * @param curi Current CrawlURI.
     * @param method The method to add to.
     * @return True if prepopulated <code>method</code> with credentials AND the
     * credentials came from the <code>curi</code>, not from the CrawlServer.
     * The former is special in that if the <code>curi</code> credentials
     * succeed, then the caller needs to promote them from the CrawlURI to the
     * CrawlServer so they are available for all subsequent CrawlURIs on this
     * server.
     */
    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
        // First look at the server avatars. Add any that are to be volunteered
        // on every request (e.g. RFC2617 credentials). Every time creds will
        // return true when we call 'isEveryTime().
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (server.hasCredentialAvatars()) {
            Set avatars = server.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar)i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.isEveryTime()) {
                    c.populate(curi, this.http, method, ca.getPayload());
                }
            }
        }

        boolean result = false;

        // Now look in the curi. The Curi will have credentials loaded either
        // by the handle401 method if its a rfc2617 or it'll have been set into
        // the curi by the preconditionenforcer as this login uri came through.
        if (curi.hasCredentialAvatars()) {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar)i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.populate(curi, this.http, method, ca.getPayload())) {
                    result = true;
                }
            }
        }

        return result;
    }

    /**
     * Promote successful credential to the server.
     *
     * @param curi CrawlURI whose credentials we are to promote.
     */
    private void promoteCredentials(final CrawlURI curi) {
        if (!curi.hasCredentialAvatars()) {
            logger.severe("No credentials to promote when there should be " + curi);
        } else {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar)i.next();
                curi.removeCredentialAvatar(ca);
                // The server to attach too may not be the server that hosts
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -