📄 fetchhttp.java
字号:
"File to preload cookies from", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES, "When crawl finishes save cookies to this file", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_TRUST, "SSL certificate trust level. Range is from the default 'open'" + " (trust all certs including expired, selfsigned, and those for" + " which we do not have a CA) through 'loose' (trust all valid" + " certificates including selfsigned), 'normal' (all valid" + " certificates not including selfsigned) to 'strict' (Cert is" + " valid and DN must match servername)", ConfigurableX509TrustManager.DEFAULT, ConfigurableX509TrustManager.LEVELS_AS_ARRAY)); e.setOverrideable(false); e.setExpertSetting(true); e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS, "Accept Headers to include in each request. Each must be the" + " complete header, e.g., 'Accept-Language: en'")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST, "Proxy host IP (set only if needed).", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT, "Proxy port (set only if needed)", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING, "The character encoding to use for files that do not have one" + " specified in the HTTP response headers. Default: " + DEFAULT_CONTENT_CHARSET + ".", DEFAULT_CONTENT_CHARSET)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SHA1_CONTENT, "Whether or not to perform an on-the-fly SHA1 hash of" + "retrieved content-bodies.", DEFAULT_SHA1_CONTENT)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE, "Send 'Connection: close' header with every request.", new Boolean(true))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER, "Send 'Referer' header with every request.\n" + "The 'Referer' header contans the location the crawler came " + " from, " + "the page the current URI was discovered in. The 'Referer' " + "usually is " + "logged on the remote server and can be of assistance to " + "webmasters trying to figure how a crawler got to a " + "particular area on a site.", new Boolean(true))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE, "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES + ") on document size.\n" + "Be polite to the HTTP servers and send the 'Range' header," + "stating that you are only interested in the first n bytes. " + "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " + "Sending the 'Range' header results in a " + "'206 Partial Content' status response, which is better than " + "just cutting the response mid-download. On rare occasion, " + " sending 'Range' will " + "generate '416 Request Range Not Satisfiable' response.", new Boolean(false))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS, "Local IP address or hostname to use when making connections " + "(binding sockets). When not specified, uses default local" + "address(es).", "")); e.setExpertSetting(true); } protected void innerProcess(final CrawlURI curi) throws InterruptedException { if (!canFetch(curi)) { // Cannot fetch this, due to protocol, retries, or other problems return; } this.curisHandled++; // Note begin time curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis()); // Get a reference to the HttpRecorder that is set into this ToeThread. HttpRecorder rec = HttpRecorder.getHttpRecorder(); // Shall we get a digest on the content downloaded? boolean sha1Content = ((Boolean)getUncheckedAttribute(curi, ATTR_SHA1_CONTENT)).booleanValue(); if(sha1Content) { rec.getRecordedInput().setSha1Digest(); } else { // clear rec.getRecordedInput().setDigest(null); } // Below we do two inner classes that add check of midfetch // filters just as we're about to receive the response body. String curiString = curi.getUURI().toString(); HttpMethodBase method = null; if (curi.isPost()) { method = new HttpRecorderPostMethod(curiString, rec) { protected void readResponseBody(HttpState state, HttpConnection conn) throws IOException, HttpException { addResponseContent(this, curi); if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) { doAbort(curi, this, MIDFETCH_ABORT_LOG); } else { super.readResponseBody(state, conn); } } }; } else { method = new HttpRecorderGetMethod(curiString, rec) { protected void readResponseBody(HttpState state, HttpConnection conn) throws IOException, HttpException { addResponseContent(this, curi); if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) { doAbort(curi, this, MIDFETCH_ABORT_LOG); } else { super.readResponseBody(state, conn); } } }; } HostConfiguration customConfigOrNull = configureMethod(curi, method); // Set httpRecorder into curi. Subsequent code both here and later // in extractors expects to find the HttpRecorder in the CrawlURI. curi.setHttpRecorder(rec); // Populate credentials. Set config so auth. is not automatic. boolean addedCredentials = populateCredentials(curi, method); method.setDoAuthentication(addedCredentials); try { this.http.executeMethod(customConfigOrNull, method); } catch (RecorderTooMuchHeaderException ex) { // when too much header material, abort like other truncations doAbort(curi, method, HEADER_TRUNC); } catch (IOException e) { failedExecuteCleanup(method, curi, e); return; } catch (ArrayIndexOutOfBoundsException e) { // For weird windows-only ArrayIndex exceptions in native // code... see // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356 // treating as if it were an IOException failedExecuteCleanup(method, curi, e); return; } // set softMax on bytes to get (if implied by content-length) long softMax = method.getResponseContentLength(); // set hardMax on bytes (if set by operator) long hardMax = getMaxLength(curi); // Get max fetch rate (bytes/ms). It comes in in KB/sec, which // requires nothing to normalize. int maxFetchRate = getMaxFetchRate(curi); try { if (!method.isAborted()) { // Force read-to-end, so that any socket hangs occur here, // not in later modules. rec.getRecordedInput().readFullyOrUntil(softMax, hardMax, 1000 * getTimeout(curi), maxFetchRate); } } catch (RecorderTimeoutException ex) { doAbort(curi, method, TIMER_TRUNC); } catch (RecorderLengthExceededException ex) { doAbort(curi, method, LENGTH_TRUNC); } catch (IOException e) { cleanup(curi, e, "readFully", S_CONNECT_LOST); return; } catch (ArrayIndexOutOfBoundsException e) { // For weird windows-only ArrayIndex exceptions from native code // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356 // treating as if it were an IOException cleanup(curi, e, "readFully", S_CONNECT_LOST); return; } finally { // ensure recording has stopped rec.closeRecorders(); if (!method.isAborted()) { method.releaseConnection(); } // Note completion time curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis()); // Set the response charset into the HttpRecord if available. setCharacterEncoding(rec, method); curi.setContentSize(rec.getRecordedInput().getSize()); } curi.setContentDigest(SHA1, rec.getRecordedInput().getDigestValue()); if (logger.isLoggable(Level.INFO)) { logger.info((curi.isPost()? "POST": "GET") + " " + curi.getUURI().toString() + " " + method.getStatusCode() + " " + rec.getRecordedInput().getSize() + " " + curi.getContentType()); } if (curi.isSuccess() && addedCredentials) { // Promote the credentials from the CrawlURI to the CrawlServer // so they are available for all subsequent CrawlURIs on this // server. promoteCredentials(curi); if (logger.isLoggable(Level.FINE)) { // Print out the cookie. Might help with the debugging. Header setCookie = method.getResponseHeader("set-cookie"); if (setCookie != null) { logger.fine(setCookie.toString().trim()); } } } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) { // 401 is not 'success'. handle401(method, curi); } if (rec.getRecordedInput().isOpen()) { logger.severe(curi.toString() + " RIS still open. Should have" + " been closed by method release: " + Thread.currentThread().getName()); try { rec.getRecordedInput().close(); } catch (IOException e) { logger.log(Level.SEVERE,"second-chance RIS close failed",e); } } } protected void doAbort(CrawlURI curi, HttpMethod method, String annotation) { curi.addAnnotation(annotation); curi.getHttpRecorder().close(); method.abort(); } protected boolean checkMidfetchAbort(CrawlURI curi, HttpRecorderMethod method, HttpConnection conn) { if (curi.isPrerequisite() || filtersAccept(midfetchfilters, curi)) { return false; } method.markContentBegin(conn); return true; } /** * This method populates <code>curi</code> with response status and * content type. * @param curi CrawlURI to populate. * @param method Method to get response status and headers from. */ protected void addResponseContent (HttpMethod method, CrawlURI curi) { curi.setFetchStatus(method.getStatusCode()); Header ct = method.getResponseHeader("content-type"); curi.setContentType((ct == null)? null: ct.getValue()); // Save method into curi too. Midfetch filters may want to leverage // info in here. curi.putObject(A_HTTP_TRANSACTION, method); } /** * Set the character encoding based on the result headers or default. * * The HttpClient returns its own default encoding ("ISO-8859-1") if one * isn't specified in the Content-Type response header. We give the user * the option of overriding this, so we need to detect the case where the * default is returned. * * Now, it may well be the case that the default returned by HttpClient * and the default defined by the user are the same. * * @param rec Recorder for this request. * @param method Method used for the request. */ private void setCharacterEncoding(final HttpRecorder rec, final HttpMethod method) { String encoding = null; try { encoding = ((HttpMethodBase) method).getResponseCharSet(); if (encoding == null || encoding.equals(DEFAULT_CONTENT_CHARSET)) { encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING); } } catch (Exception e) { logger.warning("Failed get default encoding: " + e.getLocalizedMessage()); } rec.setCharacterEncoding(encoding); } /** * Cleanup after a failed method execute. * @param curi CrawlURI we failed on. * @param method Method we failed on. * @param exception Exception we failed with. */ private void failedExecuteCleanup(final HttpMethod method, final CrawlURI curi, final Exception exception) { cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED); method.releaseConnection(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -