📄 fetchhttp.java
字号:
"Disable cookie-handling.", DEFAULT_IGNORE_COOKIES)); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES, "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES, "File to preload cookies from", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES, "When crawl finishes save cookies to this file", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_TRUST, "SSL certificate trust level. Range is from the default 'open'" + " (trust all certs including expired, selfsigned, and those for" + " which we do not have a CA) through 'loose' (trust all valid" + " certificates including selfsigned), 'normal' (all valid" + " certificates not including selfsigned) to 'strict' (Cert is" + " valid and DN must match servername)", ConfigurableX509TrustManager.DEFAULT, ConfigurableX509TrustManager.LEVELS_AS_ARRAY)); e.setOverrideable(false); e.setExpertSetting(true); e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS, "Accept Headers to include in each request. Each must be the" + " complete header, e.g., 'Accept-Language: en'")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST, "Proxy host IP (set only if needed).", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT, "Proxy port (set only if needed)", "")); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING, "The character encoding to use for files that do not have one" + " specified in the HTTP response headers. Default: " + DEFAULT_CONTENT_CHARSET + ".", DEFAULT_CONTENT_CHARSET)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT, "Whether or not to perform an on-the-fly digest hash of" + " retrieved content-bodies.", DEFAULT_DIGEST_CONTENT)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_DIGEST_ALGORITHM, "Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest" + " hash of retrieved content-bodies.", DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_MODIFIED_SINCE, "Send 'If-Modified-Since' header, if previous 'Last-Modified' " + "fetch history information is available in URI history.", new Boolean(true))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_NONE_MATCH, "Send 'If-None-Match' header, if previous 'Etag' fetch " + "history information is available in URI history.", new Boolean(true))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE, "Send 'Connection: close' header with every request.", new Boolean(true))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER, "Send 'Referer' header with every request.\n" + "The 'Referer' header contans the location the crawler came " + " from, " + "the page the current URI was discovered in. The 'Referer' " + "usually is " + "logged on the remote server and can be of assistance to " + "webmasters trying to figure how a crawler got to a " + "particular area on a site.", new Boolean(true))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE, "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES + ") on document size.\n" + "Be polite to the HTTP servers and send the 'Range' header," + "stating that you are only interested in the first n bytes. " + "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " + "Sending the 'Range' header results in a " + "'206 Partial Content' status response, which is better than " + "just cutting the response mid-download. On rare occasion, " + " sending 'Range' will " + "generate '416 Request Range Not Satisfiable' response.", new Boolean(false))); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS, "Local IP address or hostname to use when making connections " + "(binding sockets). When not specified, uses default local" + "address(es).", "")); e.setExpertSetting(true); } protected void innerProcess(final CrawlURI curi) throws InterruptedException { if (!canFetch(curi)) { // Cannot fetch this, due to protocol, retries, or other problems return; } this.curisHandled++; // Note begin time curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis()); // Get a reference to the HttpRecorder that is set into this ToeThread. HttpRecorder rec = HttpRecorder.getHttpRecorder(); // Shall we get a digest on the content downloaded? boolean digestContent = ((Boolean)getUncheckedAttribute(curi, ATTR_DIGEST_CONTENT)).booleanValue(); String algorithm = null; if (digestContent) { algorithm = ((String)getUncheckedAttribute(curi, ATTR_DIGEST_ALGORITHM)); rec.getRecordedInput().setDigest(algorithm); } else { // clear rec.getRecordedInput().setDigest((MessageDigest)null); } // Below we do two inner classes that add check of midfetch // filters just as we're about to receive the response body. String curiString = curi.getUURI().toString(); HttpMethodBase method = null; if (curi.isPost()) { method = new HttpRecorderPostMethod(curiString, rec) { protected void readResponseBody(HttpState state, HttpConnection conn) throws IOException, HttpException { addResponseContent(this, curi); if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) { doAbort(curi, this, MIDFETCH_ABORT_LOG); } else { super.readResponseBody(state, conn); } } }; } else { method = new HttpRecorderGetMethod(curiString, rec) { protected void readResponseBody(HttpState state, HttpConnection conn) throws IOException, HttpException { addResponseContent(this, curi); if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) { doAbort(curi, this, MIDFETCH_ABORT_LOG); } else { super.readResponseBody(state, conn); } } }; } HostConfiguration customConfigOrNull = configureMethod(curi, method); // Set httpRecorder into curi. Subsequent code both here and later // in extractors expects to find the HttpRecorder in the CrawlURI. curi.setHttpRecorder(rec); // Populate credentials. Set config so auth. is not automatic. boolean addedCredentials = populateCredentials(curi, method); method.setDoAuthentication(addedCredentials); // set hardMax on bytes (if set by operator) long hardMax = getMaxLength(curi); // set overall timeout (if set by operator) long timeoutMs = 1000 * getTimeout(curi); // Get max fetch rate (bytes/ms). It comes in in KB/sec long maxRateKBps = getMaxFetchRate(curi); rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps); try { this.http.executeMethod(customConfigOrNull, method); } catch (RecorderTooMuchHeaderException ex) { // when too much header material, abort like other truncations doAbort(curi, method, HEADER_TRUNC); } catch (IOException e) { failedExecuteCleanup(method, curi, e); return; } catch (ArrayIndexOutOfBoundsException e) { // For weird windows-only ArrayIndex exceptions in native // code... see // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356 // treating as if it were an IOException failedExecuteCleanup(method, curi, e); return; } // set softMax on bytes to get (if implied by content-length) long softMax = method.getResponseContentLength(); try { if (!method.isAborted()) { // Force read-to-end, so that any socket hangs occur here, // not in later modules. rec.getRecordedInput().readFullyOrUntil(softMax); } } catch (RecorderTimeoutException ex) { doAbort(curi, method, TIMER_TRUNC); } catch (RecorderLengthExceededException ex) { doAbort(curi, method, LENGTH_TRUNC); } catch (IOException e) { cleanup(curi, e, "readFully", S_CONNECT_LOST); return; } catch (ArrayIndexOutOfBoundsException e) { // For weird windows-only ArrayIndex exceptions from native code // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356 // treating as if it were an IOException cleanup(curi, e, "readFully", S_CONNECT_LOST); return; } finally { // ensure recording has stopped rec.closeRecorders(); if (!method.isAborted()) { method.releaseConnection(); } // Note completion time curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis()); // Set the response charset into the HttpRecord if available. setCharacterEncoding(rec, method); setSizes(curi, rec); } if (digestContent) { curi.setContentDigest(algorithm, rec.getRecordedInput().getDigestValue()); } if (logger.isLoggable(Level.INFO)) { logger.info((curi.isPost()? "POST": "GET") + " " + curi.getUURI().toString() + " " + method.getStatusCode() + " " + rec.getRecordedInput().getSize() + " " + curi.getContentType()); } if (curi.isSuccess() && addedCredentials) { // Promote the credentials from the CrawlURI to the CrawlServer // so they are available for all subsequent CrawlURIs on this // server. promoteCredentials(curi); if (logger.isLoggable(Level.FINE)) { // Print out the cookie. Might help with the debugging. Header setCookie = method.getResponseHeader("set-cookie"); if (setCookie != null) { logger.fine(setCookie.toString().trim()); } } } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) { // 401 is not 'success'. handle401(method, curi); } if (rec.getRecordedInput().isOpen()) { logger.severe(curi.toString() + " RIS still open. Should have" + " been closed by method release: " + Thread.currentThread().getName()); try { rec.getRecordedInput().close(); } catch (IOException e) { logger.log(Level.SEVERE,"second-chance RIS close failed",e); } } } /** * Update CrawlURI internal sizes based on current transaction (and * in the case of 304s, history) * * @param curi CrawlURI * @param rec HttpRecorder */ protected void setSizes(final CrawlURI curi, HttpRecorder rec) { // set reporting size curi.setContentSize(rec.getRecordedInput().getSize()); // special handling for 304-not modified if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && curi.containsKey(A_FETCH_HISTORY)) { AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY); if (history[0] != null && history[0] .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) { long referenceLength = history[0].getLong(A_REFERENCE_LENGTH); // carry-forward previous 'reference-length' for future curi.putLong(A_REFERENCE_LENGTH, referenceLength); // increase content-size to virtual-size for reporting curi.setContentSize(rec.getRecordedInput().getSize() + referenceLength); } } } protected void doAbort(CrawlURI curi, HttpMethod method, String annotation) { curi.addAnnotation(annotation); curi.getHttpRecorder().close(); method.abort(); } protected boolean checkMidfetchAbort(CrawlURI curi, HttpRecorderMethod method, HttpConnection conn) { if (curi.isPrerequisite() || rulesAccept(getMidfetchRule(curi), curi)) { return false; } method.markContentBegin(conn); return true; } protected DecideRule getMidfetchRule(Object o) { try { return (DecideRule)getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES); } catch (AttributeNotFoundException e) { throw new RuntimeException(e);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -