📄 fetchhttp.java
字号:
} } } } } /** * @param method Method that got a 401. * @param curi CrawlURI that got a 401. * @return Returns first wholesome authscheme found else null. */ protected AuthScheme getAuthScheme(final HttpMethod method, final CrawlURI curi) { Header [] headers = method.getResponseHeaders("WWW-Authenticate"); if (headers == null || headers.length <= 0) { logger.info("We got a 401 but no WWW-Authenticate challenge: " + curi.toString()); return null; } Map authschemes = null; try { authschemes = AuthChallengeParser.parseChallenges(headers); } catch(MalformedChallengeException e) { logger.info("Failed challenge parse: " + e.getMessage()); } if (authschemes == null || authschemes.size() <= 0) { logger.info("We got a 401 and WWW-Authenticate challenge" + " but failed parse of the header " + curi.toString()); return null; } AuthScheme result = null; // Use the first auth found. for (Iterator i = authschemes.keySet().iterator(); result == null && i.hasNext();) { String key = (String)i.next(); String challenge = (String)authschemes.get(key); if (key == null || key.length() <= 0 || challenge == null || challenge.length() <= 0) { logger.warning("Empty scheme: " + curi.toString() + ": " + headers); } AuthScheme authscheme = null; if (key.equals("basic")) { authscheme = new BasicScheme(); } else if (key.equals("digest")) { authscheme = new DigestScheme(); } else { logger.info("Unsupported scheme: " + key); continue; } try { authscheme.processChallenge(challenge); } catch (MalformedChallengeException e) { logger.info(e.getMessage() + " " + curi + " " + headers); continue; } if (authscheme.isConnectionBased()) { logger.info("Connection based " + authscheme); continue; } if (authscheme.getRealm() == null || authscheme.getRealm().length() <= 0) { logger.info("Empty realm " + authscheme + " for " + curi); continue; } result = authscheme; } return result; } /** * @param handler Settings Handler. * @param curi CrawlURI that got a 401. * @param type Class of credential to get from curi. 
* @return Set of credentials attached to this curi. */ private Set getCredentials(SettingsHandler handler, CrawlURI curi, Class type) { Set result = null; if (curi.hasCredentialAvatars()) { for (Iterator i = curi.getCredentialAvatars().iterator(); i.hasNext();) { CredentialAvatar ca = (CredentialAvatar)i.next(); if (ca.match(type)) { if (result == null) { result = new HashSet(); } result.add(ca.getCredential(handler, curi)); } } } return result; } public void initialTasks() { super.initialTasks(); this.getController().addCrawlStatusListener(this); configureHttp(); // load cookies from a file if specified in the order file. loadCookies(); // I tried to get the default KeyManagers but doesn't work unless you // point at a physical keystore. Passing null seems to do the right // thing so we'll go w/ that. try { SSLContext context = SSLContext.getInstance("SSL"); context.init(null, new TrustManager[] { new ConfigurableX509TrustManager((String) getAttribute(ATTR_TRUST))}, null); this.sslfactory = context.getSocketFactory(); } catch (Exception e) { logger.log(Level.WARNING, "Failed configure of ssl context " + e.getMessage(), e); } } public void finalTasks() { // At the end save cookies to the file specified in the order file. saveCookies(); cleanupHttp(); super.finalTasks(); } /** * Perform any final cleanup related to the HttpClient instance. */ protected void cleanupHttp() { if(cookieDb!=null) { try { cookieDb.close(); } catch (DatabaseException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } protected void configureHttp() throws RuntimeException { // Get timeout. Use it for socket and for connection timeout. int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0; // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager(); HttpConnectionManager cm = new SingleHttpConnectionManager(); // TODO: The following settings should be made in the corresponding // HttpConnectionManager, not here. 
HttpConnectionManagerParams hcmp = cm.getParams(); hcmp.setConnectionTimeout(timeout); hcmp.setStaleCheckingEnabled(true); // Minimizes bandwidth usage. Setting to true disables Nagle's // algorithm. IBM JVMs < 142 give an NPE setting this boolean // on ssl sockets. hcmp.setTcpNoDelay(false); this.http = new HttpClient(cm); HttpClientParams hcp = this.http.getParams(); // Set default socket timeout. hcp.setSoTimeout(timeout); // Set client to be version 1.0. hcp.setVersion(HttpVersion.HTTP_1_0); String addressStr = null; try { addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS); } catch (Exception e1) { // If exception, just use default. } if (addressStr != null && addressStr.length() > 0) { try { InetAddress localAddress = InetAddress.getByName(addressStr); this.http.getHostConfiguration().setLocalAddress(localAddress); } catch (UnknownHostException e) { // Convert all to RuntimeException so get an exception out // if initialization fails. throw new RuntimeException("Unknown host " + addressStr + " in " + ATTR_LOCAL_ADDRESS); } } configureHttpCookies(); // Configure how we want the method to act. this.http.getParams().setParameter( HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true)); this.http.getParams().setParameter( HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false)); this.http.getParams().setParameter( HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false)); this.http.getParams().setIntParameter( HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10); HostConfiguration configOrNull = configureProxy(null); if(configOrNull!=null) { // global proxy settings are in effect this.http.setHostConfiguration(configOrNull); } // Use our own protocol factory, one that gets IP to use from // heritrix cache (They're cached in CrawlHost instances). 
final ServerCache cache = getController().getServerCache(); hcmp.setParameter(SERVER_CACHE_KEY, cache); hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory); } /** * Set the HttpClient HttpState instance to use a BDB-backed * StoredSortedMap for cookie storage, if that option is chosen. */ private void configureHttpCookies() { // If Bdb-backed cookies chosen, replace map in HttpState if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)). booleanValue()) { try { Environment env = getController().getBdbEnvironment(); StoredClassCatalog classCatalog = getController().getClassCatalog(); DatabaseConfig dbConfig = new DatabaseConfig(); dbConfig.setTransactional(false); dbConfig.setAllowCreate(true); cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig); StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb, new StringBinding(), new SerialBinding(classCatalog, Cookie.class), true); this.http.getState().setCookiesMap(cookiesMap); } catch (DatabaseException e) { // TODO Auto-generated catch block logger.severe(e.getMessage()); e.printStackTrace(); } } } /** * @param curi Current CrawlURI. Used to get context. * @return Socket timeout value. */ private int getSoTimeout(CrawlURI curi) { Integer res = null; try { res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi); } catch (Exception e) { res = DEFAULT_SOTIMEOUT_MS; } return res.intValue(); } /** * @param curi Current CrawlURI. Used to get context. * @return Timeout value for total request. 
*/
    private int getTimeout(CrawlURI curi) {
        // Start from the default; a successful lookup overrides it.
        Integer seconds = DEFAULT_TIMEOUT_SECONDS;
        try {
            seconds = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
        } catch (Exception e) {
            // Lookup failed: the default stays in place.
        }
        return seconds.intValue();
    }

    /**
     * Looks up ATTR_FETCH_BANDWIDTH_MAX, using DEFAULT_FETCH_BANDWIDTH_MAX
     * when the lookup throws.
     *
     * @param curi Current CrawlURI.  Used to get context.
     * @return Configured maximum fetch rate.
     */
    private int getMaxFetchRate(CrawlURI curi) {
        Integer rate = DEFAULT_FETCH_BANDWIDTH_MAX;
        try {
            rate = (Integer)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
        } catch (Exception e) {
            // Lookup failed: the default stays in place.
        }
        return rate.intValue();
    }

    /**
     * Looks up ATTR_MAX_LENGTH_BYTES.  DEFAULT_MAX_LENGTH_BYTES is returned
     * instead when the lookup (or reading the looked-up value) throws, and
     * also when the configured value equals OLD_DEFAULT_MAX_LENGTH_BYTES.
     *
     * @param curi Current CrawlURI.  Used to get context.
     * @return Maximum length value to apply.
     */
    private long getMaxLength(CrawlURI curi) {
        Long limit = DEFAULT_MAX_LENGTH_BYTES;
        try {
            Long configured = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
            // Only a value other than the old default replaces the default.
            if (configured.longValue() != OLD_DEFAULT_MAX_LENGTH_BYTES) {
                limit = configured;
            }
        } catch (Exception e) {
            // Lookup failed: the default stays in place.
        }
        return limit.longValue();
    }

    /**
     * Load cookies from a file before the first fetch.
     * <p>
     * The file is a text file in Netscape's 'cookies.txt' file format.<br>
     * Example entry of cookies.txt file:<br>
     * <br>
     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
     * <br>
     * Each line has 7 tab-separated fields:<br>
     * <li>1. DOMAIN: The domain that created and has access to the cookie
     * value.
     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
     * domain can access the cookie value.
     * <li>3. PATH: The path within the domain that the cookie value is valid
     * for.
     * <li>4. SECURE: A TRUE or FALSE value indicating whether to use a secure
     * connection to access the cookie value.
     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
     * <li>6. NAME: The name of the cookie value
     * <li>7. VALUE: The cookie value
     *
     * @param cookiesFile file in the Netscape's 'cookies.txt' format.
     */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -