📄 fetchhttp.java
字号:
// this passed curi.  It might be of another subdomain.
                // The avatar needs to be added to the server that is dependent
                // on this precondition.  Find it by name.  Get the name from
                // the credential this avatar represents.
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                String cd = null;
                try {
                    cd = c.getCredentialDomain(curi);
                } catch (AttributeNotFoundException e) {
                    logger.severe("Failed to get cred domain for " + curi +
                        " for " + ca + ": " + e.getMessage());
                }
                if (cd != null) {
                    CrawlServer cs =
                        getController().getServerCache().getServerFor(cd);
                    if (cs != null) {
                        cs.addCredentialAvatar(ca);
                    }
                }
            }
        }
    }

    /**
     * Server is looking for basic/digest auth credentials (RFC2617). If we have
     * any, put them into the CrawlURI and have it come around again. Presence
     * of the credential serves as flag to frontier to requeue promptly. If we
     * already tried this domain and still got a 401, then our credentials are
     * bad. Remove them and let this curi die.
     *
     * <p>Nothing is attached to the curi when no usable auth scheme can be
     * read from the response, when there is no credential store, when no
     * crawl server can be found for the curi, or when the store holds no
     * rfc2617 credential for the challenged realm.
     *
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     */
    protected void handle401(final HttpMethod method, final CrawlURI curi) {
        AuthScheme authscheme = getAuthScheme(method, curi);
        if (authscheme == null) {
            return;
        }
        String realm = authscheme.getRealm();

        // Look to see if this curi had rfc2617 avatars loaded.  If so, are
        // any of them for this realm?  If so, then the credential failed
        // if we got a 401 and it should be let die a natural 401 death.
        Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),
            curi, Rfc2617Credential.class);
        Rfc2617Credential extant = Rfc2617Credential.getByRealm(
            curiRfc2617Credentials, realm, curi);
        if (extant != null) {
            // Then, already tried this credential.  Remove ANY rfc2617
            // credential since presence of a rfc2617 credential serves
            // as flag to frontier to requeue this curi and let the curi
            // die a natural death.
            extant.detachAll(curi);
            logger.warning("Auth failed (401) though supplied realm " +
                realm + " to " + curi.toString());
            return;
        }

        // Look see if we have a credential that corresponds to this
        // realm in credential store.  Filter by type and credential
        // domain.  If not, let this curi die. Else, add it to the
        // curi and let it come around again. Add in the AuthScheme
        // we got too.  Its needed when we go to run the Auth on
        // second time around.
        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return;
        }

        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (server == null) {
            // Other lookups against the server cache in this file check the
            // result for null before use; without a server there is no name
            // to filter the store by, so let the curi die.
            logger.severe("No crawl server for " + curi);
            return;
        }

        Set storeRfc2617Credentials = cs.subset(curi,
            Rfc2617Credential.class, server.getName());
        if (storeRfc2617Credentials == null
                || storeRfc2617Credentials.size() <= 0) {
            logger.info("No rfc2617 credentials for " + curi);
            return;
        }

        Rfc2617Credential found = Rfc2617Credential.getByRealm(
            storeRfc2617Credentials, realm, curi);
        if (found == null) {
            logger.info("No rfc2617 credentials for realm " +
                realm + " in " + curi);
            return;
        }

        found.attach(curi, realm);
        logger.info("Found credential for realm " + realm +
            " in store for " + curi.toString());
    }

    /**
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     * @return Returns first wholesome authscheme found else null.
*/ protected AuthScheme getAuthScheme(final HttpMethod method, final CrawlURI curi) { Header [] headers = method.getResponseHeaders("WWW-Authenticate"); if (headers == null || headers.length <= 0) { logger.info("We got a 401 but no WWW-Authenticate challenge: " + curi.toString()); return null; } Map authschemes = null; try { authschemes = AuthChallengeParser.parseChallenges(headers); } catch(MalformedChallengeException e) { logger.info("Failed challenge parse: " + e.getMessage()); } if (authschemes == null || authschemes.size() <= 0) { logger.info("We got a 401 and WWW-Authenticate challenge" + " but failed parse of the header " + curi.toString()); return null; } AuthScheme result = null; // Use the first auth found. for (Iterator i = authschemes.keySet().iterator(); result == null && i.hasNext();) { String key = (String)i.next(); String challenge = (String)authschemes.get(key); if (key == null || key.length() <= 0 || challenge == null || challenge.length() <= 0) { logger.warning("Empty scheme: " + curi.toString() + ": " + headers); } AuthScheme authscheme = null; if (key.equals("basic")) { authscheme = new BasicScheme(); } else if (key.equals("digest")) { authscheme = new DigestScheme(); } else { logger.info("Unsupported scheme: " + key); continue; } try { authscheme.processChallenge(challenge); } catch (MalformedChallengeException e) { logger.info(e.getMessage() + " " + curi + " " + headers); continue; } if (authscheme.isConnectionBased()) { logger.info("Connection based " + authscheme); continue; } if (authscheme.getRealm() == null || authscheme.getRealm().length() <= 0) { logger.info("Empty realm " + authscheme + " for " + curi); continue; } result = authscheme; } return result; } /** * @param handler Settings Handler. * @param curi CrawlURI that got a 401. * @param type Class of credential to get from curi. * @return Set of credentials attached to this curi. 
*/ private Set<Credential> getCredentials(SettingsHandler handler, CrawlURI curi, Class type) { Set<Credential> result = null; if (curi.hasCredentialAvatars()) { for (Iterator i = curi.getCredentialAvatars().iterator(); i.hasNext();) { CredentialAvatar ca = (CredentialAvatar)i.next(); if (ca.match(type)) { if (result == null) { result = new HashSet<Credential>(); } result.add(ca.getCredential(handler, curi)); } } } return result; } public void initialTasks() { super.initialTasks(); this.getController().addCrawlStatusListener(this); configureHttp(); // load cookies from a file if specified in the order file. loadCookies(); // I tried to get the default KeyManagers but doesn't work unless you // point at a physical keystore. Passing null seems to do the right // thing so we'll go w/ that. try { SSLContext context = SSLContext.getInstance("SSL"); context.init(null, new TrustManager[] { new ConfigurableX509TrustManager((String) getAttribute(ATTR_TRUST))}, null); this.sslfactory = context.getSocketFactory(); } catch (Exception e) { logger.log(Level.WARNING, "Failed configure of ssl context " + e.getMessage(), e); } } public void finalTasks() { // At the end save cookies to the file specified in the order file. saveCookies(); cleanupHttp(); super.finalTasks(); } /** * Perform any final cleanup related to the HttpClient instance. */ protected void cleanupHttp() { if(cookieDb!=null) { try { cookieDb.sync(); cookieDb.close(); } catch (DatabaseException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } protected void configureHttp() throws RuntimeException { // Get timeout. Use it for socket and for connection timeout. int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0; // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager(); HttpConnectionManager cm = new SingleHttpConnectionManager(); // TODO: The following settings should be made in the corresponding // HttpConnectionManager, not here. 
HttpConnectionManagerParams hcmp = cm.getParams(); hcmp.setConnectionTimeout(timeout); hcmp.setStaleCheckingEnabled(true); // Minimizes bandwidth usage. Setting to true disables Nagle's // algorithm. IBM JVMs < 142 give an NPE setting this boolean // on ssl sockets. hcmp.setTcpNoDelay(false); this.http = new HttpClient(cm); HttpClientParams hcp = this.http.getParams(); // Set default socket timeout. hcp.setSoTimeout(timeout); // Set client to be version 1.0. hcp.setVersion(HttpVersion.HTTP_1_0); String addressStr = null; try { addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS); } catch (Exception e1) { // If exception, just use default. } if (addressStr != null && addressStr.length() > 0) { try { InetAddress localAddress = InetAddress.getByName(addressStr); this.http.getHostConfiguration().setLocalAddress(localAddress); } catch (UnknownHostException e) { // Convert all to RuntimeException so get an exception out // if initialization fails. throw new RuntimeException("Unknown host " + addressStr + " in " + ATTR_LOCAL_ADDRESS); } } configureHttpCookies(); // Configure how we want the method to act. this.http.getParams().setParameter( HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true)); this.http.getParams().setParameter( HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false)); this.http.getParams().setParameter( HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false)); this.http.getParams().setIntParameter( HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10); HostConfiguration configOrNull = configureProxy(null); if(configOrNull!=null) { // global proxy settings are in effect this.http.setHostConfiguration(configOrNull); } // Use our own protocol factory, one that gets IP to use from // heritrix cache (They're cached in CrawlHost instances). 
final ServerCache cache = getController().getServerCache(); hcmp.setParameter(SERVER_CACHE_KEY, cache); hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory); } /** * Set the HttpClient HttpState instance to use a BDB-backed * StoredSortedMap for cookie storage, if that option is chosen. */ private void configureHttpCookies() { // If Bdb-backed cookies chosen, replace map in HttpState if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)). booleanValue()) { try { EnhancedEnvironment env = getController().getBdbEnvironment(); StoredClassCatalog classCatalog = env.getClassCatalog(); DatabaseConfig dbConfig = new DatabaseConfig(); dbConfig.setTransactional(false); dbConfig.setAllowCreate(true); dbConfig.setDeferredWrite(true); cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig); StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb, new StringBinding(), new SerialBinding(classCatalog, Cookie.class), true); this.http.getState().setCookiesMap(cookiesMap); } catch (DatabaseException e) { // TODO Auto-generated catch block
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -