⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
                // this passed curi.  It might be of another subdomain.                // The avatar needs to be added to the server that is dependent                // on this precondition.  Find it by name.  Get the name from                // the credential this avatar represents.                Credential c = ca.getCredential(getSettingsHandler(), curi);                String cd = null;                try {                    cd = c.getCredentialDomain(curi);                }                catch (AttributeNotFoundException e) {                    logger.severe("Failed to get cred domain for " + curi +                        " for " + ca + ": " + e.getMessage());                }                if (cd != null) {                    CrawlServer cs                        = getController().getServerCache().getServerFor(cd);                    if (cs != null) {                        cs.addCredentialAvatar(ca);                    }                }            }        }    }    /**     * Server is looking for basic/digest auth credentials (RFC2617). If we have     * any, put them into the CrawlURI and have it come around again. Presence     * of the credential serves as flag to frontier to requeue promptly. If we     * already tried this domain and still got a 401, then our credentials are     * bad. Remove them and let this curi die.     *     * @param method Method that got a 401.     * @param curi CrawlURI that got a 401.     */    protected void handle401(final HttpMethod method, final CrawlURI curi) {        AuthScheme authscheme = getAuthScheme(method, curi);        if (authscheme == null) {        	return;        }        String realm = authscheme.getRealm();                // Look to see if this curi had rfc2617 avatars loaded.  If so, are        // any of them for this realm?  If so, then the credential failed        // if we got a 401 and it should be let die a natural 401 death.        
Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),        		curi, Rfc2617Credential.class);        Rfc2617Credential extant = Rfc2617Credential.		    getByRealm(curiRfc2617Credentials, realm, curi);        if (extant != null) {        	// Then, already tried this credential.  Remove ANY rfc2617        	// credential since presence of a rfc2617 credential serves        	// as flag to frontier to requeue this curi and let the curi        	// die a natural death.        	extant.detachAll(curi);        	logger.warning("Auth failed (401) though supplied realm " +        			realm + " to " + curi.toString());        } else {        	// Look see if we have a credential that corresponds to this        	// realm in credential store.  Filter by type and credential        	// domain.  If not, let this curi die. Else, add it to the        	// curi and let it come around again. Add in the AuthScheme        	// we got too.  Its needed when we go to run the Auth on        	// second time around.        	CredentialStore cs =        		CredentialStore.getCredentialStore(getSettingsHandler());        	if (cs == null) {        		logger.severe("No credential store for " + curi);        	} else {                CrawlServer server = getController().getServerCache().                    getServerFor(curi);        		Set storeRfc2617Credentials = cs.subset(curi,        		    Rfc2617Credential.class, server.getName());        		if (storeRfc2617Credentials == null ||        				storeRfc2617Credentials.size() <= 0) {        			logger.info("No rfc2617 credentials for " + curi);        		} else {        			Rfc2617Credential found = Rfc2617Credential.					    
getByRealm(storeRfc2617Credentials, realm, curi);        			if (found == null) {        				logger.info("No rfc2617 credentials for realm " +        						realm + " in " + curi);        			} else {        				found.attach(curi, authscheme.getRealm());        				logger.info("Found credential for realm " + realm +        				    " in store for " + curi.toString());        			}        		}        	}        }    }        /**     * @param method Method that got a 401.     * @param curi CrawlURI that got a 401.     * @return Returns first wholesome authscheme found else null.     */    protected AuthScheme getAuthScheme(final HttpMethod method,            final CrawlURI curi) {        Header [] headers = method.getResponseHeaders("WWW-Authenticate");        if (headers == null || headers.length <= 0) {            logger.info("We got a 401 but no WWW-Authenticate challenge: " +                curi.toString());            return null;        }        Map authschemes = null;        try {            authschemes = AuthChallengeParser.parseChallenges(headers);        } catch(MalformedChallengeException e) {            logger.info("Failed challenge parse: " + e.getMessage());        }        if (authschemes == null || authschemes.size() <= 0) {            logger.info("We got a 401 and WWW-Authenticate challenge" +                " but failed parse of the header " + curi.toString());            return null;        }                             AuthScheme result = null;        // Use the first auth found.        
for (Iterator i = authschemes.keySet().iterator();                result == null && i.hasNext();) {        	String key = (String)i.next();            String challenge = (String)authschemes.get(key);            if (key == null || key.length() <= 0 || challenge == null ||                  challenge.length() <= 0) {            	logger.warning("Empty scheme: " + curi.toString() +                  ": " + headers);            }        	AuthScheme authscheme = null;        	if (key.equals("basic")) {        		authscheme = new BasicScheme();        	} else if (key.equals("digest")) {        		authscheme = new DigestScheme();        	} else {        		logger.info("Unsupported scheme: " + key);        		continue;        	}                        try {				authscheme.processChallenge(challenge);			} catch (MalformedChallengeException e) {				logger.info(e.getMessage() + " " + curi + " " + headers);                continue;			}        	if (authscheme.isConnectionBased()) {        		logger.info("Connection based " + authscheme);        		continue;        	}        	        	if (authscheme.getRealm() == null ||        			authscheme.getRealm().length() <= 0) {        		logger.info("Empty realm " + authscheme + " for " + curi);        		continue;        	}        	result = authscheme;        }                return result;    }            /**     * @param handler Settings Handler.     * @param curi CrawlURI that got a 401.     * @param type Class of credential to get from curi.     * @return Set of credentials attached to this curi.     
*/    private Set<Credential> getCredentials(SettingsHandler handler,             CrawlURI curi, Class type) {        Set<Credential> result = null;        if (curi.hasCredentialAvatars()) {            for (Iterator i = curi.getCredentialAvatars().iterator();                    i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                if (ca.match(type)) {                    if (result == null) {                        result = new HashSet<Credential>();                    }                    result.add(ca.getCredential(handler, curi));                }            }        }        return result;    }    public void initialTasks() {        super.initialTasks();        this.getController().addCrawlStatusListener(this);        configureHttp();        // load cookies from a file if specified in the order file.        loadCookies();        // I tried to get the default KeyManagers but doesn't work unless you        // point at a physical keystore. Passing null seems to do the right        // thing so we'll go w/ that.        try {        	SSLContext context = SSLContext.getInstance("SSL");			context.init(null, new TrustManager[] {			    new ConfigurableX509TrustManager((String)			        getAttribute(ATTR_TRUST))}, null);	        this.sslfactory = context.getSocketFactory();		} catch (Exception e) {			logger.log(Level.WARNING, "Failed configure of ssl context "			    + e.getMessage(), e);		}    }        public void finalTasks() {        // At the end save cookies to the file specified in the order file.        saveCookies();        cleanupHttp();        super.finalTasks();    }    /**     * Perform any final cleanup related to the HttpClient instance.     
*/    protected void cleanupHttp() {        if(cookieDb!=null) {            try {                cookieDb.sync();                cookieDb.close();            } catch (DatabaseException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }        }    }    protected void configureHttp() throws RuntimeException {        // Get timeout.  Use it for socket and for connection timeout.        int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0;                // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();        HttpConnectionManager cm = new SingleHttpConnectionManager();                // TODO: The following settings should be made in the corresponding        // HttpConnectionManager, not here.        HttpConnectionManagerParams hcmp = cm.getParams();        hcmp.setConnectionTimeout(timeout);        hcmp.setStaleCheckingEnabled(true);        // Minimizes bandwidth usage.  Setting to true disables Nagle's        // algorithm.  IBM JVMs < 142 give an NPE setting this boolean        // on ssl sockets.        hcmp.setTcpNoDelay(false);                this.http = new HttpClient(cm);        HttpClientParams hcp = this.http.getParams();        // Set default socket timeout.        hcp.setSoTimeout(timeout);        // Set client to be version 1.0.        hcp.setVersion(HttpVersion.HTTP_1_0);		String addressStr = null;		try {			addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS);		} catch (Exception e1) {			// If exception, just use default.		}		if (addressStr != null && addressStr.length() > 0) {			try {				InetAddress localAddress = InetAddress.getByName(addressStr);				this.http.getHostConfiguration().setLocalAddress(localAddress);			} catch (UnknownHostException e) {				// Convert all to RuntimeException so get an exception out				// if initialization fails.				
throw new RuntimeException("Unknown host " + addressStr				    + " in " + ATTR_LOCAL_ADDRESS);			}		}		configureHttpCookies();                // Configure how we want the method to act.        this.http.getParams().setParameter(            HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));        this.http.getParams().setParameter(            HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false));        this.http.getParams().setParameter(            HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));        this.http.getParams().setIntParameter(            HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);                HostConfiguration configOrNull = configureProxy(null);        if(configOrNull!=null) {            // global proxy settings are in effect            this.http.setHostConfiguration(configOrNull);        }                // Use our own protocol factory, one that gets IP to use from        // heritrix cache (They're cached in CrawlHost instances).        final ServerCache cache = getController().getServerCache();        hcmp.setParameter(SERVER_CACHE_KEY, cache);        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);	}    /**     * Set the HttpClient HttpState instance to use a BDB-backed     * StoredSortedMap for cookie storage, if that option is chosen.     */    private void configureHttpCookies() {        // If Bdb-backed cookies chosen, replace map in HttpState        if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)).                
booleanValue()) {            try {                EnhancedEnvironment env = getController().getBdbEnvironment();                StoredClassCatalog classCatalog = env.getClassCatalog();                DatabaseConfig dbConfig = new DatabaseConfig();                dbConfig.setTransactional(false);                dbConfig.setAllowCreate(true);                dbConfig.setDeferredWrite(true);                cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);                StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,                        new StringBinding(), new SerialBinding(classCatalog,                                Cookie.class), true);                this.http.getState().setCookiesMap(cookiesMap);            } catch (DatabaseException e) {                // TODO Auto-generated catch block

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -