⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
        			}        		}        	}        }    }        /**     * @param method Method that got a 401.     * @param curi CrawlURI that got a 401.     * @return Returns first wholesome authscheme found else null.     */    protected AuthScheme getAuthScheme(final HttpMethod method,            final CrawlURI curi) {        Header [] headers = method.getResponseHeaders("WWW-Authenticate");        if (headers == null || headers.length <= 0) {            logger.info("We got a 401 but no WWW-Authenticate challenge: " +                curi.toString());            return null;        }        Map authschemes = null;        try {            authschemes = AuthChallengeParser.parseChallenges(headers);        } catch(MalformedChallengeException e) {            logger.info("Failed challenge parse: " + e.getMessage());        }        if (authschemes == null || authschemes.size() <= 0) {            logger.info("We got a 401 and WWW-Authenticate challenge" +                " but failed parse of the header " + curi.toString());            return null;        }                             AuthScheme result = null;        // Use the first auth found.        
for (Iterator i = authschemes.keySet().iterator();                result == null && i.hasNext();) {        	String key = (String)i.next();            String challenge = (String)authschemes.get(key);            if (key == null || key.length() <= 0 || challenge == null ||                  challenge.length() <= 0) {            	logger.warning("Empty scheme: " + curi.toString() +                  ": " + headers);            }        	AuthScheme authscheme = null;        	if (key.equals("basic")) {        		authscheme = new BasicScheme();        	} else if (key.equals("digest")) {        		authscheme = new DigestScheme();        	} else {        		logger.info("Unsupported scheme: " + key);        		continue;        	}                        try {				authscheme.processChallenge(challenge);			} catch (MalformedChallengeException e) {				logger.info(e.getMessage() + " " + curi + " " + headers);                continue;			}        	if (authscheme.isConnectionBased()) {        		logger.info("Connection based " + authscheme);        		continue;        	}        	        	if (authscheme.getRealm() == null ||        			authscheme.getRealm().length() <= 0) {        		logger.info("Empty realm " + authscheme + " for " + curi);        		continue;        	}        	result = authscheme;        }                return result;    }            /**     * @param handler Settings Handler.     * @param curi CrawlURI that got a 401.     * @param type Class of credential to get from curi.     * @return Set of credentials attached to this curi.     
*/    private Set getCredentials(SettingsHandler handler, CrawlURI curi,            Class type) {        Set result = null;        if (curi.hasCredentialAvatars()) {            for (Iterator i = curi.getCredentialAvatars().iterator();                    i.hasNext();) {                CredentialAvatar ca = (CredentialAvatar)i.next();                if (ca.match(type)) {                    if (result == null) {                        result = new HashSet();                    }                    result.add(ca.getCredential(handler, curi));                }            }        }        return result;    }    public void initialTasks() {        super.initialTasks();        this.getController().addCrawlStatusListener(this);        configureHttp();        // load cookies from a file if specified in the order file.        loadCookies();        // I tried to get the default KeyManagers but doesn't work unless you        // point at a physical keystore. Passing null seems to do the right        // thing so we'll go w/ that.        try {        	SSLContext context = SSLContext.getInstance("SSL");			context.init(null, new TrustManager[] {			    new ConfigurableX509TrustManager((String)			        getAttribute(ATTR_TRUST))}, null);	        this.sslfactory = context.getSocketFactory();		} catch (Exception e) {			logger.log(Level.WARNING, "Failed configure of ssl context "			    + e.getMessage(), e);		}    }        public void finalTasks() {        // At the end save cookies to the file specified in the order file.        saveCookies();        cleanupHttp();        super.finalTasks();    }    /**     * Perform any final cleanup related to the HttpClient instance.     
*/    protected void cleanupHttp() {        if(cookieDb!=null) {            try {                cookieDb.close();            } catch (DatabaseException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }        }    }    protected void configureHttp() throws RuntimeException {        // Get timeout.  Use it for socket and for connection timeout.        int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0;                // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();        HttpConnectionManager cm = new SingleHttpConnectionManager();                // TODO: The following settings should be made in the corresponding        // HttpConnectionManager, not here.        HttpConnectionManagerParams hcmp = cm.getParams();        hcmp.setConnectionTimeout(timeout);        hcmp.setStaleCheckingEnabled(true);        // Minimizes bandwidth usage.  Setting to true disables Nagle's        // algorithm.  IBM JVMs < 142 give an NPE setting this boolean        // on ssl sockets.        hcmp.setTcpNoDelay(false);                this.http = new HttpClient(cm);        HttpClientParams hcp = this.http.getParams();        // Set default socket timeout.        hcp.setSoTimeout(timeout);        // Set client to be version 1.0.        hcp.setVersion(HttpVersion.HTTP_1_0);		String addressStr = null;		try {			addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS);		} catch (Exception e1) {			// If exception, just use default.		}		if (addressStr != null && addressStr.length() > 0) {			try {				InetAddress localAddress = InetAddress.getByName(addressStr);				this.http.getHostConfiguration().setLocalAddress(localAddress);			} catch (UnknownHostException e) {				// Convert all to RuntimeException so get an exception out				// if initialization fails.				
throw new RuntimeException("Unknown host " + addressStr				    + " in " + ATTR_LOCAL_ADDRESS);			}		}		configureHttpCookies();                // Configure how we want the method to act.        this.http.getParams().setParameter(            HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));        this.http.getParams().setParameter(            HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false));        this.http.getParams().setParameter(            HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));        this.http.getParams().setIntParameter(            HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);                HostConfiguration configOrNull = configureProxy(null);        if(configOrNull!=null) {            // global proxy settings are in effect            this.http.setHostConfiguration(configOrNull);        }                // Use our own protocol factory, one that gets IP to use from        // heritrix cache (They're cached in CrawlHost instances).        final ServerCache cache = getController().getServerCache();        hcmp.setParameter(SERVER_CACHE_KEY, cache);        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);	}    /**     * Set the HttpClient HttpState instance to use a BDB-backed     * StoredSortedMap for cookie storage, if that option is chosen.     */    private void configureHttpCookies() {        // If Bdb-backed cookies chosen, replace map in HttpState        if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)).                
booleanValue()) {            try {                Environment env = getController().getBdbEnvironment();                StoredClassCatalog classCatalog = getController().getClassCatalog();                DatabaseConfig dbConfig = new DatabaseConfig();                dbConfig.setTransactional(false);                dbConfig.setAllowCreate(true);                cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);                StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,                        new StringBinding(), new SerialBinding(classCatalog,                                Cookie.class), true);                this.http.getState().setCookiesMap(cookiesMap);            } catch (DatabaseException e) {                // TODO Auto-generated catch block                logger.severe(e.getMessage());                e.printStackTrace();            }        }    }    /**     * @param curi Current CrawlURI.  Used to get context.     * @return Socket timeout value.     */    private int getSoTimeout(CrawlURI curi) {        Integer res = null;        try {            res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);        } catch (Exception e) {            res = DEFAULT_SOTIMEOUT_MS;        }        return res.intValue();    }    /**     * @param curi Current CrawlURI.  Used to get context.     * @return Timeout value for total request.     
*/
    private int getTimeout(CrawlURI curi) {
        Integer res;
        try {
            res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
        } catch (Exception e) {
            res = DEFAULT_TIMEOUT_SECONDS;
        }
        if (res == null) {
            // Attribute present but unset: fall back rather than throw.
            res = DEFAULT_TIMEOUT_SECONDS;
        }
        return res.intValue();
    }

    /**
     * @param curi Current CrawlURI.  Used to get context.
     * @return Value of the maximum fetch bandwidth attribute; the default
     * is used when the attribute cannot be read or has no value.
     */
    private int getMaxFetchRate(CrawlURI curi) {
        Integer res;
        try {
            res = (Integer) getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
        } catch (Exception e) {
            res = DEFAULT_FETCH_BANDWIDTH_MAX;
        }
        if (res == null) {
            // Attribute present but unset: fall back rather than throw.
            res = DEFAULT_FETCH_BANDWIDTH_MAX;
        }
        return res.intValue();
    }

    /**
     * @param curi Current CrawlURI.  Used to get context.
     * @return Value of the maximum length attribute.  The current default
     * is substituted when the attribute cannot be read, has no value, or
     * still holds the old default.
     */
    private long getMaxLength(CrawlURI curi) {
        Long res;
        try {
            res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
            // Explicit null check instead of relying on a caught NPE.
            if (res == null
                    || res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
                res = DEFAULT_MAX_LENGTH_BYTES;
            }
        } catch (Exception e) {
            res = DEFAULT_MAX_LENGTH_BYTES;
        }
        return res.longValue();
    }

    /**
     * Load cookies from a file before the first fetch.
     * <p>
     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
     * Example entry of cookies.txt file:<br>
     * <br>
     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
     * <br>
     * Each line has 7 tab-separated fields:<br>
     * <li>1. DOMAIN: The domain that created and has access to the cookie
     * value.
     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
     * domain can access the cookie value.
     * <li>3. PATH: The path within the domain that the cookie value is valid
     * for.
     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
     * connection to access the cookie value.
     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
     * <li>6. NAME: The name of the cookie value
     * <li>7. VALUE: The cookie value
     *
     * @param cookiesFile file in the Netscape's 'cookies.txt' format.
     */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -