⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
/* FetchHTTP.java * * $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $ * * Created on Jun 5, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler.fetcher;import it.unimi.dsi.mg4j.util.MutableString;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.RandomAccessFile;import java.security.KeyManagementException;import java.security.KeyStoreException;import java.security.MessageDigest;import java.security.NoSuchAlgorithmException;import java.util.Collection;import java.util.HashSet;import java.util.Iterator;import java.util.List;import java.util.ListIterator;import java.util.Map;import java.util.Set;import java.util.logging.Level;import java.util.logging.Logger;import java.net.InetAddress;import java.net.UnknownHostException;import javax.management.AttributeNotFoundException;import javax.management.MBeanException;import javax.management.ReflectionException;import javax.net.ssl.SSLContext;import javax.net.ssl.SSLSocketFactory;import javax.net.ssl.TrustManager;import org.apache.commons.httpclient.Cookie;import org.apache.commons.httpclient.Header;import 
org.apache.commons.httpclient.HostConfiguration;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpConnection;import org.apache.commons.httpclient.HttpConnectionManager;import org.apache.commons.httpclient.HttpException;import org.apache.commons.httpclient.HttpMethod;import org.apache.commons.httpclient.HttpMethodBase;import org.apache.commons.httpclient.HttpState;import org.apache.commons.httpclient.HttpStatus;import org.apache.commons.httpclient.HttpVersion;import org.apache.commons.httpclient.auth.AuthChallengeParser;import org.apache.commons.httpclient.auth.AuthScheme;import org.apache.commons.httpclient.auth.BasicScheme;import org.apache.commons.httpclient.auth.DigestScheme;import org.apache.commons.httpclient.auth.MalformedChallengeException;import org.apache.commons.httpclient.cookie.CookiePolicy;import org.apache.commons.httpclient.params.HttpClientParams;import org.apache.commons.httpclient.params.HttpConnectionManagerParams;import org.apache.commons.httpclient.params.HttpMethodParams;import org.apache.commons.httpclient.protocol.Protocol;import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;import org.archive.crawler.Heritrix;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlHost;import org.archive.crawler.datamodel.CrawlOrder;import org.archive.crawler.datamodel.CrawlServer;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.CredentialStore;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.datamodel.ServerCache;import org.archive.crawler.datamodel.credential.Credential;import org.archive.crawler.datamodel.credential.CredentialAvatar;import org.archive.crawler.datamodel.credential.Rfc2617Credential;import org.archive.crawler.deciderules.DecideRule;import org.archive.crawler.deciderules.DecideRuleSequence;import org.archive.crawler.event.CrawlStatusListener;import 
org.archive.crawler.framework.Filter;import org.archive.crawler.framework.Processor;import org.archive.crawler.settings.MapType;import org.archive.crawler.settings.SettingsHandler;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.StringList;import org.archive.crawler.settings.Type;import org.archive.httpclient.ConfigurableX509TrustManager;import org.archive.httpclient.HttpRecorderGetMethod;import org.archive.httpclient.HttpRecorderMethod;import org.archive.httpclient.HttpRecorderPostMethod;import org.archive.httpclient.SingleHttpConnectionManager;import org.archive.io.ObjectPlusFilesInputStream;import org.archive.io.RecorderLengthExceededException;import org.archive.io.RecorderTimeoutException;import org.archive.io.RecorderTooMuchHeaderException;import org.archive.util.ArchiveUtils;import org.archive.util.HttpRecorder;import org.archive.util.bdbje.EnhancedEnvironment;import st.ata.util.AList;import com.sleepycat.bind.serial.SerialBinding;import com.sleepycat.bind.serial.StoredClassCatalog;import com.sleepycat.bind.tuple.StringBinding;import com.sleepycat.collections.StoredSortedMap;import com.sleepycat.je.Database;import com.sleepycat.je.DatabaseConfig;import com.sleepycat.je.DatabaseException;import com.sleepycat.je.Environment;/** * HTTP fetcher that uses <a * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons * HttpClient</a> library. 
* * @author Gordon Mohr * @author Igor Ranitovic * @author others * @version $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $ */public class FetchHTTP extends Processorimplements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {    // be robust against trivial implementation changes    private static final long serialVersionUID =        ArchiveUtils.classnameBasedUID(FetchHTTP.class,1);        private static Logger logger = Logger.getLogger(FetchHTTP.class.getName());    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";    public static final String ATTR_DIGEST_CONTENT = "digest-content";    public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";       /**     * SSL trust level setting attribute name.     */    public static final String ATTR_TRUST = "trust-level";        private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;    /**     * This is the default value pre-1.4. Needs special handling else     * treated as negative number doing math later in processing.     
*/
    // Same value as the former literal 9223372036854775807L.
    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = Long.MAX_VALUE;

    /**
     * Default character encoding to use for pages that do not specify.
     */
    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;

    /**
     * Default whether to perform on-the-fly digest hashing of content-bodies.
     */
    static Boolean DEFAULT_DIGEST_CONTENT = Boolean.TRUE;

    /**
     * The different digest algorithms to choose between,
     * SHA-1 or MD-5 at the moment.
     */
    public static final String SHA1 = "sha1";
    public static final String MD5 = "md5";
    // NOTE(review): public, non-final and mutable; left unchanged because
    // tightening it would alter the class's public interface.
    public static String [] DIGEST_ALGORITHMS = {SHA1, MD5};

    /**
     * Default algorithm to use for message digesting.
     */
    public static final String  DEFAULT_DIGEST_ALGORITHM = SHA1;

    /** HttpClient instance; transient, so it is not serialized. */
    private transient HttpClient http = null;

    /**
     * How many 'instant retries' of HttpRecoverableExceptions have occurred
     *
     * Would like it to be 'long', but longs aren't atomic
     */
    private int recoveryRetries = 0;

    /**
     * Count of crawl uris handled.
     * Would like to be 'long', but longs aren't atomic
     */
    private int curisHandled = 0;

    /**
     * Rules to apply mid-fetch, just after receipt of the response
     * headers before we start to download body.
     */
    public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";

    /**
     * What to log if midfetch abort.
*/
    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";

    public static final String ATTR_SEND_CONNECTION_CLOSE =
        "send-connection-close";
    /** Prebuilt "Connection: close" request header. */
    private static final Header HEADER_SEND_CONNECTION_CLOSE =
        new Header("Connection", "close");
    public static final String ATTR_SEND_REFERER = "send-referer";
    public static final String ATTR_SEND_RANGE = "send-range";
    public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
    public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";
    public static final String REFERER = "Referer";
    public static final String RANGE = "Range";
    public static final String RANGE_PREFIX = "bytes=0-";
    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";

    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
    private static Boolean DEFAULT_IGNORE_COOKIES = Boolean.FALSE;
    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
    private static Boolean DEFAULT_BDB_COOKIES = Boolean.TRUE;

    public static final String ATTR_LOCAL_ADDRESS = "bind-address";

    /**
     * Database backing cookie map, if using BDB
     */
    protected Database cookieDb;

    /**
     * Name of cookie BDB Database
     */
    public static final String COOKIEDB_NAME = "http_cookies";

    /*
     * Registers Heritrix's own socket factories for the "http" (port 80) and
     * "https" (port 443) schemes when this class is initialized. If the SSL
     * factory cannot be constructed, the failure is logged through the class
     * logger (with its cause) and class initialization continues without
     * performing the https registration.
     */
    static {
        Protocol.registerProtocol("http", new Protocol("http",
            new HeritrixProtocolSocketFactory(), 80));
        try {
            // NOTE(review): the explicit cast presumably selects the
            // Protocol(String, ProtocolSocketFactory, int) overload -- keep
            // it unless the overload set has been checked.
            Protocol.registerProtocol("https",
                new Protocol("https", ((ProtocolSocketFactory)
                    new HeritrixSSLProtocolSocketFactory()), 443));
        } catch (KeyManagementException e) {
            logger.log(Level.SEVERE,
                "Failed to register https protocol socket factory", e);
        } catch (KeyStoreException e) {
            logger.log(Level.SEVERE,
                "Failed to register https protocol socket factory", e);
        } catch (NoSuchAlgorithmException e) {
            logger.log(Level.SEVERE,
                "Failed to register https protocol socket factory", e);
        }
    }

    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";
  /***     * Socket factory that has the configurable trust manager installed.     */    private SSLSocketFactory sslfactory = null;        /**     * Constructor.     *     * @param name Name of this processor.     */    public FetchHTTP(String name) {        super(name, "HTTP Fetcher");        addElementToDefinition(            new DecideRuleSequence(ATTR_MIDFETCH_DECIDE_RULES,                 "DecideRules which, if final decision is REJECT, " +                "abort fetch after headers before all content is" +                "read."));                addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,            "If the fetch is not completed in this number of seconds, "            + "even if it is making progress, give up. The URI will be "            + "annotated as timeTrunc. Set to zero for no timeout. "            + "(This is not recommended: threads could wait indefinitely "            + "for the fetch to end.)",            DEFAULT_TIMEOUT_SECONDS));        Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,            "If a socket is unresponsive for this number of milliseconds, " +            "give up on that connects/read. (This does not necessarily give " +            "up on the fetch immediately; connects are subject to retries " +            "and reads will be retried until " + ATTR_TIMEOUT_SECONDS +            " have elapsed. Set to zero for no socket timeout. (This is " +            "note recommended: a socket operation could hand indefinitely.",                DEFAULT_SOTIMEOUT_MS));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX,            "The maximum KB/sec to use when fetching data from a server. " +            "0 means no maximum.  
Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX             + ".", DEFAULT_FETCH_BANDWIDTH_MAX));        e.setExpertSetting(true);        e.setOverrideable(true);        addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,            "Maximum length in bytes to fetch.\n" +            "Fetch is truncated at this length. A value of 0 means no limit.",            DEFAULT_MAX_LENGTH_BYTES));        e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -