📄 fetchhttp.java
字号:
/* FetchHTTP.java * * $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $ * * Created on Jun 5, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler.fetcher;import it.unimi.dsi.mg4j.util.MutableString;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.RandomAccessFile;import java.security.KeyManagementException;import java.security.KeyStoreException;import java.security.MessageDigest;import java.security.NoSuchAlgorithmException;import java.util.Collection;import java.util.HashSet;import java.util.Iterator;import java.util.List;import java.util.ListIterator;import java.util.Map;import java.util.Set;import java.util.logging.Level;import java.util.logging.Logger;import java.net.InetAddress;import java.net.UnknownHostException;import javax.management.AttributeNotFoundException;import javax.management.MBeanException;import javax.management.ReflectionException;import javax.net.ssl.SSLContext;import javax.net.ssl.SSLSocketFactory;import javax.net.ssl.TrustManager;import org.apache.commons.httpclient.Cookie;import org.apache.commons.httpclient.Header;import org.apache.commons.httpclient.HostConfiguration;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpConnection;import org.apache.commons.httpclient.HttpConnectionManager;import org.apache.commons.httpclient.HttpException;import org.apache.commons.httpclient.HttpMethod;import org.apache.commons.httpclient.HttpMethodBase;import org.apache.commons.httpclient.HttpState;import org.apache.commons.httpclient.HttpStatus;import org.apache.commons.httpclient.HttpVersion;import org.apache.commons.httpclient.auth.AuthChallengeParser;import org.apache.commons.httpclient.auth.AuthScheme;import org.apache.commons.httpclient.auth.BasicScheme;import org.apache.commons.httpclient.auth.DigestScheme;import org.apache.commons.httpclient.auth.MalformedChallengeException;import org.apache.commons.httpclient.cookie.CookiePolicy;import org.apache.commons.httpclient.params.HttpClientParams;import org.apache.commons.httpclient.params.HttpConnectionManagerParams;import org.apache.commons.httpclient.params.HttpMethodParams;import org.apache.commons.httpclient.protocol.Protocol;import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;import org.archive.crawler.Heritrix;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlHost;import org.archive.crawler.datamodel.CrawlOrder;import org.archive.crawler.datamodel.CrawlServer;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.CredentialStore;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.datamodel.ServerCache;import org.archive.crawler.datamodel.credential.Credential;import org.archive.crawler.datamodel.credential.CredentialAvatar;import org.archive.crawler.datamodel.credential.Rfc2617Credential;import org.archive.crawler.deciderules.DecideRule;import org.archive.crawler.deciderules.DecideRuleSequence;import org.archive.crawler.event.CrawlStatusListener;import org.archive.crawler.framework.Filter;import org.archive.crawler.framework.Processor;import org.archive.crawler.settings.MapType;import org.archive.crawler.settings.SettingsHandler;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.StringList;import org.archive.crawler.settings.Type;import org.archive.httpclient.ConfigurableX509TrustManager;import org.archive.httpclient.HttpRecorderGetMethod;import org.archive.httpclient.HttpRecorderMethod;import org.archive.httpclient.HttpRecorderPostMethod;import org.archive.httpclient.SingleHttpConnectionManager;import org.archive.io.ObjectPlusFilesInputStream;import org.archive.io.RecorderLengthExceededException;import org.archive.io.RecorderTimeoutException;import org.archive.io.RecorderTooMuchHeaderException;import org.archive.util.ArchiveUtils;import org.archive.util.HttpRecorder;import org.archive.util.bdbje.EnhancedEnvironment;import st.ata.util.AList;import com.sleepycat.bind.serial.SerialBinding;import com.sleepycat.bind.serial.StoredClassCatalog;import com.sleepycat.bind.tuple.StringBinding;import com.sleepycat.collections.StoredSortedMap;import com.sleepycat.je.Database;import com.sleepycat.je.DatabaseConfig;import com.sleepycat.je.DatabaseException;import com.sleepycat.je.Environment;/** * HTTP fetcher that uses <a * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons * HttpClient</a> library. * * @author Gordon Mohr * @author Igor Ranitovic * @author others * @version $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $ */public class FetchHTTP extends Processorimplements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener { // be robust against trivial implementation changes private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(FetchHTTP.class,1); private static Logger logger = Logger.getLogger(FetchHTTP.class.getName()); public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST; public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT; public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds"; public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms"; public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes"; public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file"; public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file"; public static final String ATTR_ACCEPT_HEADERS = "accept-headers"; public static final String ATTR_DEFAULT_ENCODING = "default-encoding"; public static final String ATTR_DIGEST_CONTENT = "digest-content"; public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm"; public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth"; /** * SSL trust level setting attribute name. */ public static final String ATTR_TRUST = "trust-level"; private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200); private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000); private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0); private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0; /** * This is the default value pre-1.4. Needs special handling else * treated as negative number doing math later in processing. */ private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L; /** * Default character encoding to use for pages that do not specify. */ private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING; /** * Default whether to perform on-the-fly digest hashing of content-bodies. */ static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true); /** * The different digest algorithms to choose between, * SHA-1 or MD-5 at the moment. */ public static final String SHA1 = "sha1"; public static final String MD5 = "md5"; public static String [] DIGEST_ALGORITHMS = {SHA1, MD5}; /** * Default algorithm to use for message disgesting. */ public static final String DEFAULT_DIGEST_ALGORITHM = SHA1; private transient HttpClient http = null; /** * How many 'instant retries' of HttpRecoverableExceptions have occurred * * Would like it to be 'long', but longs aren't atomic */ private int recoveryRetries = 0; /** * Count of crawl uris handled. * Would like to be 'long', but longs aren't atomic */ private int curisHandled = 0; /** * Rules to apply mid-fetch, just after receipt of the response * headers before we start to download body. */ public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules"; /** * What to log if midfetch abort. */ private static final String MIDFETCH_ABORT_LOG = "midFetchAbort"; public static final String ATTR_SEND_CONNECTION_CLOSE = "send-connection-close"; private static final Header HEADER_SEND_CONNECTION_CLOSE = new Header("Connection", "close"); public static final String ATTR_SEND_REFERER = "send-referer"; public static final String ATTR_SEND_RANGE = "send-range"; public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since"; public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match"; public static final String REFERER = "Referer"; public static final String RANGE = "Range"; public static final String RANGE_PREFIX = "bytes=0-"; public static final String HTTP_SCHEME = "http"; public static final String HTTPS_SCHEME = "https"; public static final String ATTR_IGNORE_COOKIES = "ignore-cookies"; private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false); public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies"; private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true); public static final String ATTR_LOCAL_ADDRESS = "bind-address"; /** * Database backing cookie map, if using BDB */ protected Database cookieDb; /** * Name of cookie BDB Database */ public static final String COOKIEDB_NAME = "http_cookies"; static { Protocol.registerProtocol("http", new Protocol("http", new HeritrixProtocolSocketFactory(), 80)); try { Protocol.registerProtocol("https", new Protocol("https", ((ProtocolSocketFactory) new HeritrixSSLProtocolSocketFactory()), 443)); } catch (KeyManagementException e) { e.printStackTrace(); } catch (KeyStoreException e) { e.printStackTrace(); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); } } static final String SERVER_CACHE_KEY = "heritrix.server.cache"; static final String SSL_FACTORY_KEY = "heritrix.ssl.factory"; /*** * Socket factory that has the configurable trust manager installed. */ private SSLSocketFactory sslfactory = null; /** * Constructor. * * @param name Name of this processor. */ public FetchHTTP(String name) { super(name, "HTTP Fetcher"); addElementToDefinition( new DecideRuleSequence(ATTR_MIDFETCH_DECIDE_RULES, "DecideRules which, if final decision is REJECT, " + "abort fetch after headers before all content is" + "read.")); addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS, "If the fetch is not completed in this number of seconds, " + "even if it is making progress, give up. The URI will be " + "annotated as timeTrunc. Set to zero for no timeout. " + "(This is not recommended: threads could wait indefinitely " + "for the fetch to end.)", DEFAULT_TIMEOUT_SECONDS)); Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS, "If a socket is unresponsive for this number of milliseconds, " + "give up on that connects/read. (This does not necessarily give " + "up on the fetch immediately; connects are subject to retries " + "and reads will be retried until " + ATTR_TIMEOUT_SECONDS + " have elapsed. Set to zero for no socket timeout. (This is " + "note recommended: a socket operation could hand indefinitely.", DEFAULT_SOTIMEOUT_MS)); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX, "The maximum KB/sec to use when fetching data from a server. " + "0 means no maximum. Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX + ".", DEFAULT_FETCH_BANDWIDTH_MAX)); e.setExpertSetting(true); e.setOverrideable(true); addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES, "Maximum length in bytes to fetch.\n" + "Fetch is truncated at this length. A value of 0 means no limit.", DEFAULT_MAX_LENGTH_BYTES)); e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -