📄 CrawlURI.java
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * CrawlURI.java * Created on Apr 16, 2003 * * $Header$ */package org.archive.crawler.datamodel;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.util.ArrayList;import java.util.Collection;import java.util.HashSet;import java.util.Iterator;import java.util.List;import java.util.Set;import java.util.concurrent.CopyOnWriteArrayList;import org.apache.commons.httpclient.HttpStatus;import org.apache.commons.httpclient.URIException;import org.archive.crawler.datamodel.credential.CredentialAvatar;import org.archive.crawler.datamodel.credential.Rfc2617Credential;import org.archive.crawler.extractor.Link;import org.archive.crawler.framework.Processor;import org.archive.crawler.framework.ProcessorChain;import org.archive.crawler.util.Transform;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.Base32;import org.archive.util.HttpRecorder;import st.ata.util.AList;import st.ata.util.HashtableAList;/** * Represents a candidate URI and the associated state it * collects as it is crawled. * * <p>Core state is in instance variables but a flexible * attribute list is also available. 
Use this 'bucket' to carry
 * custom processing extracted data and state across CrawlURI
 * processing. See the {@link #putString(String, String)},
 * {@link #getString(String)}, etc.
 *
 * @author Gordon Mohr
 */
public class CrawlURI extends CandidateURI
implements FetchStatusCodes {

    private static final long serialVersionUID = 7874096757350100472L;

    /** Sentinel for numeric fields whose value has not yet been computed. */
    public static final int UNCALCULATED = -1;

    // INHERITED FROM CANDIDATEURI
    // uuri: core identity: the "usable URI" to be crawled
    // isSeed
    // inScopeVersion
    // pathFromSeed
    // via

    // Processing progress
    transient private Processor nextProcessor;
    transient private ProcessorChain nextProcessorChain;
    private int fetchStatus = 0;    // default to unattempted
    private int deferrals = 0;      // count of postponements for prerequisites
    private int fetchAttempts = 0;  // the number of fetch attempts that have been made
    transient private int threadNumber;

    // dynamic context
    /** @deprecated */
    private int linkHopCount = UNCALCULATED; // from seeds
    /** @deprecated */
    private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal

    // User agent to masquerade as when crawling this URI. If null, globals should be used
    private String userAgent = null;

    // Once a link extractor has finished processing this curi this will be
    // set as true
    transient private boolean linkExtractorFinished = false;

    /**
     * Protection against outlink overflow.
     * Change value by setting alternate maximum in heritrix.properties.
     */
    public static final int MAX_OUTLINKS = Integer.
        parseInt(System.getProperty(CrawlURI.class.getName() +
            ".maxOutLinks", "6000"));

    // Count of outlinks discarded once MAX_OUTLINKS is exceeded; transient,
    // reset per processing pass.
    transient private int discardedOutlinks = 0;

    ////////////////////////////////////////////////////////////////////
    private long contentSize = UNCALCULATED;
    private long contentLength = UNCALCULATED;

    /**
     * Current http recorder.
     *
     * Gets set upon successful request. Reset at start of processing chain.
     */
    private transient HttpRecorder httpRecorder = null;

    /**
     * Content type of a successfully fetched URI.
     *
     * May be null even on successfully fetched URI.
     */
    private String contentType = null;

    /**
     * True if this CrawlURI has been deemed a prerequisite by the
     * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
     *
     * This flag is used at least inside in the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * its a prerequisite needed by an earlier prerequisite tests (e.g. If
     * this is a robots.txt, then the subsequent login credentials prereq
     * test must not throw it out because its not a login curi).
     */
    private boolean prerequisite = false;

    /**
     * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
     */
    private boolean post = false;

    /**
     * Monotonically increasing number within a crawl;
     * useful for tending towards breadth-first ordering.
     * Will sometimes be truncated to 48 bits, so behavior
     * over 281 trillion instantiated CrawlURIs may be
     * buggy
     */
    protected long ordinal;

    /**
     * Cache of this candidate uuri as a string.
     *
     * Profiling shows us spending about 1-2% of total elapsed time in
     * toString.
     */
    private String cachedCrawlURIString = null;

    /**
     * Array to hold keys of alist members that persist across URI processings.
     * Any key mentioned in this list will not be cleared out at the end
     * of a pass down the processing chain.
     */
    private static final List<Object> alistPersistentMember =
        new CopyOnWriteArrayList<Object>(
            new String [] {A_CREDENTIAL_AVATARS_KEY});

    /**
     * A digest (hash, usually SHA1) of retrieved content-body.
     */
    private byte[] contentDigest = null;
    private String contentDigestScheme = null;

    /**
     * Create a new instance of CrawlURI from a {@link UURI}.
     *
     * @param uuri the UURI to base this CrawlURI on.
*/ public CrawlURI(UURI uuri) { super(uuri); } /** * Create a new instance of CrawlURI from a {@link CandidateURI} * * @param caUri the CandidateURI to base this CrawlURI on. * @param o Monotonically increasing number within a crawl. */ @SuppressWarnings("deprecation") public CrawlURI(CandidateURI caUri, long o) { super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(), caUri.getViaContext()); ordinal = o; setIsSeed(caUri.isSeed()); setSchedulingDirective(caUri.getSchedulingDirective()); setAList(caUri.getAList()); } /** * Takes a status code and converts it into a human readable string. * * @param code the status code * @return a human readable string declaring what the status code is. */ public static String fetchStatusCodesToString(int code){ switch(code){ // DNS case S_DNS_SUCCESS : return "DNS-1-OK"; // HTTP Informational 1xx case 100 : return "HTTP-100-Info-Continue"; case 101 : return "HTTP-101-Info-Switching Protocols"; // HTTP Successful 2xx case 200 : return "HTTP-200-Success-OK"; case 201 : return "HTTP-201-Success-Created"; case 202 : return "HTTP-202-Success-Accepted"; case 203 : return "HTTP-203-Success-Non-Authoritative"; case 204 : return "HTTP-204-Success-No Content "; case 205 : return "HTTP-205-Success-Reset Content"; case 206 : return "HTTP-206-Success-Partial Content"; // HTTP Redirection 3xx case 300 : return "HTTP-300-Redirect-Multiple Choices"; case 301 : return "HTTP-301-Redirect-Moved Permanently"; case 302 : return "HTTP-302-Redirect-Found"; case 303 : return "HTTP-303-Redirect-See Other"; case 304 : return "HTTP-304-Redirect-Not Modified"; case 305 : return "HTTP-305-Redirect-Use Proxy"; case 307 : return "HTTP-307-Redirect-Temporary Redirect"; // HTTP Client Error 4xx case 400 : return "HTTP-400-ClientErr-Bad Request"; case 401 : return "HTTP-401-ClientErr-Unauthorized"; case 402 : return "HTTP-402-ClientErr-Payment Required"; case 403 : return "HTTP-403-ClientErr-Forbidden"; case 404 : return "HTTP-404-ClientErr-Not Found"; case 
405 : return "HTTP-405-ClientErr-Method Not Allowed"; case 407 : return "HTTP-406-ClientErr-Not Acceptable"; case 408 : return "HTTP-407-ClientErr-Proxy Authentication Required"; case 409 : return "HTTP-408-ClientErr-Request Timeout"; case 410 : return "HTTP-409-ClientErr-Conflict"; case 406 : return "HTTP-410-ClientErr-Gone"; case 411 : return "HTTP-411-ClientErr-Length Required"; case 412 : return "HTTP-412-ClientErr-Precondition Failed"; case 413 : return "HTTP-413-ClientErr-Request Entity Too Large"; case 414 : return "HTTP-414-ClientErr-Request-URI Too Long"; case 415 : return "HTTP-415-ClientErr-Unsupported Media Type"; case 416 : return "HTTP-416-ClientErr-Requested Range Not Satisfiable"; case 417 : return "HTTP-417-ClientErr-Expectation Failed"; // HTTP Server Error 5xx case 500 : return "HTTP-500-ServerErr-Internal Server Error"; case 501 : return "HTTP-501-ServerErr-Not Implemented"; case 502 : return "HTTP-502-ServerErr-Bad Gateway"; case 503 : return "HTTP-503-ServerErr-Service Unavailable"; case 504 : return "HTTP-504-ServerErr-Gateway Timeout"; case 505 : return "HTTP-505-ServerErr-HTTP Version Not Supported"; // Heritrix internal codes (all negative numbers case S_BLOCKED_BY_USER: return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user"; case S_BLOCKED_BY_CUSTOM_PROCESSOR: return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR + ")-Blocked by custom prefetch processor"; case S_DELETED_BY_USER: return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user"; case S_CONNECT_FAILED: return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed"; case S_CONNECT_LOST: return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost"; case S_DEEMED_CHAFF: return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff"; case S_DEFERRED: return "Heritrix(" + S_DEFERRED + ")-Deferred"; case S_DOMAIN_UNRESOLVABLE: return "Heritrix(" + S_DOMAIN_UNRESOLVABLE + ")-Domain unresolvable"; case S_OUT_OF_SCOPE: return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope"; case 
S_DOMAIN_PREREQUISITE_FAILURE: return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE + ")-Domain prerequisite failure"; case S_ROBOTS_PREREQUISITE_FAILURE: return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE + ")-Robots prerequisite failure"; case S_OTHER_PREREQUISITE_FAILURE: return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE + ")-Other prerequisite failure"; case S_PREREQUISITE_UNSCHEDULABLE_FAILURE: return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE + ")-Prerequisite unschedulable failure"; case S_ROBOTS_PRECLUDED: return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded"; case S_RUNTIME_EXCEPTION: return "Heritrix(" + S_RUNTIME_EXCEPTION + ")-Runtime exception"; case S_SERIOUS_ERROR: return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error"; case S_TIMEOUT: return "Heritrix(" + S_TIMEOUT + ")-Timeout"; case S_TOO_MANY_EMBED_HOPS: return "Heritrix(" + S_TOO_MANY_EMBED_HOPS + ")-Too many embed hops"; case S_TOO_MANY_LINK_HOPS: return "Heritrix(" + S_TOO_MANY_LINK_HOPS + ")-Too many link hops"; case S_TOO_MANY_RETRIES: return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries"; case S_UNATTEMPTED: return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted"; case S_UNFETCHABLE_URI: return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI"; case S_PROCESSING_THREAD_KILLED: return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" + "Processing thread killed"; // Unknown return code default : return Integer.toString(code); } } /** * Return the overall/fetch status of this CrawlURI for its * current trip through the processing loop. * * @return a value from FetchStatusCodes */ public int getFetchStatus(){ return fetchStatus; } /** * Set the overall/fetch status of this CrawlURI for * its current trip through the processing loop. * * @param newstatus a value from FetchStatusCodes */ public void setFetchStatus(int newstatus){ fetchStatus = newstatus; } /** * Get the number of attempts at getting the document referenced by this * URI. 
*
 * @return the number of attempts at getting the document referenced by this
 *         URI.
 */
public int getFetchAttempts() {
    return fetchAttempts;
}

/**
 * Increment the number of attempts at getting the document referenced by
 * this URI.
 *
 * @return the number of attempts at getting the document referenced by this
 *         URI.
 */
public int incrementFetchAttempts() {
    // TODO: rename, this is actually processing-loop-attempts
    // Note: post-increment, so the value BEFORE this attempt is returned.
    return fetchAttempts++;
}

/**
 * Reset fetchAttempts counter.
 */
public void resetFetchAttempts() {
    this.fetchAttempts = 0;
}

/**
 * Reset deferrals counter.
 */
public void resetDeferrals() {
    this.deferrals = 0;
}

/**
 * Get the next processor to process this URI.
 *
 * @return the processor that should process this URI next.
 */
public Processor nextProcessor() {
    return nextProcessor;
}

/**
 * Get the processor chain that should be processing this URI after the
 * current chain is finished with it.
 *
 * @return the next processor chain to process this URI.
 */
public ProcessorChain nextProcessorChain() {
    return nextProcessorChain;
}

/**
 * Set the next processor to process this URI.
 *
 * @param processor the next processor to process this URI.
 */
public void setNextProcessor(Processor processor) {
    nextProcessor = processor;
}

/**
 * Set the next processor chain to process this URI.
 *
 * @param nextProcessorChain the next processor chain to process this URI.
 */
public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
    this.nextProcessorChain = nextProcessorChain;
}

/**
 * Do all actions associated with setting a <code>CrawlURI</code> as
 * requiring a prerequisite.
 *
 * @param lastProcessorChain Last processor chain reference. This chain is
 * where this <code>CrawlURI</code> goes next.
 * @param preq Object to set a prerequisite.
 * @throws URIException
 */
public void markPrerequisite(String preq,
        ProcessorChain lastProcessorChain) throws URIException {
    // Record the prereq URI, mark this URI deferred, and route it back
    // through the given (last) chain so the prereq is fetched first.
    // NOTE(review): createLink/incrementDeferrals/skipToProcessorChain are
    // defined outside this span (inherited or elsewhere in this class).
    Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
    setPrerequisiteUri(link);
    incrementDeferrals();
    setFetchStatus(S_DEFERRED);
    skipToProcessorChain(lastProcessorChain);
}

/**
 * Set a prerequisite for this URI.
 * <p>
 * A prerequisite is a URI that must be crawled before this URI can be
 * crawled.
 *
 * @param link Link to set as prereq.
 */
public void setPrerequisiteUri(Object link) {
    // Stored in the flexible AList 'bucket' under A_PREREQUISITE_URI.
    putObject(A_PREREQUISITE_URI, link);
}

/**
 * Get the prerequisite for this URI.
 * <p>
 * A prerequisite is a URI that must be crawled before this URI can be
 * crawled.
 *
 * @return the prerequisite for this URI or null if no prerequisite.
 */
public Object getPrerequisiteUri() {
    return getObject(A_PREREQUISITE_URI);
}

/**
 * @return True if this CrawlURI has a prerequisite.
 */
public boolean hasPrerequisiteUri() {
    return containsKey(A_PREREQUISITE_URI);
}

/**
 * Returns true if this CrawlURI is a prerequisite.
 *
 * @return true if this CrawlURI is a prerequisite.
 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -