
📄 CrawlURI.java

📁 Heritrix is an open-source, extensible web crawler project. Heritrix is designed to strictly obey the exclusion directives in robots.txt files and META robots tags.
💻 Java
📖 Page 1 of 3
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * CrawlURI.java
 * Created on Apr 16, 2003
 *
 * $Header$
 */
package org.archive.crawler.datamodel;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;

import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.datamodel.credential.Rfc2617Credential;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.ProcessorChain;
import org.archive.crawler.util.Transform;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Base32;
import org.archive.util.HttpRecorder;

import st.ata.util.AList;
import st.ata.util.HashtableAList;

/**
 * Represents a candidate URI and the associated state it
 * collects as it is crawled.
 *
 * <p>Core state is in instance variables but a flexible
 * attribute list is also available. Use this 'bucket' to carry
 * custom processing extracted data and state across CrawlURI
 * processing.  See the {@link #putString(String, String)},
 * {@link #getString(String)}, etc.
 *
 * @author Gordon Mohr
 */
public class CrawlURI extends CandidateURI
implements FetchStatusCodes {

    private static final long serialVersionUID = 7874096757350100472L;

    public static final int UNCALCULATED = -1;

    // INHERITED FROM CANDIDATEURI
    // uuri: core identity: the "usable URI" to be crawled
    // isSeed
    // inScopeVersion
    // pathFromSeed
    // via

    // Processing progress
    transient private Processor nextProcessor;
    transient private ProcessorChain nextProcessorChain;
    private int fetchStatus = 0;    // default to unattempted
    private int deferrals = 0;      // count of postponements for prerequisites
    private int fetchAttempts = 0;  // the number of fetch attempts that have been made
    transient private int threadNumber;

    // dynamic context
    /** @deprecated */
    private int linkHopCount = UNCALCULATED; // from seeds
    /** @deprecated */
    private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal

    // User agent to masquerade as when crawling this URI. If null, globals
    // should be used.
    private String userAgent = null;

    // Once a link extractor has finished processing this curi this will be
    // set as true.
    transient private boolean linkExtractorFinished = false;

    /**
     * Protection against outlink overflow.
     * Change value by setting alternate maximum in heritrix.properties.
     */
    public static final int MAX_OUTLINKS = Integer.
        parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
            "6000"));
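    // Configuration sketch (an assumption, not part of the original source):
    // MAX_OUTLINKS is read once, when this class is loaded, so any override
    // must be in place beforehand. The value 10000 below is an arbitrary
    // example.
    //
    //   // On the JVM command line (or via heritrix.properties):
    //   //   -Dorg.archive.crawler.datamodel.CrawlURI.maxOutLinks=10000
    //   // Or programmatically, before CrawlURI is first loaded:
    //   System.setProperty(
    //       "org.archive.crawler.datamodel.CrawlURI.maxOutLinks", "10000");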
    transient private int discardedOutlinks = 0;

    ////////////////////////////////////////////////////////////////////
    private long contentSize = UNCALCULATED;
    private long contentLength = UNCALCULATED;

    /**
     * Current http recorder.
     *
     * Gets set upon successful request.  Reset at start of processing chain.
     */
    private transient HttpRecorder httpRecorder = null;

    /**
     * Content type of a successfully fetched URI.
     *
     * May be null even on a successfully fetched URI.
     */
    private String contentType = null;

    /**
     * True if this CrawlURI has been deemed a prerequisite by the
     * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
     *
     * This flag is used at least inside the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * it's a prerequisite needed by an earlier prerequisite test (e.g. if
     * this is a robots.txt, then the subsequent login-credentials prereq
     * test must not throw it out because it's not a login curi).
     */
    private boolean prerequisite = false;

    /**
     * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
     */
    private boolean post = false;

    /**
     * Monotonically increasing number within a crawl;
     * useful for tending towards breadth-first ordering.
     * Will sometimes be truncated to 48 bits, so behavior
     * over 281 trillion instantiated CrawlURIs may be
     * buggy.
     */
    protected long ordinal;

    /**
     * Cache of this candidate uuri as a string.
     *
     * Profiling shows us spending about 1-2% of total elapsed time in
     * toString.
     */
    private String cachedCrawlURIString = null;

    /**
     * Array to hold keys of alist members that persist across URI processings.
     * Any key mentioned in this list will not be cleared out at the end
     * of a pass down the processing chain.
     */
    private static final List<Object> alistPersistentMember
        = new CopyOnWriteArrayList<Object>(
            new String [] {A_CREDENTIAL_AVATARS_KEY});

    /**
     * A digest (hash, usually SHA1) of retrieved content-body.
     */
    private byte[] contentDigest = null;
    private String contentDigestScheme = null;

    /**
     * Create a new instance of CrawlURI from a {@link UURI}.
     *
     * @param uuri the UURI to base this CrawlURI on.
     */
    public CrawlURI(UURI uuri) {
        super(uuri);
    }

    /**
     * Create a new instance of CrawlURI from a {@link CandidateURI}.
     *
     * @param caUri the CandidateURI to base this CrawlURI on.
     * @param o Monotonically increasing number within a crawl.
     */
    @SuppressWarnings("deprecation")
    public CrawlURI(CandidateURI caUri, long o) {
        super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
            caUri.getViaContext());
        ordinal = o;
        setIsSeed(caUri.isSeed());
        setSchedulingDirective(caUri.getSchedulingDirective());
        setAList(caUri.getAList());
    }
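    // Construction sketch (hedged): assumes UURIFactory.getInstance accepts
    // a plain URI string; "candidate" and "nextOrdinal" are stand-in names
    // for an already-scoped CandidateURI and a frontier-maintained counter.
    //
    //   UURI uuri = UURIFactory.getInstance("http://crawler.archive.org/");
    //   CrawlURI direct = new CrawlURI(uuri);
    //   CrawlURI promoted = new CrawlURI(candidate, nextOrdinal++);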
    /**
     * Takes a status code and converts it into a human readable string.
     *
     * @param code the status code
     * @return a human readable string declaring what the status code is.
     */
    public static String fetchStatusCodesToString(int code){
        switch(code){
            // DNS
            case S_DNS_SUCCESS : return "DNS-1-OK";
            // HTTP Informational 1xx
            case 100  : return "HTTP-100-Info-Continue";
            case 101  : return "HTTP-101-Info-Switching Protocols";
            // HTTP Successful 2xx
            case 200  : return "HTTP-200-Success-OK";
            case 201  : return "HTTP-201-Success-Created";
            case 202  : return "HTTP-202-Success-Accepted";
            case 203  : return "HTTP-203-Success-Non-Authoritative";
            case 204  : return "HTTP-204-Success-No Content";
            case 205  : return "HTTP-205-Success-Reset Content";
            case 206  : return "HTTP-206-Success-Partial Content";
            // HTTP Redirection 3xx
            case 300  : return "HTTP-300-Redirect-Multiple Choices";
            case 301  : return "HTTP-301-Redirect-Moved Permanently";
            case 302  : return "HTTP-302-Redirect-Found";
            case 303  : return "HTTP-303-Redirect-See Other";
            case 304  : return "HTTP-304-Redirect-Not Modified";
            case 305  : return "HTTP-305-Redirect-Use Proxy";
            case 307  : return "HTTP-307-Redirect-Temporary Redirect";
            // HTTP Client Error 4xx
            case 400  : return "HTTP-400-ClientErr-Bad Request";
            case 401  : return "HTTP-401-ClientErr-Unauthorized";
            case 402  : return "HTTP-402-ClientErr-Payment Required";
            case 403  : return "HTTP-403-ClientErr-Forbidden";
            case 404  : return "HTTP-404-ClientErr-Not Found";
            case 405  : return "HTTP-405-ClientErr-Method Not Allowed";
            case 406  : return "HTTP-406-ClientErr-Not Acceptable";
            case 407  : return "HTTP-407-ClientErr-Proxy Authentication Required";
            case 408  : return "HTTP-408-ClientErr-Request Timeout";
            case 409  : return "HTTP-409-ClientErr-Conflict";
            case 410  : return "HTTP-410-ClientErr-Gone";
            case 411  : return "HTTP-411-ClientErr-Length Required";
            case 412  : return "HTTP-412-ClientErr-Precondition Failed";
            case 413  : return "HTTP-413-ClientErr-Request Entity Too Large";
            case 414  : return "HTTP-414-ClientErr-Request-URI Too Long";
            case 415  : return "HTTP-415-ClientErr-Unsupported Media Type";
            case 416  : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
            case 417  : return "HTTP-417-ClientErr-Expectation Failed";
            // HTTP Server Error 5xx
            case 500  : return "HTTP-500-ServerErr-Internal Server Error";
            case 501  : return "HTTP-501-ServerErr-Not Implemented";
            case 502  : return "HTTP-502-ServerErr-Bad Gateway";
            case 503  : return "HTTP-503-ServerErr-Service Unavailable";
            case 504  : return "HTTP-504-ServerErr-Gateway Timeout";
            case 505  : return "HTTP-505-ServerErr-HTTP Version Not Supported";
            // Heritrix internal codes (all negative numbers)
            case S_BLOCKED_BY_USER:
                return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
            case S_BLOCKED_BY_CUSTOM_PROCESSOR:
                return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
                    ")-Blocked by custom prefetch processor";
            case S_DELETED_BY_USER:
                return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
            case S_CONNECT_FAILED:
                return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
            case S_CONNECT_LOST:
                return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
            case S_DEEMED_CHAFF:
                return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
            case S_DEFERRED:
                return "Heritrix(" + S_DEFERRED + ")-Deferred";
            case S_DOMAIN_UNRESOLVABLE:
                return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
                    + ")-Domain unresolvable";
            case S_OUT_OF_SCOPE:
                return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
            case S_DOMAIN_PREREQUISITE_FAILURE:
                return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
                    + ")-Domain prerequisite failure";
            case S_ROBOTS_PREREQUISITE_FAILURE:
                return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
                    + ")-Robots prerequisite failure";
            case S_OTHER_PREREQUISITE_FAILURE:
                return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
                    + ")-Other prerequisite failure";
            case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
                return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
                    + ")-Prerequisite unschedulable failure";
            case S_ROBOTS_PRECLUDED:
                return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
            case S_RUNTIME_EXCEPTION:
                return "Heritrix(" + S_RUNTIME_EXCEPTION
                    + ")-Runtime exception";
            case S_SERIOUS_ERROR:
                return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
            case S_TIMEOUT:
                return "Heritrix(" + S_TIMEOUT + ")-Timeout";
            case S_TOO_MANY_EMBED_HOPS:
                return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
                    + ")-Too many embed hops";
            case S_TOO_MANY_LINK_HOPS:
                return "Heritrix(" + S_TOO_MANY_LINK_HOPS
                    + ")-Too many link hops";
            case S_TOO_MANY_RETRIES:
                return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
            case S_UNATTEMPTED:
                return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
            case S_UNFETCHABLE_URI:
                return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
            case S_PROCESSING_THREAD_KILLED:
                return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
                    "Processing thread killed";
            // Unknown return code
            default : return Integer.toString(code);
        }
    }
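    // Usage sketch: the expected strings follow directly from the switch
    // above; unknown codes fall through to the bare integer.
    //
    //   fetchStatusCodesToString(200);        // "HTTP-200-Success-OK"
    //   fetchStatusCodesToString(S_DEFERRED); // "Heritrix(" + S_DEFERRED + ")-Deferred"
    //   fetchStatusCodesToString(999);        // "999"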
")-Connection failed";            case S_CONNECT_LOST:                return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";            case S_DEEMED_CHAFF:                return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";            case S_DEFERRED:                return "Heritrix(" + S_DEFERRED + ")-Deferred";            case S_DOMAIN_UNRESOLVABLE:                return "Heritrix(" + S_DOMAIN_UNRESOLVABLE                        + ")-Domain unresolvable";            case S_OUT_OF_SCOPE:                return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";            case S_DOMAIN_PREREQUISITE_FAILURE:                return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE                        + ")-Domain prerequisite failure";            case S_ROBOTS_PREREQUISITE_FAILURE:                return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE                        + ")-Robots prerequisite failure";            case S_OTHER_PREREQUISITE_FAILURE:                return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE                        + ")-Other prerequisite failure";            case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:                return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE                        + ")-Prerequisite unschedulable failure";            case S_ROBOTS_PRECLUDED:                return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";            case S_RUNTIME_EXCEPTION:                return "Heritrix(" + S_RUNTIME_EXCEPTION                        + ")-Runtime exception";            case S_SERIOUS_ERROR:                return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";            case S_TIMEOUT:                return "Heritrix(" + S_TIMEOUT + ")-Timeout";            case S_TOO_MANY_EMBED_HOPS:                return "Heritrix(" + S_TOO_MANY_EMBED_HOPS                        + ")-Too many embed hops";            case S_TOO_MANY_LINK_HOPS:                return "Heritrix(" + S_TOO_MANY_LINK_HOPS                        + ")-Too many link hops";            case S_TOO_MANY_RETRIES:                return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";            case S_UNATTEMPTED:                return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";            case S_UNFETCHABLE_URI:                return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";            case S_PROCESSING_THREAD_KILLED:                return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +                    "Processing thread killed";            // Unknown return code            default : return Integer.toString(code);        }    }    /**     * Return the overall/fetch status of this CrawlURI for its     * current trip through the processing loop.     *     * @return a value from FetchStatusCodes     */    public int getFetchStatus(){        return fetchStatus;    }    /**     * Set the overall/fetch status of this CrawlURI for     * its current trip through the processing loop.     *     * @param newstatus a value from FetchStatusCodes     */    public void setFetchStatus(int newstatus){        fetchStatus = newstatus;    }    /**     * Get the number of attempts at getting the document referenced by this     * URI.     *     * @return the number of attempts at getting the document referenced by this     *         URI.     */    public int getFetchAttempts() {        return fetchAttempts;    }    /**     * Increment the number of attempts at getting the document referenced by     * this URI.     
    /**
     * Increment the number of attempts at getting the document referenced by
     * this URI.
     *
     * @return the number of attempts at getting the document referenced by
     *         this URI.
     */
    public int incrementFetchAttempts() {
        // TODO: rename, this is actually processing-loop-attempts
        return fetchAttempts++;
    }

    /**
     * Reset fetchAttempts counter.
     */
    public void resetFetchAttempts() {
        this.fetchAttempts = 0;
    }

    /**
     * Reset deferrals counter.
     */
    public void resetDeferrals() {
        this.deferrals = 0;
    }

    /**
     * Get the next processor to process this URI.
     *
     * @return the processor that should process this URI next.
     */
    public Processor nextProcessor() {
        return nextProcessor;
    }

    /**
     * Get the processor chain that should be processing this URI after the
     * current chain is finished with it.
     *
     * @return the next processor chain to process this URI.
     */
    public ProcessorChain nextProcessorChain() {
        return nextProcessorChain;
    }

    /**
     * Set the next processor to process this URI.
     *
     * @param processor the next processor to process this URI.
     */
    public void setNextProcessor(Processor processor) {
        nextProcessor = processor;
    }

    /**
     * Set the next processor chain to process this URI.
     *
     * @param nextProcessorChain the next processor chain to process this URI.
     */
    public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
        this.nextProcessorChain = nextProcessorChain;
    }

    /**
     * Do all actions associated with setting a <code>CrawlURI</code> as
     * requiring a prerequisite.
     *
     * @param lastProcessorChain Last processor chain reference.  This chain is
     * where this <code>CrawlURI</code> goes next.
     * @param preq Object to set a prerequisite.
     * @throws URIException
     */
    public void markPrerequisite(String preq,
            ProcessorChain lastProcessorChain) throws URIException {
        Link link = createLink(preq, Link.PREREQ_MISC, Link.PREREQ_HOP);
        setPrerequisiteUri(link);
        incrementDeferrals();
        setFetchStatus(S_DEFERRED);
        skipToProcessorChain(lastProcessorChain);
    }

    /**
     * Set a prerequisite for this URI.
     * <p>
     * A prerequisite is a URI that must be crawled before this URI can be
     * crawled.
     *
     * @param link Link to set as prereq.
     */
    public void setPrerequisiteUri(Object link) {
        putObject(A_PREREQUISITE_URI, link);
    }

    /**
     * Get the prerequisite for this URI.
     * <p>
     * A prerequisite is a URI that must be crawled before this URI can be
     * crawled.
     *
     * @return the prerequisite for this URI or null if no prerequisite.
     */
    public Object getPrerequisiteUri() {
        return getObject(A_PREREQUISITE_URI);
    }

    /**
     * @return True if this CrawlURI has a prerequisite.
     */
    public boolean hasPrerequisiteUri() {
        return containsKey(A_PREREQUISITE_URI);
    }

    /**
     * Returns true if this CrawlURI is a prerequisite.
     *
     * @return true if this CrawlURI is a prerequisite.
     */
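    // Prerequisite-flow sketch (hedged), referring to markPrerequisite and
    // the accessors above; "chain" stands in for the last ProcessorChain
    // reference a processor would have at hand.
    //
    //   curi.markPrerequisite("http://example.com/robots.txt", chain);
    //   if (curi.hasPrerequisiteUri()) {
    //       Link preq = (Link) curi.getPrerequisiteUri();
    //       // ...schedule preq ahead of curi...
    //   }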
