📄 adaptiverevisitfrontier.java

📁 这是个爬虫，和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/* AdaptiveRevisitFrontier.java** Created on Sep 13, 2004** Copyright (C) 2004 Kristinn Sigur?sson.** This file is part of the Heritrix web crawler (crawler.archive.org).** Heritrix is free software; you can redistribute it and/or modify* it under the terms of the GNU Lesser Public License as published by* the Free Software Foundation; either version 2.1 of the License, or* any later version.** Heritrix is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the* GNU Lesser Public License for more details.** You should have received a copy of the GNU Lesser Public License* along with Heritrix; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/package org.archive.crawler.frontier;import java.io.File;import java.io.IOException;import java.io.PrintWriter;import java.io.Serializable;import java.io.StringWriter;import java.io.Writer;import java.util.ArrayList;import java.util.Date;import java.util.Iterator;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import javax.management.AttributeNotFoundException;import org.apache.commons.httpclient.HttpStatus;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlServer;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.datamodel.UriUniqFilter;import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;import org.archive.crawler.event.CrawlStatusListener;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.Frontier;import org.archive.crawler.framework.FrontierMarker;import org.archive.crawler.framework.exceptions.EndedException;import 
org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;import org.archive.crawler.settings.ModuleType;import org.archive.crawler.settings.RegularExpressionConstraint;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.crawler.url.Canonicalizer;import org.archive.crawler.util.BdbUriUniqFilter;import org.archive.net.UURI;import org.archive.queue.MemQueue;import org.archive.queue.Queue;import org.archive.util.ArchiveUtils;/** * A Frontier that will repeatedly visit all encountered URIs.  * <p> * Wait time between visits is configurable and varies based on observed  * changes of documents. * <p> * The Frontier borrows many things from HostQueuesFrontier, but implements  * an entirely different strategy in issuing URIs and consequently in keeping a * record of discovered URIs. * * @author Kristinn Sigurdsson */public class AdaptiveRevisitFrontier extends ModuleType implements Frontier, FetchStatusCodes, CoreAttributeConstants,        AdaptiveRevisitAttributeConstants, CrawlStatusListener, HasUriReceiver {    private static final long serialVersionUID = -8666872690438543671L;    private static final Logger logger =        Logger.getLogger(AdaptiveRevisitFrontier.class.getName());    /** How many multiples of last fetch elapsed time to wait before recontacting     * same server */    public final static String ATTR_DELAY_FACTOR = "delay-factor";    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);        /** Always wait this long after one completion before recontacting     * same server, regardless of multiple */    public final static String ATTR_MIN_DELAY = "min-delay-ms";    // 2 seconds    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);        /** Never wait more than this long, regardless of multiple */    public final static String ATTR_MAX_DELAY = "max-delay-ms";        // 30 seconds   
 private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);        /** Maximum times to emit a CrawlURI without final disposition */    public final static String ATTR_MAX_RETRIES = "max-retries";    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);    /** For retryable problems, seconds to wait before a retry */    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";        // 15 minutes    private final static Long DEFAULT_RETRY_DELAY = new Long(900);        /** Maximum simultaneous requests in process to a host (queue) */    public final static String ATTR_HOST_VALENCE = "host-valence";    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1);     /** Number of hops of embeds (ERX) to bump to front of host queue */    public final static String ATTR_PREFERENCE_EMBED_HOPS =        "preference-embed-hops";    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS = new Integer(0);         /** Queue assignment to force on CrawlURIs. Intended to be used      *  via overrides*/    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";    protected final static String DEFAULT_FORCE_QUEUE = "";    /** Acceptable characters in forced queue names.     *  Word chars, dash, period, comma, colon */    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";    /** Should the queue assignment ignore www in hostnames, effectively      *  stripping them away.      */    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW = new Boolean(false);        /** Should the Frontier use a seperate 'already included' datastructure     *  or rely on the queues'.      
*/    public final static String ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter";    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER = new Boolean(false);        private CrawlController controller;        private AdaptiveRevisitQueueList hostQueues;        private UriUniqFilter alreadyIncluded;    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();    /** Policy for assigning CrawlURIs to named queues */    private QueueAssignmentPolicy queueAssignmentPolicy = null;        // top-level stats    private long succeededFetchCount = 0;    private long failedFetchCount = 0;    // URI's that are disregarded (for example because of robot.txt rules)    private long disregardedUriCount = 0;    private long totalProcessedBytes = 0;        // Flags indicating operator-specified crawl pause/end     private boolean shouldPause = false;    private boolean shouldTerminate = false;        public AdaptiveRevisitFrontier(String name) {        this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that " +                "will repeatedly visit all " +                "encountered URIs. Wait time between visits is configurable" +                " and is determined by seperate Processor(s). 
See " +                "WaitEvaluators " +                "See documentation for ARFrontier limitations.");            }    public AdaptiveRevisitFrontier(String name, String description) {        super(Frontier.ATTR_NAME, description);        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,                "How many multiples of last fetch elapsed time to wait before " +                "recontacting same server", DEFAULT_DELAY_FACTOR));            addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,                "Never wait more than this long, regardless of multiple",                DEFAULT_MAX_DELAY));            addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,                "Always wait this long after one completion before recontacting " +                "same server, regardless of multiple", DEFAULT_MIN_DELAY));             addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,                "How often to retry fetching a URI that failed to be retrieved.\n" +                "If zero, the crawler will get the robots.txt only.",                DEFAULT_MAX_RETRIES));            addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,                    "How long to wait by default until we retry fetching a" +                    " URI that failed to be retrieved (seconds). ",                    DEFAULT_RETRY_DELAY));            addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,                    "Number of embedded (or redirected) hops up to which " +                    "a URI has higher priority scheduling. For example, if set " +                    "to 1 (the default), items such as inline images (1-hop " +                    "embedded resources) will be scheduled ahead of all regular " +                    "links (or many-hop resources, like nested frames). 
If set to " +                    "zero, no preferencing will occur, and embeds/redirects are " +                    "scheduled the same as regular links.",                    DEFAULT_PREFERENCE_EMBED_HOPS));            Type t;            t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,                    "Maximum number of simultaneous requests to a single" +                    " host.",                    DEFAULT_HOST_VALENCE));            t.setExpertSetting(true);            t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,                    "If true then documents from x.com, www.x.com and any " +                    "www[0-9]+.x.com will be assigned to the same queue.",                    DEFAULT_QUEUE_IGNORE_WWW));            t.setExpertSetting(true);            t = addElementToDefinition(new SimpleType(                    ATTR_FORCE_QUEUE,                    "The queue name into which to force URIs. Should "                    + "be left blank at global level.  Specify a "                    + "per-domain/per-host override to force URIs into "                    + "a particular named queue, regardless of the assignment "                    + "policy in effect (domain or ip-based politeness). "                    + "This could be used on domains known to all be from "                    + "the same small set of IPs (eg blogspot, dailykos, etc.) 
"                    + "to simulate IP-based politeness, or it could be used if "                    + "you wanted to enforce politeness over a whole domain, even "                    + "though the subdomains are split across many IPs.",                    DEFAULT_FORCE_QUEUE));            t.setOverrideable(true);            t.setExpertSetting(true);            t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,                    Level.WARNING, "This field must contain only alphanumeric "                    + "characters plus period, dash, comma, colon, or underscore."));            t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,                    "If true then the Frontier will use a seperate " +                    "datastructure to detect and eliminate duplicates.\n" +                    "This is required for Canonicalization rules to work.",                    DEFAULT_USE_URI_UNIQ_FILTER));            t.setExpertSetting(true);            t.setOverrideable(false);        // Register persistent CrawlURI items         CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);    }    public synchronized void initialize(CrawlController c)            throws FatalConfigurationException, IOException {        controller = c;        controller.addCrawlStatusListener(this);        queueAssignmentPolicy = new HostnameQueueAssignmentPolicy();                hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),            c.getBdbEnvironment().getClassCatalog());                if(((Boolean)getUncheckedAttribute(                null,ATTR_USE_URI_UNIQ_FILTER)).booleanValue()){            alreadyIncluded = createAlreadyIncluded();        } else {            alreadyIncluded = null;        }                loadSeeds();    }    /**     * Create a UriUniqFilter that will serve as record      * of already seen URIs.     
*     * @return A UURISet that will serve as a record of already seen URIs     * @throws IOException     */    protected UriUniqFilter createAlreadyIncluded() throws IOException {        UriUniqFilter uuf = new BdbUriUniqFilter(                this.controller.getBdbEnvironment());        uuf.setDestination(this);        return uuf;    }        /**     * Loads the seeds     * <p>     * This method is called by initialize() and kickUpdate()     */    public void loadSeeds() {        Writer ignoredWriter = new StringWriter();        // Get the seeds to refresh.        Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);        while (iter.hasNext()) {            CandidateURI caUri =                CandidateURI.createSeedCandidateURI((UURI)iter.next());            caUri.setSchedulingDirective(CandidateURI.MEDIUM);            schedule(caUri);        }        batchFlush();        // save ignored items (if any) where they can be consulted later        AbstractFrontier.saveIgnoredItems(                ignoredWriter.toString(),                 controller.getDisk());    }        public String getClassKey(CandidateURI cauri) {        String queueKey = (String)getUncheckedAttribute(cauri,                ATTR_FORCE_QUEUE);            if ("".equals(queueKey)) {                // Typical case, barring overrides                queueKey =                    queueAssignmentPolicy.getClassKey(controller, cauri);                // The queueAssignmentPolicy is always based on Hostnames                // We may need to remove any www[0-9]{0,}\. prefixes from the                // hostnames                if(((Boolean)getUncheckedAttribute(                        cauri,ATTR_QUEUE_IGNORE_WWW)).booleanValue()){                    queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.","");                }            }            return queueKey;    }
12 3 4 下一页
💿 文件大小 9430 K
👤 上传用户 zergwyk
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#lucece
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -