
AdaptiveRevisitFrontier.java

Page 1 of 4
/* AdaptiveRevisitFrontier.java
 *
 * Created on Sep 13, 2004
 *
 * Copyright (C) 2004 Kristinn Sigurðsson.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.frontier;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.crawler.util.BdbUriUniqFilter;
import org.archive.net.UURI;
import org.archive.queue.MemQueue;
import org.archive.queue.Queue;
import org.archive.util.ArchiveUtils;

/**
 * A Frontier that will repeatedly visit all encountered URIs.
 * <p>
 * Wait time between visits is configurable and varies based on observed
 * changes of documents.
 * <p>
 * The Frontier borrows many things from HostQueuesFrontier, but implements
 * an entirely different strategy in issuing URIs and consequently in
 * keeping a record of discovered URIs.
 *
 * @author Kristinn Sigurdsson
 */
public class AdaptiveRevisitFrontier extends ModuleType
implements Frontier, FetchStatusCodes, CoreAttributeConstants,
        AdaptiveRevisitAttributeConstants, CrawlStatusListener,
        HasUriReceiver {

    private static final long serialVersionUID = -8666872690438543671L;

    private static final Logger logger =
        Logger.getLogger(AdaptiveRevisitFrontier.class.getName());

    /** How many multiples of last fetch elapsed time to wait before
     *  recontacting same server */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /** Always wait this long after one completion before recontacting
     *  same server, regardless of multiple */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";
    // 2 seconds
    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);

    /** Never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";
    // 30 seconds
    private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** Maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    /** For retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";
    // 15 minutes
    private final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** Maximum simultaneous requests in process to a host (queue) */
    public final static String ATTR_HOST_VALENCE = "host-valence";
    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1);

    /** Number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";
    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(0);

    /** Queue assignment to force on CrawlURIs. Intended to be used
     *  via overrides */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";

    /** Acceptable characters in forced queue names.
     *  Word chars, dash, period, comma, colon */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** Should the queue assignment ignore www in hostnames, effectively
     *  stripping them away.
     */
    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW =
        new Boolean(false);

    /** Should the Frontier use a separate 'already included' datastructure
     *  or rely on the queues'.
     */
    public final static String ATTR_USE_URI_UNIQ_FILTER =
        "use-uri-uniq-filter";
    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER =
        new Boolean(false);

    /** The Class to use for QueueAssignmentPolicy */
    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";
    protected final static String DEFAULT_QUEUE_ASSIGNMENT_POLICY =
        HostnameQueueAssignmentPolicy.class.getName();

    private CrawlController controller;

    private AdaptiveRevisitQueueList hostQueues;

    private UriUniqFilter alreadyIncluded;

    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();

    /** Policy for assigning CrawlURIs to named queues */
    private QueueAssignmentPolicy queueAssignmentPolicy = null;

    // top-level stats
    private long succeededFetchCount = 0;
    private long failedFetchCount = 0;
    // URIs that are disregarded (for example because of robots.txt rules)
    private long disregardedUriCount = 0;

    private long totalProcessedBytes = 0;

    // Flags indicating operator-specified crawl pause/end
    private boolean shouldPause = false;
    private boolean shouldTerminate = false;

    public AdaptiveRevisitFrontier(String name) {
        this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that " +
                "will repeatedly visit all " +
                "encountered URIs. Wait time between visits is configurable" +
                " and is determined by separate Processor(s). See " +
                "WaitEvaluators. " +
                "See documentation for ARFrontier limitations.");
    }

    public AdaptiveRevisitFrontier(String name, String description) {
        super(Frontier.ATTR_NAME, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
                "How many multiples of last fetch elapsed time to wait before " +
                "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
                "Never wait more than this long, regardless of multiple",
                DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
                "Always wait this long after one completion before recontacting " +
                "same server, regardless of multiple", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
                "How often to retry fetching a URI that failed to be retrieved.\n" +
                "If zero, the crawler will get the robots.txt only.",
                DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
                "How long to wait by default until we retry fetching a" +
                " URI that failed to be retrieved (seconds). ",
                DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
                "Number of embedded (or redirected) hops up to which " +
                "a URI has higher priority scheduling. For example, if set " +
                "to 1 (the default), items such as inline images (1-hop " +
                "embedded resources) will be scheduled ahead of all regular " +
                "links (or many-hop resources, like nested frames). If set to " +
                "zero, no preferencing will occur, and embeds/redirects are " +
                "scheduled the same as regular links.",
                DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
                "Maximum number of simultaneous requests to a single" +
                " host.",
                DEFAULT_HOST_VALENCE));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,
                "If true then documents from x.com, www.x.com and any " +
                "www[0-9]+.x.com will be assigned to the same queue.",
                DEFAULT_QUEUE_IGNORE_WWW));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(
                ATTR_FORCE_QUEUE,
                "The queue name into which to force URIs. Should "
                + "be left blank at global level.  Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
                DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
                Level.WARNING, "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
                "If true then the Frontier will use a separate " +
                "datastructure to detect and eliminate duplicates.\n" +
                "This is required for Canonicalization rules to work.",
                DEFAULT_USE_URI_UNIQ_FILTER));
        t.setExpertSetting(true);
        t.setOverrideable(false);

        // Read the list of permissible choices from heritrix.properties.
        // It's a list of space- or comma-separated values.
        String queueStr = System.getProperty(AbstractFrontier.class.getName() +
                "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
                HostnameQueueAssignmentPolicy.class.getName() + " " +
                IPQueueAssignmentPolicy.class.getName() + " " +
                BucketQueueAssignmentPolicy.class.getName() + " " +
                SurtAuthorityQueueAssignmentPolicy.class.getName() + " " +
                TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String[] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of " +
                    " assignment queue policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
                "Defines how to assign URIs to queues. Can assign by host, " +
                "by ip, and into one of a fixed set of buckets (1k). NOTE: " +
                "Use of policies other than the default " +
                "HostnameQueueAssignmentPolicy is untested and provided " +
                "for use at your own risk. Further, changing this policy " +
                "during a crawl, or between restarts using the same data " +
                "directory, is likely to cause unrecoverable problems.",
                DEFAULT_QUEUE_ASSIGNMENT_POLICY, queues));
        t.setExpertSetting(true);

        // Register persistent CrawlURI items
        CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
    }

    public synchronized void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        controller = c;
        controller.addCrawlStatusListener(this);

        String clsName = (String) getUncheckedAttribute(null,
                ATTR_QUEUE_ASSIGNMENT_POLICY);
        try {
            queueAssignmentPolicy =
                (QueueAssignmentPolicy) Class.forName(clsName).newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
            c.getBdbEnvironment().getClassCatalog());

        if (((Boolean) getUncheckedAttribute(
                null, ATTR_USE_URI_UNIQ_FILTER)).booleanValue()) {
            alreadyIncluded = createAlreadyIncluded();
        } else {
            alreadyIncluded = null;
        }

        loadSeeds();
    }

    /**
     * Create a UriUniqFilter that will serve as record
     * of already seen URIs.
     *
     * @return A UURISet that will serve as a record of already seen URIs
     * @throws IOException
     */
    protected UriUniqFilter createAlreadyIncluded() throws IOException {
        UriUniqFilter uuf = new BdbUriUniqFilter(
                this.controller.getBdbEnvironment());
        uuf.setDestination(this);
        return uuf;
    }

    /**
     * Loads the seeds
     * <p>
     * This method is called by initialize() and kickUpdate()
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        // Get the seeds to refresh.
        Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);
        while (iter.hasNext()) {
            CandidateURI caUri =
                CandidateURI.createSeedCandidateURI((UURI) iter.next());
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            schedule(caUri);
        }
        batchFlush();
        // save ignored items (if any) where they can be consulted later
        AbstractFrontier.saveIgnoredItems(
                ignoredWriter.toString(),
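
The listing breaks off mid-call at the end of page 1. The delay attributes declared near the top of the class (delay-factor, min-delay-ms, max-delay-ms) combine in the usual Heritrix politeness pattern that their own descriptions spell out: wait delay-factor times the duration of the last fetch, but never less than min-delay-ms and never more than max-delay-ms. The snippet below is a minimal illustrative sketch of that arithmetic only; it is not the class's actual code (which falls on the pages not shown), and the class and method names are made up for the example.

// Illustrative sketch of how the delay attributes above combine.
// NOT the class's own code; SnoozeTimeSketch/snoozeTimeMs are invented names.
public class SnoozeTimeSketch {

    /** Wait delay-factor times the last fetch's duration, but never less
     *  than min-delay-ms and never more than max-delay-ms. */
    static long snoozeTimeMs(long fetchDurationMs, float delayFactor,
            long minDelayMs, long maxDelayMs) {
        long snooze = (long) (fetchDurationMs * delayFactor);
        snooze = Math.max(snooze, minDelayMs);   // floor: min-delay-ms
        return Math.min(snooze, maxDelayMs);     // ceiling: max-delay-ms
    }

    public static void main(String[] args) {
        // With the defaults above (factor 5, 2 s floor, 30 s ceiling):
        System.out.println(snoozeTimeMs(700, 5f, 2000, 30000));   // 3500
        System.out.println(snoozeTimeMs(100, 5f, 2000, 30000));   // 2000 (floor)
        System.out.println(snoozeTimeMs(90000, 5f, 2000, 30000)); // 30000 (ceiling)
    }
}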
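
Similarly, the queue-ignore-www setting documented above promises that x.com, www.x.com and any www[0-9]+.x.com end up in the same host queue. Below is a hedged sketch of that hostname normalization; the real assignment is performed by the configured QueueAssignmentPolicy (e.g. HostnameQueueAssignmentPolicy, whose source is not on this page), and queueNameFor is a hypothetical name used only for illustration.

// Illustrative sketch of the queue-ignore-www behavior described above.
// NOT the policy's own code; QueueNameSketch/queueNameFor are invented names.
public class QueueNameSketch {

    static String queueNameFor(String hostname, boolean ignoreWww) {
        if (ignoreWww) {
            // Strip a leading "www", optionally followed by digits.
            return hostname.replaceFirst("^www[0-9]*\\.", "");
        }
        return hostname;
    }

    public static void main(String[] args) {
        System.out.println(queueNameFor("x.com", true));       // x.com
        System.out.println(queueNameFor("www.x.com", true));   // x.com
        System.out.println(queueNameFor("www12.x.com", true)); // x.com
    }
}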
