
📄 AbstractFrontier.java

📁 Crawler source code, well suited for use in combination with Lucene
💻 JAVA
📖 Page 1 of 3
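Since the description pairs this crawler source with Lucene, here is a minimal hedged sketch of the indexing side: text harvested from fetched pages is written to a Lucene index. The PageIndexer class, its field names, and the index directory are hypothetical glue, not part of this file; IndexWriter, IndexWriterConfig, StandardAnalyzer, StringField, and TextField are standard Lucene core classes (constructor signatures as in recent Lucene releases).

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

/** Hypothetical glue: writes text harvested from fetched pages to a Lucene index. */
public class PageIndexer implements AutoCloseable {
    private final IndexWriter writer;

    public PageIndexer(String indexDir) throws IOException {
        writer = new IndexWriter(FSDirectory.open(Paths.get(indexDir)),
                new IndexWriterConfig(new StandardAnalyzer()));
    }

    /** Adds one page: URL stored verbatim for retrieval, body text analyzed for search. */
    public void index(String url, String bodyText) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("url", url, Field.Store.YES));
        doc.add(new TextField("contents", bodyText, Field.Store.NO));
        writer.addDocument(doc);
    }

    @Override
    public void close() throws IOException {
        writer.close();
    }
}

Something in the crawl's post-processing chain would then call index(uri, extractedText) for each successfully fetched page, and close() once the crawl ends.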
/* AbstractFrontier
 *
 * $Id: AbstractFrontier.java 5053 2007-04-10 02:34:20Z gojomo $
 *
 * Created on Aug 17, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.frontier;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;

/**
 * Shared facilities for Frontier implementations.
 *
 * @author gojomo
 */
public abstract class AbstractFrontier extends ModuleType
        implements CrawlStatusListener, Frontier, FetchStatusCodes,
        CoreAttributeConstants, Serializable {
    private static final Logger logger = Logger
            .getLogger(AbstractFrontier.class.getName());

    protected transient CrawlController controller;

    /** ordinal numbers to assign to created CrawlURIs */
    protected AtomicLong nextOrdinal = new AtomicLong(1);

    /** should the frontier hold any threads asking for URIs? */
    protected boolean shouldPause = false;

    /**
     * should the frontier send an EndedException to any threads asking for
     * URIs?
     */
    protected transient boolean shouldTerminate = false;

    /**
     * how many multiples of last fetch elapsed time to wait before
     * recontacting same server
     */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /**
     * always wait this long after one completion before recontacting same
     * server, regardless of multiple
     */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";
    // 3 secs.
    protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);

    /** never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";
    // 30 secs
    protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";
    protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(1);

    /** maximum per-host bandwidth usage */
    public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
        "max-per-host-bandwidth-usage-KB-sec";
    protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
        new Integer(0);

    /** maximum overall bandwidth usage */
    public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
        "total-bandwidth-usage-KB-sec";
    protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
        new Integer(0);

    /** for retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";
    // 15 mins
    protected final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";

    /** queue assignment to force onto CrawlURIs; intended to be overridden */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";
    // word chars, dash, period, comma, colon
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** whether to pause, rather than finish, when crawl appears done */
    public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";
    // TODO: change default to true once well-tested
    protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;

    /** whether to pause at crawl start */
    public final static String ATTR_PAUSE_AT_START = "pause-at-start";
    protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;

    /** whether to tag seeds with their own URI as a heritable 'source' */
    public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";
    protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;

    /**
     * Recover log on or off attribute.
     */
    protected final static String ATTR_RECOVERY_ENABLED =
        "recovery-log-enabled";
    protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
        Boolean.TRUE;

    // top-level stats
    protected long queuedUriCount = 0; // total URIs queued to be visited
    protected long succeededFetchCount = 0;
    protected long failedFetchCount = 0;
    protected long disregardedUriCount = 0; // URIs that are disregarded (for
                                            // example because of robots.txt rules)

    /**
     * Used when bandwidth constraints are used.
     */
    protected long totalProcessedBytes = 0;
    private transient long nextURIEmitTime = 0;
    protected long processedBytesAfterLastEmittedURI = 0;
    protected int lastMaxBandwidthKB = 0;

    /** Policy for assigning CrawlURIs to named queues */
    protected transient QueueAssignmentPolicy queueAssignmentPolicy = null;

    /**
     * Crawl replay logger.
     *
     * Currently captures Frontier/URI transitions.
     * Can be null if user chose not to run a recovery.log.
     */
    private transient FrontierJournal recover = null;

    /** file collecting report of ignored seed-file entries (if any) */
    public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";

    /**
     * @param name Name of this frontier.
     * @param description Description for this frontier.
     */
    public AbstractFrontier(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
                "How many multiples of last fetch elapsed time to wait before "
                        + "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
                "Never wait more than this long.", DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
                "Always wait this long after one completion before recontacting "
                        + "same server.", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
                "How often to retry fetching a URI that failed to be retrieved. "
                        + "If zero, the crawler will get the robots.txt only.",
                DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
                "How long to wait by default until we retry fetching a"
                        + " URI that failed to be retrieved (seconds). ",
                DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(
                ATTR_PREFERENCE_EMBED_HOPS,
                "Number of embedded (or redirected) hops up to which "
                + "a URI has higher priority scheduling. For example, if set "
                + "to 1 (the default), items such as inline images (1-hop "
                + "embedded resources) will be scheduled ahead of all regular "
                + "links (or many-hop resources, like nested frames). If set to "
                + "zero, no preferencing will occur, and embeds/redirects are "
                + "scheduled the same as regular links.",
                DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(
                ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
                "The maximum average bandwidth the crawler is allowed to use. "
                + "The actual read speed is not affected by this setting, it only "
                + "holds back new URIs from being processed when the bandwidth "
                + "usage has been too high. 0 means no bandwidth limitation.",
                DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
                ATTR_MAX_HOST_BANDWIDTH_USAGE,
                "The maximum average bandwidth the crawler is allowed to use per "
                + "host. The actual read speed is not affected by this setting, "
                + "it only holds back new URIs from being processed when the "
                + "bandwidth usage has been too high. 0 means no bandwidth "
                + "limitation.", DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
        t.setExpertSetting(true);

        // Read the list of permissible choices from heritrix.properties.
        // It's a list of space- or comma-separated values.
        String queueStr = System.getProperty(AbstractFrontier.class.getName() +
                "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
                HostnameQueueAssignmentPolicy.class.getName() + " " +
                IPQueueAssignmentPolicy.class.getName() + " " +
                BucketQueueAssignmentPolicy.class.getName() + " " +
                SurtAuthorityQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String[] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of " +
                    "assignment queue policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
                "Defines how to assign URIs to queues. Can assign by host, " +
                "by ip, and into one of a fixed set of buckets (1k).",
                queues[0], queues));
        t.setExpertSetting(true);
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
                ATTR_FORCE_QUEUE,
                "The queue name into which to force URIs. Should "
                + "be left blank at global level.  Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
                DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
                Level.WARNING, "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(
                ATTR_PAUSE_AT_START,
                "Whether to pause when the crawl begins, before any URIs " +
                "are tried. This gives the operator a chance to verify or " +
                "adjust the crawl before actual work begins. " +
                "Default is false.", DEFAULT_PAUSE_AT_START));
        t = addElementToDefinition(new SimpleType(
                ATTR_PAUSE_AT_FINISH,
                "Whether to pause when the crawl appears finished, rather "
                + "than immediately end the crawl. This gives the operator an "
                + "opportunity to view crawl results, and possibly add URIs or "
                + "adjust settings, while the crawl state is still available. "
                + "Default is false.", DEFAULT_PAUSE_AT_FINISH));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(
                ATTR_SOURCE_TAG_SEEDS,
                "Whether to tag seeds with their own URI as a heritable " +
                "'source' String, which will be carried-forward to all URIs " +
                "discovered on paths originating from that seed. When " +
                "present, such source tags appear in the second-to-last " +
                "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
                "Set to false to disable recovery log writing.  Do this if " +
                "you are using the checkpoint feature for recovering " +
                "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED));
        t.setExpertSetting(true);
        // No sense in it being overrideable.
        t.setOverrideable(false);
    }

    public void start() {
        if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_START))
                .booleanValue()) {
            // trigger crawl-wide pause
            controller.requestCrawlPause();
        } else {
            // simply begin
            unpause();
        }
    }

    synchronized public void pause() {
        shouldPause = true;
    }

    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

    public void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        c.addCrawlStatusListener(this);
        File logsDisk = null;
        try {
            logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);

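The listing breaks off here at the page boundary, midway through initialize(); the remainder follows on pages 2 and 3. One detail worth calling out from the constructor above: the set of selectable queue-assignment policies is read from a system property named after this class plus ".queue-assignment-policy", falling back to the four built-in policies. As a minimal hedged sketch (the custom policy class name is hypothetical), additional policies could be offered by setting that property before the frontier is constructed:

// Must run before AbstractFrontier's constructor executes, e.g. early in startup.
// "com.example.MyQueueAssignmentPolicy" is a hypothetical custom policy;
// the built-in HostnameQueueAssignmentPolicy is listed first, keeping it the default.
System.setProperty(
        "org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy",
        "org.archive.crawler.frontier.HostnameQueueAssignmentPolicy,"
        + "com.example.MyQueueAssignmentPolicy");

The constructor splits the value on commas or whitespace, and the first entry becomes the default choice offered in the settings.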