/* AbstractFrontier
 *
 * $Id: AbstractFrontier.java 5053 2007-04-10 02:34:20Z gojomo $
 *
 * Created on Aug 17, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.frontier;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;

/**
 * Shared facilities for Frontier implementations.
 *
 * @author gojomo
 */
public abstract class AbstractFrontier extends ModuleType
implements CrawlStatusListener, Frontier, FetchStatusCodes,
        CoreAttributeConstants, Serializable {
    private static final Logger logger = Logger
            .getLogger(AbstractFrontier.class.getName());

    protected transient CrawlController controller;

    /** ordinal numbers to assign to created CrawlURIs */
    protected AtomicLong nextOrdinal = new AtomicLong(1);

    /** should the frontier hold any threads asking for URIs? */
    protected boolean shouldPause = false;

    /**
     * should the frontier send an EndedException to any threads asking for
     * URIs?
     */
    protected transient boolean shouldTerminate = false;

    /**
     * how many multiples of last fetch elapsed time to wait before
     * recontacting same server
     */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /**
     * always wait this long after one completion before recontacting same
     * server, regardless of multiple
     */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";
    // 3 secs.
    protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);

    /** never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";
    // 30 secs
    protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);
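    /*
     * Illustrative sketch (not part of the original source): concrete
     * Frontier subclasses typically combine the three politeness settings
     * above roughly as follows when deciding how long to wait before
     * recontacting a server. The method and parameter names here are
     * hypothetical.
     *
     *   long politenessDelay(long lastFetchDurationMs, float delayFactor,
     *           long minDelayMs, long maxDelayMs) {
     *       // wait delay-factor multiples of the time the last fetch took
     *       long delay = (long) (lastFetchDurationMs * delayFactor);
     *       delay = Math.max(delay, minDelayMs); // but at least min-delay-ms
     *       delay = Math.min(delay, maxDelayMs); // and at most max-delay-ms
     *       return delay;
     *   }
     */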
    /** number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";
    protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(1);

    /** maximum per-host bandwidth usage */
    public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
        "max-per-host-bandwidth-usage-KB-sec";
    protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
        new Integer(0);

    /** maximum overall bandwidth usage */
    public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
        "total-bandwidth-usage-KB-sec";
    protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
        new Integer(0);

    /** for retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";
    // 15 mins
    protected final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";

    /** queue assignment to force onto CrawlURIs; intended to be overridden */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";
    // word chars, dash, period, comma, colon
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";
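    /*
     * Illustrative sketch (not part of the original source): the pattern
     * above admits queue names built only from word characters, dash,
     * period, comma, and colon, e.g.
     *
     *   java.util.regex.Pattern.matches(ACCEPTABLE_FORCE_QUEUE,
     *           "blogspot-ips");      // true
     *   java.util.regex.Pattern.matches(ACCEPTABLE_FORCE_QUEUE,
     *           "bad queue/name");    // false (space and slash rejected)
     *
     * An operator forcing a set of hosts into one queue would set the
     * force-queue-assignment override to a name matching this pattern.
     */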
    /** whether pause, rather than finish, when crawl appears done */
    public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";
    // TODO: change default to true once well-tested
    protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;

    /** whether to pause at crawl start */
    public final static String ATTR_PAUSE_AT_START = "pause-at-start";
    protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;

    /** whether to tag seeds with their own URI as a heritable 'source' */
    public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";
    protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;

    /**
     * Recover log on or off attribute.
     */
    protected final static String ATTR_RECOVERY_ENABLED =
        "recovery-log-enabled";
    protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
        Boolean.TRUE;

    // top-level stats
    protected long queuedUriCount = 0; // total URIs queued to be visited

    protected long succeededFetchCount = 0;

    protected long failedFetchCount = 0;

    // URIs that are disregarded (for example because of robots.txt rules)
    protected long disregardedUriCount = 0;

    /**
     * Used when bandwidth constraints are used.
     */
    protected long totalProcessedBytes = 0;

    private transient long nextURIEmitTime = 0;

    protected long processedBytesAfterLastEmittedURI = 0;

    protected int lastMaxBandwidthKB = 0;

    /** Policy for assigning CrawlURIs to named queues */
    protected transient QueueAssignmentPolicy queueAssignmentPolicy = null;

    /**
     * Crawl replay logger.
     *
     * Currently captures Frontier/URI transitions. Can be null if user
     * chose not to run a recovery.log.
     */
    private transient FrontierJournal recover = null;

    /** file collecting report of ignored seed-file entries (if any) */
    public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";

    /**
     * @param name Name of this frontier.
     * @param description Description for this frontier.
     */
    public AbstractFrontier(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
                "How many multiples of last fetch elapsed time to wait before "
                        + "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
                "Never wait more than this long.", DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
                "Always wait this long after one completion before "
                        + "recontacting same server.", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
                "How many times to retry fetching a URI that failed to be "
                        + "retrieved. If zero, the crawler will get the "
                        + "robots.txt only.", DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
                "How long to wait by default until we retry fetching a"
                        + " URI that failed to be retrieved (seconds).",
                DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
                "Number of embedded (or redirected) hops up to which "
                        + "a URI has higher priority scheduling. For example, "
                        + "if set to 1 (the default), items such as inline "
                        + "images (1-hop embedded resources) will be scheduled "
                        + "ahead of all regular links (or many-hop resources, "
                        + "like nested frames). If set to zero, no "
                        + "preferencing will occur, and embeds/redirects are "
                        + "scheduled the same as regular links.",
                DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(
                ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
                "The maximum average bandwidth the crawler is allowed to use. "
                        + "The actual read speed is not affected by this "
                        + "setting, it only holds back new URIs from being "
                        + "processed when the bandwidth usage has been too "
                        + "high. 0 means no bandwidth limitation.",
                DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
                ATTR_MAX_HOST_BANDWIDTH_USAGE,
                "The maximum average bandwidth the crawler is allowed to use "
                        + "per host. The actual read speed is not affected by "
                        + "this setting, it only holds back new URIs from "
                        + "being processed when the bandwidth usage has been "
                        + "too high. 0 means no bandwidth limitation.",
                DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
        t.setExpertSetting(true);

        // Read the list of permissible choices from heritrix.properties.
        // It's a list of space- or comma-separated values.
        String queueStr = System.getProperty(AbstractFrontier.class.getName()
                + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
                HostnameQueueAssignmentPolicy.class.getName() + " "
                        + IPQueueAssignmentPolicy.class.getName() + " "
                        + BucketQueueAssignmentPolicy.class.getName() + " "
                        + SurtAuthorityQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String[] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of "
                    + " assignment queue policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
                "Defines how to assign URIs to queues. Can assign by host, "
                        + "by IP, and into one of a fixed set of buckets (1k).",
                queues[0], queues));
        t.setExpertSetting(true);
        t.setOverrideable(false);
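        /*
         * Illustrative sketch (not part of the original source): because the
         * selectable policies are read via System.getProperty above, an
         * operator can offer a custom QueueAssignmentPolicy by listing it
         * under this key before launch (the first entry becomes the
         * default), e.g.
         *
         *   org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy=\
         *       com.example.MyQueueAssignmentPolicy,\
         *       org.archive.crawler.frontier.HostnameQueueAssignmentPolicy
         *
         * com.example.MyQueueAssignmentPolicy is a hypothetical class name.
         */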
" + "This could be used on domains known to all be from " + "the same small set of IPs (eg blogspot, dailykos, etc.) " + "to simulate IP-based politeness, or it could be used if " + "you wanted to enforce politeness over a whole domain, even " + "though the subdomains are split across many IPs.", DEFAULT_FORCE_QUEUE)); t.setOverrideable(true); t.setExpertSetting(true); t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING, "This field must contain only alphanumeric " + "characters plus period, dash, comma, colon, or underscore.")); t = addElementToDefinition(new SimpleType( ATTR_PAUSE_AT_START, "Whether to pause when the crawl begins, before any URIs " + "are tried. This gives the operator a chance to verify or " + "adjust the crawl before actual work begins. " + "Default is false.", DEFAULT_PAUSE_AT_START)); t = addElementToDefinition(new SimpleType( ATTR_PAUSE_AT_FINISH, "Whether to pause when the crawl appears finished, rather " + "than immediately end the crawl. This gives the operator an " + "opportunity to view crawl results, and possibly add URIs or " + "adjust settings, while the crawl state is still available. " + "Default is false.", DEFAULT_PAUSE_AT_FINISH)); t.setOverrideable(false); t = addElementToDefinition(new SimpleType( ATTR_SOURCE_TAG_SEEDS, "Whether to tag seeds with their own URI as a heritable " + "'source' String, which will be carried-forward to all URIs " + "discovered on paths originating from that seed. When " + "present, such source tags appear in the second-to-last " + "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS)); t.setOverrideable(false); t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED, "Set to false to disable recovery log writing. Do this if " + "you you are using the checkpoint feature for recovering " + "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED)); t.setExpertSetting(true); // No sense in it being overrideable. t.setOverrideable(false); } public void start() { if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_START)) .booleanValue()) { // trigger crawl-wide pause controller.requestCrawlPause(); } else { // simply begin unpause(); } } synchronized public void pause() { shouldPause = true; } synchronized public void unpause() { shouldPause = false; notifyAll(); } public void initialize(CrawlController c) throws FatalConfigurationException, IOException { c.addCrawlStatusListener(this); File logsDisk = null; try { logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);