📄 adaptiverevisitfrontier.java
字号:
/* AdaptiveRevisitFrontier.java** Created on Sep 13, 2004** Copyright (C) 2004 Kristinn Sigur?sson.** This file is part of the Heritrix web crawler (crawler.archive.org).** Heritrix is free software; you can redistribute it and/or modify* it under the terms of the GNU Lesser Public License as published by* the Free Software Foundation; either version 2.1 of the License, or* any later version.** Heritrix is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the* GNU Lesser Public License for more details.** You should have received a copy of the GNU Lesser Public License* along with Heritrix; if not, write to the Free Software* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/package org.archive.crawler.frontier;import java.io.File;import java.io.IOException;import java.io.PrintWriter;import java.io.Serializable;import java.io.StringWriter;import java.io.Writer;import java.util.ArrayList;import java.util.Date;import java.util.Iterator;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import javax.management.AttributeNotFoundException;import org.apache.commons.httpclient.HttpStatus;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlServer;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.datamodel.UriUniqFilter;import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;import org.archive.crawler.event.CrawlStatusListener;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.Frontier;import org.archive.crawler.framework.FrontierMarker;import org.archive.crawler.framework.exceptions.EndedException;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;import org.archive.crawler.settings.ModuleType;import org.archive.crawler.settings.RegularExpressionConstraint;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.crawler.url.Canonicalizer;import org.archive.crawler.util.BdbUriUniqFilter;import org.archive.net.UURI;import org.archive.queue.MemQueue;import org.archive.queue.Queue;import org.archive.util.ArchiveUtils;/** * A Frontier that will repeatedly visit all encountered URIs. * <p> * Wait time between visits is configurable and varies based on observed * changes of documents. * <p> * The Frontier borrows many things from HostQueuesFrontier, but implements * an entirely different strategy in issuing URIs and consequently in keeping a * record of discovered URIs. * * @author Kristinn Sigurdsson */public class AdaptiveRevisitFrontier extends ModuleType implements Frontier, FetchStatusCodes, CoreAttributeConstants, AdaptiveRevisitAttributeConstants, CrawlStatusListener, HasUriReceiver { private static final long serialVersionUID = -8666872690438543671L; private static final Logger logger = Logger.getLogger(AdaptiveRevisitFrontier.class.getName()); /** How many multiples of last fetch elapsed time to wait before recontacting * same server */ public final static String ATTR_DELAY_FACTOR = "delay-factor"; private final static Float DEFAULT_DELAY_FACTOR = new Float(5); /** Always wait this long after one completion before recontacting * same server, regardless of multiple */ public final static String ATTR_MIN_DELAY = "min-delay-ms"; // 2 seconds private final static Integer DEFAULT_MIN_DELAY = new Integer(2000); /** Never wait more than this long, regardless of multiple */ public final static String ATTR_MAX_DELAY = "max-delay-ms"; // 30 seconds private final static Integer DEFAULT_MAX_DELAY = new Integer(30000); /** Maximum times to emit a CrawlURI without final disposition */ public final static String ATTR_MAX_RETRIES = "max-retries"; private final static Integer DEFAULT_MAX_RETRIES = new Integer(30); /** For retryable problems, seconds to wait before a retry */ public final static String ATTR_RETRY_DELAY = "retry-delay-seconds"; // 15 minutes private final static Long DEFAULT_RETRY_DELAY = new Long(900); /** Maximum simultaneous requests in process to a host (queue) */ public final static String ATTR_HOST_VALENCE = "host-valence"; private final static Integer DEFAULT_HOST_VALENCE = new Integer(1); /** Number of hops of embeds (ERX) to bump to front of host queue */ public final static String ATTR_PREFERENCE_EMBED_HOPS = "preference-embed-hops"; private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS = new Integer(0); /** Queue assignment to force on CrawlURIs. Intended to be used * via overrides*/ public final static String ATTR_FORCE_QUEUE = "force-queue-assignment"; protected final static String DEFAULT_FORCE_QUEUE = ""; /** Acceptable characters in forced queue names. * Word chars, dash, period, comma, colon */ protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*"; /** Should the queue assignment ignore www in hostnames, effectively * stripping them away. */ public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www"; protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW = new Boolean(false); /** Should the Frontier use a seperate 'already included' datastructure * or rely on the queues'. */ public final static String ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter"; protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER = new Boolean(false); private CrawlController controller; private AdaptiveRevisitQueueList hostQueues; private UriUniqFilter alreadyIncluded; private ThreadLocalQueue threadWaiting = new ThreadLocalQueue(); /** Policy for assigning CrawlURIs to named queues */ private QueueAssignmentPolicy queueAssignmentPolicy = null; // top-level stats private long succeededFetchCount = 0; private long failedFetchCount = 0; // URI's that are disregarded (for example because of robot.txt rules) private long disregardedUriCount = 0; private long totalProcessedBytes = 0; // Flags indicating operator-specified crawl pause/end private boolean shouldPause = false; private boolean shouldTerminate = false; public AdaptiveRevisitFrontier(String name) { this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that " + "will repeatedly visit all " + "encountered URIs. Wait time between visits is configurable" + " and is determined by seperate Processor(s). See " + "WaitEvaluators " + "See documentation for ARFrontier limitations."); } public AdaptiveRevisitFrontier(String name, String description) { super(Frontier.ATTR_NAME, description); addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR, "How many multiples of last fetch elapsed time to wait before " + "recontacting same server", DEFAULT_DELAY_FACTOR)); addElementToDefinition(new SimpleType(ATTR_MAX_DELAY, "Never wait more than this long, regardless of multiple", DEFAULT_MAX_DELAY)); addElementToDefinition(new SimpleType(ATTR_MIN_DELAY, "Always wait this long after one completion before recontacting " + "same server, regardless of multiple", DEFAULT_MIN_DELAY)); addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved.\n" + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES)); addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a" + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY)); addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS, "Number of embedded (or redirected) hops up to which " + "a URI has higher priority scheduling. For example, if set " + "to 1 (the default), items such as inline images (1-hop " + "embedded resources) will be scheduled ahead of all regular " + "links (or many-hop resources, like nested frames). If set to " + "zero, no preferencing will occur, and embeds/redirects are " + "scheduled the same as regular links.", DEFAULT_PREFERENCE_EMBED_HOPS)); Type t; t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE, "Maximum number of simultaneous requests to a single" + " host.", DEFAULT_HOST_VALENCE)); t.setExpertSetting(true); t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW, "If true then documents from x.com, www.x.com and any " + "www[0-9]+.x.com will be assigned to the same queue.", DEFAULT_QUEUE_IGNORE_WWW)); t.setExpertSetting(true); t = addElementToDefinition(new SimpleType( ATTR_FORCE_QUEUE, "The queue name into which to force URIs. Should " + "be left blank at global level. Specify a " + "per-domain/per-host override to force URIs into " + "a particular named queue, regardless of the assignment " + "policy in effect (domain or ip-based politeness). " + "This could be used on domains known to all be from " + "the same small set of IPs (eg blogspot, dailykos, etc.) " + "to simulate IP-based politeness, or it could be used if " + "you wanted to enforce politeness over a whole domain, even " + "though the subdomains are split across many IPs.", DEFAULT_FORCE_QUEUE)); t.setOverrideable(true); t.setExpertSetting(true); t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING, "This field must contain only alphanumeric " + "characters plus period, dash, comma, colon, or underscore.")); t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER, "If true then the Frontier will use a seperate " + "datastructure to detect and eliminate duplicates.\n" + "This is required for Canonicalization rules to work.", DEFAULT_USE_URI_UNIQ_FILTER)); t.setExpertSetting(true); t.setOverrideable(false); // Register persistent CrawlURI items CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY); CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING); } public synchronized void initialize(CrawlController c) throws FatalConfigurationException, IOException { controller = c; controller.addCrawlStatusListener(this); queueAssignmentPolicy = new HostnameQueueAssignmentPolicy(); hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(), c.getBdbEnvironment().getClassCatalog()); if(((Boolean)getUncheckedAttribute( null,ATTR_USE_URI_UNIQ_FILTER)).booleanValue()){ alreadyIncluded = createAlreadyIncluded(); } else { alreadyIncluded = null; } loadSeeds(); } /** * Create a UriUniqFilter that will serve as record * of already seen URIs. * * @return A UURISet that will serve as a record of already seen URIs * @throws IOException */ protected UriUniqFilter createAlreadyIncluded() throws IOException { UriUniqFilter uuf = new BdbUriUniqFilter( this.controller.getBdbEnvironment()); uuf.setDestination(this); return uuf; } /** * Loads the seeds * <p> * This method is called by initialize() and kickUpdate() */ public void loadSeeds() { Writer ignoredWriter = new StringWriter(); // Get the seeds to refresh. Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter); while (iter.hasNext()) { CandidateURI caUri = CandidateURI.createSeedCandidateURI((UURI)iter.next()); caUri.setSchedulingDirective(CandidateURI.MEDIUM); schedule(caUri); } batchFlush(); // save ignored items (if any) where they can be consulted later AbstractFrontier.saveIgnoredItems( ignoredWriter.toString(), controller.getDisk()); } public String getClassKey(CandidateURI cauri) { String queueKey = (String)getUncheckedAttribute(cauri, ATTR_FORCE_QUEUE); if ("".equals(queueKey)) { // Typical case, barring overrides queueKey = queueAssignmentPolicy.getClassKey(controller, cauri); // The queueAssignmentPolicy is always based on Hostnames // We may need to remove any www[0-9]{0,}\. prefixes from the // hostnames if(((Boolean)getUncheckedAttribute( cauri,ATTR_QUEUE_IGNORE_WWW)).booleanValue()){ queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.",""); } } return queueKey; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -