📄 frontier.html
字号:
<html><head><META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>8. Writing a Frontier</title><link href="../docbook.css" rel="stylesheet" type="text/css"><meta content="DocBook XSL Stylesheets V1.67.2" name="generator"><link rel="start" href="index.html" title="Heritrix developer documentation"><link rel="up" href="index.html" title="Heritrix developer documentation"><link rel="prev" href="uri.html" title="7. Some notes on the URI classes"><link rel="next" href="writefilter.html" title="9. Writing a Filter"></head><body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF"><div class="navheader"><table summary="Navigation header" width="100%"><tr><th align="center" colspan="3">8. Writing a Frontier</th></tr><tr><td align="left" width="20%"><a accesskey="p" href="uri.html">Prev</a> </td><th align="center" width="60%"> </th><td align="right" width="20%"> <a accesskey="n" href="writefilter.html">Next</a></td></tr></table><hr></div><div class="sect1" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="frontier"></a>8. Writing a Frontier</h2></div></div></div><p>As mentioned before, the Frontier is a pluggable module responsible for deciding which URI to process next, and when. The Frontier is also responsible for keeping track of other aspects of the crawl's internal state, which in turn can be used for logging and reporting. Even though the responsibilities of the Frontier might not look overwhelming, it is one of the hardest modules to write well. You should really investigate whether your needs could be met by the existing Frontier, or at least mostly met by subclassing an existing Frontier. 
With these warnings in mind, let's go ahead and create a really simple Frontier.</p><p><pre class="programlisting">package mypackage;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.datamodel.UURI;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.Frontier;import org.archive.crawler.framework.FrontierMarker;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;import org.archive.crawler.settings.ModuleType;/** * A simple Frontier implementation for tutorial purposes */public class MyFrontier extends ModuleType implements Frontier, FetchStatusCodes { // A list of the discovered URIs that should be crawled. List pendingURIs = new ArrayList(); // A list of prerequisites that needs to be met before any other URI is // allowed to be crawled, e.g. DNS-lookups List prerequisites = new ArrayList(); // A hash of already crawled URIs so that every URI is crawled only once. Map alreadyIncluded = new HashMap(); // Reference to the CrawlController. CrawlController controller; // Flag to note if a URI is being processed. 
boolean uriInProcess = false; // top-level stats long successCount = 0; long failedCount = 0; long disregardedCount = 0; long totalProcessedBytes = 0; public MyFrontier(String name) { super(Frontier.ATTR_NAME, "A simple frontier."); } public void initialize(CrawlController controller) throws FatalConfigurationException, IOException { this.controller = controller; // Initialize the pending queue with the seeds this.controller.getScope().refreshSeeds(); List seeds = this.controller.getScope().getSeedlist(); synchronized(seeds) { for (Iterator i = seeds.iterator(); i.hasNext();) { UURI u = (UURI) i.next(); CandidateURI caUri = new CandidateURI(u); caUri.setSeed(); schedule(caUri); } } } public synchronized CrawlURI next(int timeout) throws InterruptedException { if (!uriInProcess && !isEmpty()) { uriInProcess = true; CrawlURI curi; if (!prerequisites.isEmpty()) { curi = CrawlURI.from((CandidateURI) prerequisites.remove(0)); } else { curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0)); } curi.setServer(controller.getServerCache().getServerFor(curi)); return curi; } else { wait(timeout); return null; } } public boolean isEmpty() { return pendingURIs.isEmpty() && prerequisites.isEmpty(); } public synchronized void schedule(CandidateURI caURI) { // Schedule a uri for crawling if it is not already crawled if (!alreadyIncluded.containsKey(caURI.getURIString())) { if(caURI.needsImmediateScheduling()) { prerequisites.add(caURI); } else { pendingURIs.add(caURI); } alreadyIncluded.put(caURI.getURIString(), caURI); } } public void batchSchedule(CandidateURI caURI) { schedule(caURI); } public void batchFlush() { } public synchronized void finished(CrawlURI cURI) { uriInProcess = false; if (cURI.isSuccess()) { successCount++; totalProcessedBytes += cURI.getContentSize(); controller.fireCrawledURISuccessfulEvent(cURI); cURI.stripToMinimal(); } else if (cURI.getFetchStatus() == S_DEFERRED) { cURI.processingCleanup(); alreadyIncluded.remove(cURI.getURIString()); 
schedule(cURI); } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED || cURI.getFetchStatus() == S_OUT_OF_SCOPE || cURI.getFetchStatus() == S_BLOCKED_BY_USER || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS || cURI.getFetchStatus() == S_DELETED_BY_USER) { controller.fireCrawledURIDisregardEvent(cURI); disregardedCount++; cURI.stripToMinimal(); } else { controller.fireCrawledURIFailureEvent(cURI); failedCount++; cURI.stripToMinimal(); } cURI.processingCleanup(); } public long discoveredUriCount() { return alreadyIncluded.size(); } public long queuedUriCount() { return pendingURIs.size() + prerequisites.size(); } public long finishedUriCount() { return successCount + failedCount + disregardedCount; } public long successfullyFetchedCount() { return successCount; } public long failedFetchCount() { return failedCount; } public long disregardedFetchCount() { return disregardedCount;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -