📄 frontier.html

📁 网络爬虫开源代码
💻 HTML
📖 第 1 页 / 共 2 页
字号:
12 下一页
<html><head><META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>8.&nbsp;Writing a Frontier</title><link href="../docbook.css" rel="stylesheet" type="text/css"><meta content="DocBook XSL Stylesheets V1.67.2" name="generator"><link rel="start" href="index.html" title="Heritrix developer documentation"><link rel="up" href="index.html" title="Heritrix developer documentation"><link rel="prev" href="uri.html" title="7.&nbsp;Some notes on the URI classes"><link rel="next" href="writefilter.html" title="9.&nbsp;Writing a Filter"></head><body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF"><div class="navheader"><table summary="Navigation header" width="100%"><tr><th align="center" colspan="3">8.&nbsp;Writing a Frontier</th></tr><tr><td align="left" width="20%"><a accesskey="p" href="uri.html">Prev</a>&nbsp;</td><th align="center" width="60%">&nbsp;</th><td align="right" width="20%">&nbsp;<a accesskey="n" href="writefilter.html">Next</a></td></tr></table><hr></div><div class="sect1" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="frontier"></a>8.&nbsp;Writing a Frontier</h2></div></div></div><p>As mentioned before, the Frontier is a pluggable module    responsible for deciding which URI to process next, and when. The Frontier    is also responsible for keeping track of other aspects of the crawl's    internal state, which in turn can be used for logging and reporting. Even    though the responsibilities of the Frontier might not look overwhelming,    it is one of the hardest modules to write well. You should really    investigate if your needs could not be met by the existing Frontier, or    at least mostly met by subclassing an existing Frontier. 
With    these warnings in mind, let's go ahead and create a really simple    Frontier.</p><p><pre class="programlisting">package mypackage;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.FetchStatusCodes;import org.archive.crawler.datamodel.UURI;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.Frontier;import org.archive.crawler.framework.FrontierMarker;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;import org.archive.crawler.settings.ModuleType;/** * A simple Frontier implementation for tutorial purposes */public class MyFrontier extends ModuleType implements Frontier,        FetchStatusCodes {    // A list of the discovered URIs that should be crawled.    List pendingURIs = new ArrayList();        // A list of prerequisites that needs to be met before any other URI is    // allowed to be crawled, e.g. DNS-lookups    List prerequisites = new ArrayList();        // A hash of already crawled URIs so that every URI is crawled only once.    Map alreadyIncluded = new HashMap();        // Reference to the CrawlController.    CrawlController controller;    // Flag to note if a URI is being processed.    
boolean uriInProcess = false;        // top-level stats    long successCount = 0;    long failedCount = 0;    long disregardedCount = 0;    long totalProcessedBytes = 0;    public MyFrontier(String name) {        super(Frontier.ATTR_NAME, "A simple frontier.");    }    public void initialize(CrawlController controller)            throws FatalConfigurationException, IOException {        this.controller = controller;                // Initialize the pending queue with the seeds        this.controller.getScope().refreshSeeds();        List seeds = this.controller.getScope().getSeedlist();        synchronized(seeds) {            for (Iterator i = seeds.iterator(); i.hasNext();) {                UURI u = (UURI) i.next();                CandidateURI caUri = new CandidateURI(u);                caUri.setSeed();                schedule(caUri);            }        }    }    public synchronized CrawlURI next(int timeout) throws InterruptedException {        if (!uriInProcess &amp;&amp; !isEmpty()) {            uriInProcess = true;            CrawlURI curi;            if (!prerequisites.isEmpty()) {                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));            } else {                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));            }            curi.setServer(controller.getServerCache().getServerFor(curi));            return curi;        } else {            wait(timeout);            return null;        }    }    public boolean isEmpty() {        return pendingURIs.isEmpty() &amp;&amp; prerequisites.isEmpty();    }    public synchronized void schedule(CandidateURI caURI) {        // Schedule a uri for crawling if it is not already crawled        if (!alreadyIncluded.containsKey(caURI.getURIString())) {            if(caURI.needsImmediateScheduling()) {                prerequisites.add(caURI);            } else {                pendingURIs.add(caURI);            }            alreadyIncluded.put(caURI.getURIString(), caURI);        } 
   }    public void batchSchedule(CandidateURI caURI) {        schedule(caURI);    }    public void batchFlush() {    }    public synchronized void finished(CrawlURI cURI) {        uriInProcess = false;        if (cURI.isSuccess()) {            successCount++;            totalProcessedBytes += cURI.getContentSize();            controller.fireCrawledURISuccessfulEvent(cURI);            cURI.stripToMinimal();        } else if (cURI.getFetchStatus() == S_DEFERRED) {            cURI.processingCleanup();            alreadyIncluded.remove(cURI.getURIString());            schedule(cURI);        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED                || cURI.getFetchStatus() == S_OUT_OF_SCOPE                || cURI.getFetchStatus() == S_BLOCKED_BY_USER                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS                || cURI.getFetchStatus() == S_DELETED_BY_USER) {            controller.fireCrawledURIDisregardEvent(cURI);            disregardedCount++;            cURI.stripToMinimal();        } else {            controller.fireCrawledURIFailureEvent(cURI);            failedCount++;            cURI.stripToMinimal();        }        cURI.processingCleanup();    }    public long discoveredUriCount() {        return alreadyIncluded.size();    }    public long queuedUriCount() {        return pendingURIs.size() + prerequisites.size();    }    public long finishedUriCount() {        return successCount + failedCount + disregardedCount;    }    public long successfullyFetchedCount() {        return successCount;    }    public long failedFetchCount() {        return failedCount;    }    public long disregardedFetchCount() {        return disregardedCount;
12 下一页
💿 文件大小 20230 K
👤 上传用户 singwolf
📂 所属分类 Java编程
🏷️ 相关标签

#网络爬虫 #开源 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -