webcrawler.java

来自「java concurrency in practice 源码. JAVA」· Java 代码 · 共 88 行

JAVA
88
字号
package net.jcip.examples;

import java.net.URL;
import java.util.*;
import java.util.concurrent.*;

import net.jcip.annotations.*;

import static java.util.concurrent.TimeUnit.MILLISECONDS;

/**
 * WebCrawler
 * <p/>
 * Using TrackingExecutorService to save unfinished tasks for later execution.
 * <p/>
 * Each URL is handled by a {@link CrawlTask} submitted to a
 * {@code TrackingExecutor}. {@link #stop()} shuts that executor down and
 * stores the URLs of the tasks the executor hands back, so that the next
 * {@link #start()} resubmits them. Subclasses provide the page handling via
 * {@link #processPage(URL)}.
 * <p/>
 * Thread-safety: {@code start()} and {@code stop()} are synchronized on this
 * object, which also guards {@code urlsToCrawl}. The executor reference is
 * volatile because worker threads read it without holding the lock.
 * <p/>
 * NOTE(review): {@code java.net.URL} is used as a hash key here; its Javadoc
 * warns that {@code equals}/{@code hashCode} may perform host name resolution
 * and therefore block. Changing the key type would alter
 * {@link #processPage(URL)}, so it is left as is.
 *
 * @author Brian Goetz and Tim Peierls
 */
public abstract class WebCrawler {
    /** Executor running the crawl tasks; {@code null} whenever the crawler is stopped. */
    private volatile TrackingExecutor exec;

    /** URLs to submit on the next {@link #start()}. */
    @GuardedBy("this")
    private final Set<URL> urlsToCrawl = new HashSet<URL>();

    // NOTE(review): this map is only touched by CrawlTask.alreadyCrawled() and
    // CrawlTask.markUncrawled(), and nothing in this file calls either of
    // them, so pages are currently NOT de-duplicated. Confirm whether that is
    // intended before wiring them into CrawlTask.run().
    private final ConcurrentMap<URL, Boolean> seen = new ConcurrentHashMap<URL, Boolean>();

    /** How long {@link #stop()} waits for running tasks to finish, in {@link #UNIT}. */
    private static final long TIMEOUT = 500;
    private static final TimeUnit UNIT = MILLISECONDS;

    /**
     * Creates a stopped crawler whose first {@link #start()} will crawl
     * {@code startUrl}.
     *
     * @param startUrl the first page to crawl
     */
    public WebCrawler(URL startUrl) {
        urlsToCrawl.add(startUrl);
    }

    /**
     * Starts crawling every pending URL on a fresh executor and clears the
     * pending set.
     * <p/>
     * Calling this while the crawler is already running does nothing.
     * Previously a second call replaced the executor reference and the old
     * executor was never shut down.
     */
    public synchronized void start() {
        if (exec != null) {
            return; // already running; do not orphan the live executor
        }
        exec = new TrackingExecutor(Executors.newCachedThreadPool());
        for (URL url : urlsToCrawl) {
            submitCrawlTask(url);
        }
        urlsToCrawl.clear();
    }

    /**
     * Stops crawling and remembers unfinished work for the next
     * {@link #start()}.
     * <p/>
     * The tasks returned by {@code shutdownNow()} are always saved. The tasks
     * reported by {@code getCancelledTasks()} are saved only if the executor
     * terminates within {@link #TIMEOUT}; otherwise they are not recorded.
     * The executor reference is cleared in all cases. Calling this on a
     * crawler that is not running does nothing (previously it threw
     * {@code NullPointerException}).
     *
     * @throws InterruptedException if interrupted while waiting for the
     *         executor to terminate
     */
    public synchronized void stop() throws InterruptedException {
        TrackingExecutor running = exec;
        if (running == null) {
            return; // never started, or already stopped
        }
        try {
            saveUncrawled(running.shutdownNow());
            if (running.awaitTermination(TIMEOUT, UNIT)) {
                saveUncrawled(running.getCancelledTasks());
            }
        } finally {
            exec = null;
        }
    }

    /**
     * Processes one page and reports the links to follow from it.
     * <p/>
     * Called on an executor thread. The result is iterated directly, so it
     * must not be {@code null}.
     *
     * @param url the page to process
     * @return the URLs to crawl next
     */
    protected abstract List<URL> processPage(URL url);

    /**
     * Adds the page of every given task to the pending set.
     * <p/>
     * Only called from {@link #stop()}, which holds the lock guarding
     * {@code urlsToCrawl}.
     * <p/>
     * NOTE(review): the cast assumes each Runnable handed back by the
     * executor is the very {@link CrawlTask} that was submitted. Verify that
     * {@code TrackingExecutor} does not return wrapper objects, or this
     * throws {@code ClassCastException}.
     *
     * @param uncrawled tasks whose pages should be crawled again later
     */
    private void saveUncrawled(List<Runnable> uncrawled) {
        for (Runnable task : uncrawled) {
            urlsToCrawl.add(((CrawlTask) task).getPage());
        }
    }

    /**
     * Submits a crawl task for {@code u} to the current executor.
     *
     * @param u the page to crawl
     * @throws RejectedExecutionException if the crawler has been stopped
     *         (the executor reference is {@code null}); the executor itself
     *         may throw the same exception once it has been shut down
     */
    private void submitCrawlTask(URL u) {
        // Read the volatile field once: stop() may null it concurrently while
        // worker threads are still submitting links.
        TrackingExecutor current = exec;
        if (current == null) {
            throw new RejectedExecutionException("crawler is stopped; not crawling " + u);
        }
        current.execute(new CrawlTask(u));
    }

    /** Crawls a single page and submits a new task for each link found on it. */
    private class CrawlTask implements Runnable {
        private final URL url;

        CrawlTask(URL url) {
            this.url = url;
        }

        /**
         * Atomically marks this task's page as seen.
         * Not called anywhere in this file.
         *
         * @return {@code true} if the page had already been marked as seen
         */
        boolean alreadyCrawled() {
            return seen.putIfAbsent(url, true) != null;
        }

        /**
         * Removes this task's page from the seen map and logs that to
         * standard output. Not called anywhere in this file.
         */
        void markUncrawled() {
            seen.remove(url);
            System.out.printf("marking %s uncrawled%n", url);
        }

        /**
         * Processes the page and submits each returned link, giving up as
         * soon as this thread is interrupted or the crawler no longer accepts
         * tasks.
         */
        public void run() {
            for (URL link : processPage(url)) {
                if (Thread.currentThread().isInterrupted()) {
                    return;
                }
                try {
                    submitCrawlTask(link);
                } catch (RejectedExecutionException ignored) {
                    // The crawler was stopped between the interrupt check and
                    // the submission; stop quietly instead of killing the
                    // worker thread with an uncaught exception.
                    return;
                }
            }
        }

        /** @return the page this task crawls */
        public URL getPage() {
            return url;
        }
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?