
📄 crawler.java

📁 A web crawler program written in Java
💻 JAVA
📖 Page 1 of 3
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

import websphinx.util.PriorityQueue;
import websphinx.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
//#endif JDK1.1

/**
 * Web crawler.
 * <P>
 * To write a crawler, extend this class and override
 * shouldVisit() and visit() to create your own crawler.
 * <P>
 * To use a crawler:
 * <OL>
 * <LI>Initialize the crawler by calling
 * setRoot() (or one of its variants) and setting other
 * crawl parameters.
 * <LI>Register any classifiers you need with addClassifier().
 * <LI>Connect event listeners to monitor the crawler,
 *     such as websphinx.EventLog, websphinx.workbench.WebGraph,
 *     or websphinx.workbench.Statistics.
 * <LI>Call run() to start the crawler.
 * </OL>
 * A running crawler consists of a priority queue of
 * Links waiting to be visited and a set of threads
 * retrieving pages in parallel.  When a page is downloaded,
 * it is processed as follows:
 * <OL>
 * <LI><B>classify()</B>: The page is passed to the classify() method of
 * every registered classifier, in increasing order of
 * their priority values.  Classifiers typically attach
 * informative labels to the page and its links, such as "homepage"
 * or "root page".
 * <LI><B>visit()</B>: The page is passed to the crawler's
 * visit() method for user-defined processing.
 * <LI><B>expand()</B>: The page is passed to the crawler's
 * expand() method to be expanded.  The default implementation
 * tests every unvisited hyperlink on the page with shouldVisit(),
 * and puts each link approved by shouldVisit() into the crawling queue.
 * </OL>
 * By default, when expanding the links of a page, the crawler
 * only considers hyperlinks (not applets or inline images, for instance) that
 * point to Web pages (not mailto: links, for instance).  If you want
 * shouldVisit() to test every link on the page, use setLinkType(Crawler.ALL_LINKS).
 */
public class Crawler implements Runnable
//#ifdef JDK1.1
, Serializable
//#endif JDK1.1
{
    //#ifdef JDK1.1
    private static final long serialVersionUID = -3757789861952010450L;
    //#endif JDK1.1

    /**
     * Specify WEB as the crawl domain to allow the crawler
     * to visit any page on the World Wide Web.
     */
    public static final String[] WEB = null;

    /**
     * Specify SERVER as the crawl domain to limit the crawler
     * to visit only pages on the same Web server (hostname
     * and port number) as the root link from which it started.
     */
    public static final String[] SERVER = {"local"};

    /**
     * Specify SUBTREE as the crawl domain to limit the crawler
     * to visit only pages which are descendants of the root link
     * from which it started.
     */
    public static final String[] SUBTREE = {"sibling", "descendent"};

    /**
     * Specify HYPERLINKS as the link type to allow the crawler
     * to visit only hyperlinks (A, AREA, and FRAME tags which
     * point to http:, ftp:, file:, or gopher: URLs).
     */
    public static final String[] HYPERLINKS = {"hyperlink"};

    /**
     * Specify HYPERLINKS_AND_IMAGES as the link type to allow the crawler
     * to visit only hyperlinks and inline images.
     */
    public static final String[] HYPERLINKS_AND_IMAGES = {"hyperlink", "image"};

    /**
     * Specify ALL_LINKS as the link type to allow the crawler
     * to visit any kind of link
     */
    public static final String[] ALL_LINKS = null;

    // Crawler parameters
    private String name = getClass().getName();   // crawler's name
    private transient Link[] roots = null;
    private String[] rootHrefs = null;   // exists only when serializing crawler
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters ()
                                  .changeUserAgent (name);
    private Vector classifiers = new Vector ();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // Transient state
    private transient Link[] crawledRoots = null;

    private transient int state = CrawlEvent.CLEARED;

    private transient Worm[] worms;
        // background threads

    private transient PriorityQueue fetchQueue;
        // links waiting to be downloaded

    private transient PriorityQueue crawlQueue;
        // all links that have been expanded but not
        // processed (used only if crawler is in synchronous mode)

    private transient int numLinksTested;
        // number of links tested by shouldVisit()

    private transient int numPagesVisited;
        // number of pages passed to visit()

    private transient int numPagesLeft;
        // all links that have been expanded but not processed
        // == crawlQueue.size ()

    // FIX: convert to immutable linked lists
    private transient Vector crawlListeners;
        // list of CrawlListeners

    private transient Vector linkListeners;
        // list of LinkListeners

    private transient Hashtable visitedPages;
        // visited pages (a set of URLs)

    private transient RobotExclusion robotExclusion;
        // robot exclusion cache

    /**
     * Make a new Crawler.
     */
    public Crawler () {
        addClassifier (new StandardClassifier());
        init ();
    }

    /*
     * Initialize the transient fields of the crawler.
     */
    private void init () {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector ();
        linkListeners = new Vector ();

        visitedPages = new Hashtable ();

        robotExclusion = new RobotExclusion (getName ());
    }

    /*
     * Write a Crawler to an output stream.
     */
    //#ifdef JDK1.1
    private void writeObject (ObjectOutputStream out)
            throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i=0; i<roots.length; ++i)
                rootHrefs[i] = roots[i].getURL().toString();
        }
        else
            rootHrefs = null;
        out.defaultWriteObject ();
        rootHrefs = null;
    }
    //#endif JDK1.1

    /*
     * Read a Crawler from an input stream.
     */
    //#ifdef JDK1.1
    private void readObject (ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject ();

        if (rootHrefs != null) {
            roots = new Link [rootHrefs.length];
            for (int i=0; i<rootHrefs.length; ++i)
                roots[i] = new Link (rootHrefs[i]);
        }
        else
            roots = null;

        domain = useStandard (WEB, domain);
        domain = useStandard (SERVER, domain);
        domain = useStandard (SUBTREE, domain);
        type = useStandard (HYPERLINKS, type);
        type = useStandard (HYPERLINKS_AND_IMAGES, type);
        type = useStandard (ALL_LINKS, type);

        init ();

        if (linkPredicate != null)
            linkPredicate.connected (this);
        if (pagePredicate != null)
            pagePredicate.connected (this);
        if (action != null)
            action.connected (this);
    }

    private static String[] useStandard (String[] standard, String[] s) {
        if (s == null || standard == null || standard == s)
            return s;
        if (s.length != standard.length)
            return s;
        for (int i=0; i<s.length; ++i)
            if (!s[i].equals (standard[i]))
                return s;
        return standard;
    }
    //#endif JDK1.1

    /**
     * Start crawling.  Returns either when the crawl is done, or
     * when pause() or stop() is called.  Because this method implements the
     * java.lang.Runnable interface, a crawler can be run in the
     * background thread.
     */
    public void run () {
        crawledRoots = roots;

        if (state == CrawlEvent.STOPPED)
            clear ();

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            // give each root a default priority based on its position in the array
            float priority = 0;
            float increment = 1.0f/crawledRoots.length;
            for (int i=0; i<crawledRoots.length; ++i) {
                crawledRoots[i].setPriority (priority);
                priority += increment;
            }
            submit (crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent (state);

        synchronized (crawlQueue) {

            Timer timer = new CrawlTimer (this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0)
                timer.set (timeout*1000, false);

            int nWorms = Math.max (dp.getMaxThreads (), 1);
            worms = new Worm[nWorms];
            for (int i=0; i<nWorms; ++i) {
                worms[i] = new Worm (this, i);
                worms[i].start ();
            }

            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // ran out of links to crawl
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent (state);
                    }
                    else if (synchronous) {
                        // Synchronous mode.
                        // Main thread calls process() on each link
                        // in crawlQueue, in priority order.
                        Link link = (Link)crawlQueue.getMin ();
                        if (link.getStatus () == LinkEvent.DOWNLOADED)
                            process (link);
                        else
                            crawlQueue.wait ();
                    }
                    else
                        // Asynchronous crawling.
                        // Main thread does nothing but wait, while
                        // background threads call process().
                        crawlQueue.wait ();
                }
            } catch (InterruptedException e) {}

            timer.cancel ();

            for (int i=0; i<worms.length; ++i)
                worms[i].die ();

            if (state == CrawlEvent.PAUSED) {
                // put partly-processed links back in fetchQueue
                synchronized (fetchQueue) {
                    for (int i=0; i<worms.length; ++i)
                        if (worms[i].link != null)
                            fetchQueue.put (worms[i].link);
                }
            }

            worms = null;
        }
    }

    /**
     * Initialize the crawler for a fresh crawl.  Clears the crawling queue
     * and sets all crawling statistics to 0.  Stops the crawler
     * if it is currently running.
     */
    public void clear () {
        stop ();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited ();
        if (crawledRoots != null)
            for (int i=0; i < crawledRoots.length; ++i)
                crawledRoots[i].disconnect ();
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent (state);
    }

    /**
     * Pause the crawl in progress.  If the crawler is running, then
     * it finishes processing the current page, then returns.  The queues remain as-is,
     * so calling run() again will resume the crawl exactly where it left off.
     * pause() can be called from any thread.
     */
    public void pause () {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify ();
            }
            sendCrawlEvent (state);
        }
    }

    /**
     * Stop the crawl in progress.  If the crawler is running, then
     * it finishes processing the current page, then returns.
     * Empties the crawling queue.
     */
    public void stop () {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.STOPPED;
                    fetchQueue.clear ();
                    crawlQueue.clear ();
                    numPagesLeft = 0;
                    crawlQueue.notify ();
                }
            }
            sendCrawlEvent (state);
        }
    }

    /*
     * Timeout the crawl in progress.  Used internally by
     * the CrawlTimer.
     */
    void timedOut () {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.TIMED_OUT;
                    fetchQueue.clear ();
                    crawlQueue.clear ();
                    numPagesLeft = 0;
                    crawlQueue.notify ();
                }
            }
            sendCrawlEvent (state);
        }
    }

    /**
     * Get state of crawler.
     * @return one of CrawlEvent.STARTED, CrawlEvent.PAUSED, STOPPED, CLEARED.
     */
    public int getState () {
        return state;
    }
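
The listing above breaks off partway through the class (this is page 1 of 3), but the class comment already spells out the intended usage: subclass Crawler, override shouldVisit() and visit(), set a root link and other crawl parameters, then call run(). The sketch below illustrates that pattern. It is only a sketch: the boolean shouldVisit(Link) / void visit(Page) signatures and the setRoot(Link), setDomain(String[]), and setMaxDepth(int) setters are referenced by the class comment or implied by the fields above but are not shown in this excerpt, so they are assumed here; only getURL(), getName(), run(), and the SERVER constant actually appear in the code on this page.

// Minimal usage sketch -- not part of crawler.java.
// Assumed API (not visible in this excerpt): boolean shouldVisit(Link),
// void visit(Page), setRoot(Link), setDomain(String[]), setMaxDepth(int).
import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

public class PrintingCrawler extends Crawler {

    // Approve only links that do not point at images (assumed signature).
    public boolean shouldVisit (Link l) {
        String url = l.getURL ().toString ();
        return !url.endsWith (".gif") && !url.endsWith (".jpg");
    }

    // User-defined processing: print the URL of each visited page (assumed signature).
    public void visit (Page p) {
        System.out.println (p.getURL ());
    }

    public static void main (String[] args) throws Exception {
        PrintingCrawler crawler = new PrintingCrawler ();
        crawler.setRoot (new Link ("http://www.cs.cmu.edu/~rcm/websphinx/")); // assumed setter
        crawler.setDomain (Crawler.SERVER);   // stay on the root's server (assumed setter)
        crawler.setMaxDepth (3);              // assumed setter for the maxDepth field above

        // Crawler implements Runnable, so it can run in a background thread;
        // run() returns when the crawl finishes or when pause()/stop() is called.
        Thread t = new Thread (crawler, crawler.getName ());
        t.start ();
        t.join ();
    }
}

Because pause() leaves the queues intact, another thread can pause a running crawl and a later call to run() resumes it where it left off; stop() and clear() instead empty the crawling and fetch queues, as the methods above show.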
