
crawler.java

A web crawler written in Java.
Language: Java
Page 1 of 3 of the listing; the excerpt below begins partway through the Crawler class.
        }
        sendLinkEvent (link, LinkEvent.QUEUED);
    }

    /**
     * Submit an array of Links for crawling.  If the crawler is running,
     * these links will eventually be retrieved and passed to visit().
     * @param links Links to put in queue
     */
    public void submit (Link[] links) {
        for (int i=0; i<links.length; ++i)
            submit (links[i]);
    }

    /**
     * Enumerate crawling queue.
     * @return an enumeration of Link objects which are waiting to be visited.
     */
    // FIX: enumerate in priority order
    public Enumeration enumerateQueue () {
        return crawlQueue.elements ();
    }

    /*
     * Classifiers
     */

    /**
     * Adds a classifier to this crawler.  If the
     * classifier is already found in the set, does nothing.
     * @param c a classifier
     */
    public void addClassifier (Classifier c) {
        if (!classifiers.contains (c)) {
            float cpriority = c.getPriority ();

            for (int i=0; i<classifiers.size(); ++i) {
                Classifier d = (Classifier)classifiers.elementAt (i);
                if (cpriority < d.getPriority ()) {
                    classifiers.insertElementAt (c, i);
                    return;
                }
            }
            classifiers.addElement (c);
        }
    }

    /**
     * Removes a classifier from the set of classifiers.
     * If c is not found in the set, does nothing.
     *
     * @param c a classifier
     */
    public void removeClassifier (Classifier c) {
        classifiers.removeElement (c);
    }

    /**
     * Clears the set of classifiers.
     */
    public void removeAllClassifiers () {
        classifiers.removeAllElements ();
    }

    /**
     * Enumerates the set of classifiers.
     *
     * @return An enumeration of the classifiers.
     */
    public Enumeration enumerateClassifiers () {
        return classifiers.elements();
    }

    /**
     * Get the set of classifiers.
     *
     * @return An array containing the registered classifiers.
     */
    public Classifier[] getClassifiers () {
        Classifier[] c = new Classifier[classifiers.size()];
        classifiers.copyInto (c);
        return c;
    }

    /*
     * Event listeners
     */

    /**
     * Adds a listener to the set of CrawlListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addCrawlListener (CrawlListener listen) {
        if (!crawlListeners.contains (listen))
            crawlListeners.addElement (listen);
    }

    /**
     * Removes a listener from the set of CrawlListeners.  If it is not found in the set,
     * does nothing.
     *
     * @param listen a listener
     */
    public void removeCrawlListener (CrawlListener listen) {
        crawlListeners.removeElement (listen);
    }

    /**
     * Adds a listener to the set of LinkListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addLinkListener (LinkListener listen) {
        if (!linkListeners.contains (listen))
            linkListeners.addElement (listen);
    }

    /**
     * Removes a listener from the set of LinkListeners.  If it is not found in the set,
     * does nothing.
     *
     * @param listen a listener
     */
    public void removeLinkListener (LinkListener listen) {
        linkListeners.removeElement (listen);
    }
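    /*
     * Usage sketch (not part of the original source).  It uses only methods
     * defined above; the listener and classifier instances are assumed to
     * exist elsewhere:
     *
     *   crawler.addCrawlListener (myCrawlListener); // started()/stopped()/paused()/... callbacks
     *   crawler.addLinkListener (myLinkListener);   // crawled() fires for every link event
     *   crawler.addClassifier (myClassifier);       // kept sorted by ascending getPriority()
     *   crawler.submit (seedLinks);                 // queue an array of Links for crawling
     */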
    /**
     * Send a CrawlEvent to all CrawlListeners registered with this crawler.
     * @param id Event id
     */
    protected void sendCrawlEvent (int id) {
        CrawlEvent evt = new CrawlEvent (this, id);
        for (int j=0, len=crawlListeners.size(); j<len; ++j) {
            CrawlListener listen = (CrawlListener)crawlListeners.elementAt(j);
            switch (id) {
              case CrawlEvent.STARTED:
                listen.started (evt);
                break;
              case CrawlEvent.STOPPED:
                listen.stopped (evt);
                break;
              case CrawlEvent.CLEARED:
                listen.cleared (evt);
                break;
              case CrawlEvent.TIMED_OUT:
                listen.timedOut (evt);
                break;
              case CrawlEvent.PAUSED:
                listen.paused (evt);
                break;
            }
        }
    }

    /**
     * Send a LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     */
    protected void sendLinkEvent (Link l, int id) {
        LinkEvent evt = new LinkEvent (this, id, l);
        l.setStatus (id);
        for (int j=0, len=linkListeners.size(); j<len; ++j) {
            LinkListener listen = (LinkListener)linkListeners.elementAt(j);
            listen.crawled (evt);
        }
    }

    /**
     * Send an exceptional LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     * @param exception Exception associated with event
     */
    protected void sendLinkEvent (Link l, int id, Throwable exception) {
        LinkEvent evt = new LinkEvent (this, id, l, exception);
        l.setStatus (id);
        l.setLabel ("exception", exception.toString ());
        for (int j=0, len=linkListeners.size(); j<len; ++j) {
            LinkListener listen = (LinkListener)linkListeners.elementAt(j);
            listen.crawled (evt);
        }
    }

    /*
     * Visited pages table
     */

    /**
     * Test whether the page corresponding to a link has been visited
     * (or queued for visiting).
     * @param link  Link to test
     * @return true if link has been passed to walk() during this crawl
     */
    public boolean visited (Link link) {
        return visitedPages.containsKey (link.getPageURL().toString());
    }

    /**
     * Register that a link has been visited.
     * @param link  Link that has been visited
     */
    protected void markVisited (Link link) {
        visitedPages.put (link.getPageURL().toString(), this);
    }

    /**
     * Clear the set of visited links.
     */
    protected void clearVisited () {
        visitedPages.clear ();
    }
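    /*
     * Note (not in the original source): visited() and markVisited() key the
     * table on link.getPageURL().toString(), so two Links that resolve to the
     * same page URL -- for example, differing only in their fragment, if
     * getPageURL() strips fragments -- count as one visited page.
     */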
    /*
     * Fetch loop
     */

    void fetch (Worm w) {
        Timer timer = new WormTimer (w);

        while (!w.dead) {
            //System.err.println (w + ": fetching a link");

            // pull the highest-priority link from the fetch queue
            synchronized (fetchQueue) {
                while (!w.dead
                       && (w.link = (Link)fetchQueue.deleteMin ()) == null) {
                    try {
                        fetchQueue.wait ();
                    } catch (InterruptedException e) {}
                }
            }

            if (w.dead)
                return;

            //System.err.println (w + ": processing " + w.link.toDescription());

            try {
                // download the link to get a page
                DownloadParameters dp;
                Page page;

                dp = w.link.getDownloadParameters();
                if (dp == null)
                    dp = this.dp;
                int timeout = dp.getDownloadTimeout();

                sendLinkEvent (w.link, LinkEvent.RETRIEVING);
                try {

                    if (timeout > 0)
                        timer.set (timeout*1000, false);

                    if (dp.getObeyRobotExclusion()
                        && robotExclusion.disallowed (w.link.getURL()))
                        throw new IOException ("disallowed by Robot Exclusion Standard (robots.txt)");

                    page = new Page (w.link, new HTMLParser(dp));

                } finally {
                    timer.cancel ();
                }

                if (w.dead)
                    return;

                sendLinkEvent (w.link, LinkEvent.DOWNLOADED);

                if (synchronous) {
                    // Synchronous mode.
                    // Main thread will call process() when
                    // this link's turn arrives (in priority order).
                    // Wake up the main thread.
                    synchronized (crawlQueue) {
                        crawlQueue.notify ();
                    }
                }
                else {
                    // Asynchronous mode.
                    // Each worm calls process() on its link.
                    process (w.link);
                }

                w.link = null;
                // loop around and fetch another link

            } catch (ThreadDeath e) {
                throw e;  // have to continue dying
            } catch (Throwable e) {
                // Some other exception occurred, either during the page fetch
                // or in some user code.  Mark up the link with the error.
                if (w.dead)
                    return;

                sendLinkEvent (w.link, LinkEvent.ERROR, e);
                synchronized (crawlQueue) {
                    crawlQueue.delete (w.link);
                    --numPagesLeft;
                    w.link = null;
                    crawlQueue.notify ();
                }
            }
        }
    }
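    /*
     * Note (not in the original source): the download timeout above works by
     * arming the WormTimer before the fetch and cancelling it in the finally
     * clause.  If the timer fires first, WormTimer.alarm() (below) calls
     * fetchTimedOut(), which kills this worm, reports an ERROR LinkEvent,
     * and starts a fresh Worm in its place.
     */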
    void process (Link link) {
        Page page = link.getPage ();

        // classify the page
        for (int j=0, len=classifiers.size(); j<len; ++j) {
            Classifier cl = (Classifier)classifiers.elementAt(j);
            cl.classify (page);
        }

        // invoke callbacks on the page
        ++numPagesVisited;
        if (pagePredicate == null || pagePredicate.shouldActOn (page)) {
            if (action != null)
                action.visit (page);
            visit (page);
        }
        expand (page);

        // send out the event
        sendLinkEvent (link, LinkEvent.VISITED);

        // discard link
        synchronized (crawlQueue) {
            crawlQueue.delete (link);
            --numPagesLeft;
            crawlQueue.notify ();
        }
    }

    void fetchTimedOut (Worm w, int interval) {
        if (w.dead)
            return;

        w.die ();
        sendLinkEvent (w.link, LinkEvent.ERROR,
                       new IOException ("Timeout after " + interval + " seconds"));

        synchronized (crawlQueue) {
            crawlQueue.delete (w.link);
            --numPagesLeft;

            worms[w.i] = new Worm (this, w.i);
            worms[w.i].start ();

            crawlQueue.notify ();
        }
    }

//#ifdef JDK1.1
    // FIX: more error checking here
    public static void main (String[] args) throws Exception {
        java.io.ObjectInputStream in =
            new java.io.ObjectInputStream (new java.io.FileInputStream (args[0]));
        Crawler loadedCrawler = (Crawler)in.readObject ();
        in.close ();

        EventLog.monitor (loadedCrawler);
        loadedCrawler.run ();
    }
//#endif JDK1.1

}

/* Simple Thread subclass that invokes a crawler's fetch loop. */
class Worm extends Thread {
    Crawler crawler; // crawler in charge of this worm
    int i;           // index of this worm in crawler.worms[]
    Link link;       // link this worm is currently working on
    boolean dead = false; // true if this worm has been killed

    public Worm (Crawler crawler, int i) {
        super (crawler.getName() + " worm " + i);
        setDaemon (true);
        this.crawler = crawler;
        this.i = i;
    }

    public void run () {
        crawler.fetch (this);
    }

    public void die () {
        dead = true;
        stop ();
    }
}

class WormTimer extends Timer {
    Worm worm;

    public WormTimer (Worm worm) {
        this.worm = worm;
    }

    protected void alarm () {
        worm.crawler.fetchTimedOut (worm, getInterval()/1000);
    }
}

class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer (Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm () {
        crawler.timedOut ();
    }
}
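To see how these pieces fit together, here is a minimal usage sketch (not part of crawler.java). It relies only on methods visible in this listing, namely submit(), run(), and the visit() callback that process() invokes, plus some assumptions: that Crawler and its companion classes are on the classpath in the same package, that Crawler is concrete with a no-arg constructor, and that Link and Page expose a String-URL constructor and a getURL() accessor respectively.

public class PrintingCrawler extends Crawler {
    // visit() is the per-page callback that process() invokes for every
    // page passing the page predicate.
    public void visit (Page page) {
        System.out.println ("visited: " + page.getURL ()); // getURL() is assumed
    }

    public static void main (String[] args) throws Exception {
        PrintingCrawler crawler = new PrintingCrawler ();
        crawler.submit (new Link (args[0])); // Link(String) is assumed; seeds the queue
        crawler.run ();                      // run() is invoked the same way in main() above
    }
}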
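The conditionally compiled main() in the listing reloads a serialized Crawler from the file named by args[0]. A companion sketch for producing such a file follows; the readObject() call in the listing implies that Crawler is Serializable, which this sketch assumes.

import java.io.FileOutputStream;
import java.io.ObjectOutputStream;

public class SaveCrawler {
    public static void main (String[] args) throws Exception {
        Crawler crawler = new PrintingCrawler (); // from the sketch above
        ObjectOutputStream out =
            new ObjectOutputStream (new FileOutputStream (args[0]));
        out.writeObject (crawler); // requires Crawler (and its fields) to be Serializable
        out.close ();
    }
}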
