📄 crawler.java
字号:
} // closes a synchronized block of submit(Link) — the head of that method is outside this chunk
sendLinkEvent (link, LinkEvent.QUEUED);
} // end submit(Link)

/**
 * Submit an array of Links for crawling. If the crawler is running,
 * these links will eventually be retrieved and passed to visit().
 * @param links Links to put in queue
 */
public void submit (Link[] links) {
    for (int i=0; i<links.length; ++i)
        submit (links[i]);
}

/**
 * Enumerate crawling queue.
 * @return an enumeration of Link objects which are waiting to be visited.
 */
// FIX: enumerate in priority order
public Enumeration enumerateQueue () {
    return crawlQueue.elements ();
}

/*
 * Classifiers
 */

/**
 * Adds a classifier to this crawler. If the
 * classifier is already found in the set, does nothing.
 * @param c a classifier
 */
public void addClassifier (Classifier c) {
    if (!classifiers.contains (c)) {
        // keep the vector sorted by ascending priority: insert before the
        // first existing classifier with a strictly greater priority
        float cpriority = c.getPriority ();
        for (int i=0; i<classifiers.size(); ++i) {
            Classifier d = (Classifier)classifiers.elementAt (i);
            if (cpriority < d.getPriority ()) {
                classifiers.insertElementAt (c, i);
                return;
            }
        }
        // highest priority so far — append at the end
        classifiers.addElement (c);
    }
}

/**
 * Removes a classifier from the set of classifiers.
 * If c is not found in the set, does nothing.
 *
 * @param c a classifier
 */
public void removeClassifier (Classifier c) {
    classifiers.removeElement (c);
}

/**
 * Clears the set of classifiers.
 */
public void removeAllClassifiers () {
    classifiers.removeAllElements ();
}

/**
 * Enumerates the set of classifiers.
 *
 * @return An enumeration of the classifiers.
 */
public Enumeration enumerateClassifiers () {
    return classifiers.elements();
}

/**
 * Get the set of classifiers
 *
 * @return An array containing the registered classifiers.
 */
public Classifier[] getClassifiers () {
    Classifier[] c = new Classifier[classifiers.size()];
    classifiers.copyInto (c);
    return c;
}

/*
 * Event listeners
 */

/**
 * Adds a listener to the set of CrawlListeners for this crawler.
 * If the listener is already found in the set, does nothing.
 *
 * @param listen a listener
 */
public void addCrawlListener (CrawlListener listen) {
    if (!crawlListeners.contains (listen))
        crawlListeners.addElement (listen);
}

/**
 * Removes a listener from the set of CrawlListeners. If it is not found in the set,
 * does nothing.
 *
 * @param listen a listener
 */
public void removeCrawlListener (CrawlListener listen) {
    crawlListeners.removeElement (listen);
}

/**
 * Adds a listener to the set of LinkListeners for this crawler.
 * If the listener is already found in the set, does nothing.
 *
 * @param listen a listener
 */
public void addLinkListener (LinkListener listen) {
    if (!linkListeners.contains (listen))
        linkListeners.addElement (listen);
}

/**
 * Removes a listener from the set of LinkListeners. If it is not found in the set,
 * does nothing.
 *
 * @param listen a listener
 */
public void removeLinkListener (LinkListener listen) {
    linkListeners.removeElement (listen);
}

/**
 * Send a CrawlEvent to all CrawlListeners registered with this crawler.
 * Dispatches to the listener method matching the event id; unknown ids
 * are silently ignored.
 * @param id Event id
 */
protected void sendCrawlEvent (int id) {
    CrawlEvent evt = new CrawlEvent (this, id);
    for (int j=0, len=crawlListeners.size(); j<len; ++j) {
        CrawlListener listen = (CrawlListener)crawlListeners.elementAt(j);
        switch (id) {
            case CrawlEvent.STARTED:
                listen.started (evt);
                break;
            case CrawlEvent.STOPPED:
                listen.stopped (evt);
                break;
            case CrawlEvent.CLEARED:
                listen.cleared (evt);
                break;
            case CrawlEvent.TIMED_OUT:
                listen.timedOut (evt);
                break;
            case CrawlEvent.PAUSED:
                listen.paused (evt);
                break;
        }
    }
}

/**
 * Send a LinkEvent to all LinkListeners registered with this crawler.
 * Also records the event id as the link's current status.
 * @param l Link related to event
 * @param id Event id
 */
protected void sendLinkEvent (Link l, int id) {
    LinkEvent evt = new LinkEvent (this, id, l);
    l.setStatus (id);
    for (int j=0, len=linkListeners.size(); j<len; ++j) {
        LinkListener listen = (LinkListener)linkListeners.elementAt(j);
        listen.crawled (evt);
    }
}

/**
 * Send an exceptional LinkEvent to all LinkListeners registered with this crawler.
 * Records the event id as the link's status and attaches the exception's
 * string form to the link under the "exception" label.
 * @param l Link related to event
 * @param id Event id
 * @param exception Exception associated with event
 */
protected void sendLinkEvent (Link l, int id, Throwable exception) {
    LinkEvent evt = new LinkEvent (this, id, l, exception);
    l.setStatus (id);
    l.setLabel ("exception", exception.toString ());
    for (int j=0, len=linkListeners.size(); j<len; ++j) {
        LinkListener listen = (LinkListener)linkListeners.elementAt(j);
        listen.crawled (evt);
    }
}

/*
 * Visited pages table
 */

/**
 * Test whether the page corresponding to a link has been visited
 * (or queued for visiting). Keyed on the page URL's string form.
 * @param link Link to test
 * @return true if link has been passed to walk() during this crawl
 */
public boolean visited (Link link) {
    return visitedPages.containsKey (link.getPageURL().toString());
}

/**
 * Register that a link has been visited.
 * @param link Link that has been visited
 */
protected void markVisited (Link link) {
    // the map value is unused; `this` is stored only as a non-null marker
    visitedPages.put (link.getPageURL().toString(), this);
}

/**
 * Clear the set of visited links.
 */
protected void clearVisited () {
    visitedPages.clear ();
}

/*
 * Fetch loop
 */

/**
 * Worker loop executed by each Worm thread: repeatedly pulls the
 * highest-priority link off fetchQueue, downloads it (with an optional
 * per-download timeout enforced by a WormTimer), and either processes it
 * directly (asynchronous mode) or wakes the main thread to process it in
 * priority order (synchronous mode). Returns when the worm is killed
 * (w.dead set by die()/timeout handling).
 */
void fetch (Worm w) {
    Timer timer = new WormTimer (w);
    while (!w.dead) {
        //System.err.println (w + ": fetching a link");
        // pull the highest-priority link from the fetch queue
        synchronized (fetchQueue) {
            while (!w.dead && (w.link = (Link)fetchQueue.deleteMin ()) == null) {
                try {
                    fetchQueue.wait ();
                    // NOTE(review): InterruptedException is swallowed and the
                    // interrupt flag is not restored; the loop simply re-waits.
                } catch (InterruptedException e) {}
            }
        }
        if (w.dead)
            return;
        //System.err.println (w + ": processing " + w.link.toDescription());
        try {
            // download the link to get a page
            DownloadParameters dp;
            Page page;
            dp = w.link.getDownloadParameters();
            if (dp == null)
                dp = this.dp; // fall back to the crawler-wide parameters
            int timeout = dp.getDownloadTimeout();
            sendLinkEvent (w.link, LinkEvent.RETRIEVING);
            try {
                if (timeout > 0)
                    timer.set (timeout*1000, false); // seconds -> milliseconds
                if (dp.getObeyRobotExclusion()
                        && robotExclusion.disallowed (w.link.getURL()))
                    throw new IOException ("disallowed by Robot Exclusion Standard (robots.txt)");
                page = new Page (w.link, new HTMLParser(dp));
            }
            finally {
                timer.cancel ();
            }
            if (w.dead)
                return;
            sendLinkEvent (w.link, LinkEvent.DOWNLOADED);
            if (synchronous) {
                // Synchronous mode.
                // Main thread will call process() when
                // this link's turn arrives (in priority order).
                // Wake up the main thread.
                synchronized (crawlQueue) {
                    crawlQueue.notify ();
                }
            } else {
                // Asynchronous mode.
                // Each worm calls process() on its link.
                process (w.link);
            }
            w.link = null;
            // loop around and fetch another link
        } catch (ThreadDeath e) {
            throw e; // have to continue dying
        } catch (Throwable e) {
            // Some other exception occurred, either during the page fetch
            // or in some user code. Mark up the link with the error.
            if (w.dead)
                return;
            sendLinkEvent (w.link, LinkEvent.ERROR, e);
            // drop the failed link from the crawl queue and wake any waiter
            synchronized (crawlQueue) {
                crawlQueue.delete (w.link);
                --numPagesLeft;
                w.link = null;
                crawlQueue.notify ();
            }
        }
    }
}

/**
 * Process one downloaded link's page: run all registered classifiers over
 * it (in priority order), invoke the page predicate/action/visit callbacks,
 * expand the page into new links, fire the VISITED event, then remove the
 * link from the crawl queue and wake any thread waiting on it.
 */
void process (Link link) {
    Page page = link.getPage ();
    // classify the page
    for (int j=0, len=classifiers.size(); j<len; ++j) {
        Classifier cl = (Classifier)classifiers.elementAt(j);
        cl.classify (page);
    }
    // invoke callbacks on the page
    ++numPagesVisited;
    if (pagePredicate == null || pagePredicate.shouldActOn (page)) {
        if (action != null)
            action.visit (page);
        visit (page);
    }
    expand (page);
    // send out the event
    sendLinkEvent (link, LinkEvent.VISITED);
    // discard link
    synchronized (crawlQueue) {
        crawlQueue.delete (link);
        --numPagesLeft;
        crawlQueue.notify ();
    }
}

/**
 * Timeout handler invoked by WormTimer.alarm() when a download exceeds its
 * time limit: kills the stuck worm, reports the link as an error, removes
 * it from the crawl queue, and starts a replacement worm in the same slot.
 */
void fetchTimedOut (Worm w, int interval) {
    if (w.dead)
        return;
    w.die ();
    sendLinkEvent (w.link, LinkEvent.ERROR,
                   new IOException ("Timeout after " + interval + " seconds"));
    synchronized (crawlQueue) {
        crawlQueue.delete (w.link);
        --numPagesLeft;
        // replace the killed worm with a fresh thread in the same array slot
        worms[w.i] = new Worm (this, w.i);
        worms[w.i].start ();
        crawlQueue.notify ();
    }
}

//#ifdef JDK1.1
// FIX: more error checking here
/**
 * Deserializes a saved Crawler from the file named by args[0], attaches an
 * EventLog monitor, and runs it.
 * NOTE(review): args[0] is not validated and the stream is deserialized
 * without any filtering — only run on trusted input files.
 */
public static void main (String[] args) throws Exception {
    java.io.ObjectInputStream in =
        new java.io.ObjectInputStream (new java.io.FileInputStream (args[0]));
    Crawler loadedCrawler = (Crawler)in.readObject ();
    in.close ();
    EventLog.monitor (loadedCrawler);
    loadedCrawler.run ();
}
//#endif JDK1.1
} // end class Crawler

/* Simple Thread subclass that invokes a crawler's fetch loop. */
class Worm extends Thread {
    Crawler crawler;      // crawler in charge of this worm
    int i;                // index of this worm in crawler.worms[]
    Link link;            // link this worm is currently working on
    // NOTE(review): `dead` is read/written from multiple threads but is not
    // volatile — visibility of the flag across threads is not guaranteed.
    boolean dead = false; // true if this worm has been killed

    public Worm (Crawler crawler, int i) {
        super (crawler.getName() + " worm " + i);
        setDaemon (true); // don't keep the JVM alive for worm threads
        this.crawler = crawler;
        this.i = i;
    }

    public void run () {
        crawler.fetch (this);
    }

    // NOTE(review): Thread.stop() is deprecated and inherently unsafe (it can
    // leave shared state inconsistent); the dead flag plus cooperative checks
    // in fetch() would be the safer mechanism — confirm before changing.
    public void die () {
        dead = true;
        stop ();
    }
}

/* Per-worm timer: on expiry, reports a fetch timeout for the worm's link. */
class WormTimer extends Timer {
    Worm worm;

    public WormTimer (Worm worm) {
        this.worm = worm;
    }

    protected void alarm () {
        // getInterval() is in milliseconds; fetchTimedOut expects seconds
        worm.crawler.fetchTimedOut (worm, getInterval()/1000);
    }
}

/* Whole-crawl timer: on expiry, notifies the crawler that it timed out. */
class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer (Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm () {
        crawler.timedOut ();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -