📄 crawler.java
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

import websphinx.util.PriorityQueue;
import websphinx.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
//#endif JDK1.1

/**
 * Web crawler.
 * <P>
 * To write a crawler, extend this class and override
 * shouldVisit() and visit() to create your own crawler.
 * <P>
 * To use a crawler:
 * <OL>
 * <LI>Initialize the crawler by calling
 * setRoot() (or one of its variants) and setting other
 * crawl parameters.
 * <LI>Register any classifiers you need with addClassifier().
 * <LI>Connect event listeners to monitor the crawler,
 * such as websphinx.EventLog, websphinx.workbench.WebGraph,
 * or websphinx.workbench.Statistics.
 * <LI>Call run() to start the crawler.
 * </OL>
 * A running crawler consists of a priority queue of
 * Links waiting to be visited and a set of threads
 * retrieving pages in parallel.  When a page is downloaded,
 * it is processed as follows:
 * <OL>
 * <LI><B>classify()</B>: The page is passed to the classify() method of
 * every registered classifier, in increasing order of
 * their priority values.  Classifiers typically attach
 * informative labels to the page and its links, such as "homepage"
 * or "root page".
 * <LI><B>visit()</B>: The page is passed to the crawler's
 * visit() method for user-defined processing.
 * <LI><B>expand()</B>: The page is passed to the crawler's
 * expand() method to be expanded.  The default implementation
 * tests every unvisited hyperlink on the page with shouldVisit(),
 * and puts each link approved by shouldVisit() into the crawling queue.
 * </OL>
 * By default, when expanding the links of a page, the crawler
 * only considers hyperlinks (not applets or inline images, for instance) that
 * point to Web pages (not mailto: links, for instance).  If you want
 * shouldVisit() to test every link on the page, use setLinkType(Crawler.ALL_LINKS).
 */
public class Crawler implements Runnable
//#ifdef JDK1.1
    , Serializable
//#endif JDK1.1
{
    //#ifdef JDK1.1
    private static final long serialVersionUID = -3757789861952010450L;
    //#endif JDK1.1

    /**
     * Specify WEB as the crawl domain to allow the crawler
     * to visit any page on the World Wide Web.
     */
    public static final String[] WEB = null;

    /**
     * Specify SERVER as the crawl domain to limit the crawler
     * to visit only pages on the same Web server (hostname
     * and port number) as the root link from which it started.
     */
    public static final String[] SERVER = {"local"};

    /**
     * Specify SUBTREE as the crawl domain to limit the crawler
     * to visit only pages which are descendants of the root link
     * from which it started.
     */
    public static final String[] SUBTREE = {"sibling", "descendent"};

    /**
     * Specify HYPERLINKS as the link type to allow the crawler
     * to visit only hyperlinks (A, AREA, and FRAME tags which
     * point to http:, ftp:, file:, or gopher: URLs).
     */
    public static final String[] HYPERLINKS = {"hyperlink"};

    /**
     * Specify HYPERLINKS_AND_IMAGES as the link type to allow the crawler
     * to visit only hyperlinks and inline images.
     */
    public static final String[] HYPERLINKS_AND_IMAGES = {"hyperlink", "image"};

    /**
     * Specify ALL_LINKS as the link type to allow the crawler
     * to visit any kind of link.
     */
    public static final String[] ALL_LINKS = null;

    // Crawler parameters
    private String name = getClass().getName();   // crawler's name
    private transient Link[] roots = null;
    private String[] rootHrefs = null;            // exists only when serializing crawler
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters ()
                                        .changeUserAgent (name);
    private Vector classifiers = new Vector ();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // Transient state
    private transient Link[] crawledRoots = null;

    private transient int state = CrawlEvent.CLEARED;

    private transient Worm[] worms;
        // background threads

    private transient PriorityQueue fetchQueue;
        // links waiting to be downloaded
    private transient PriorityQueue crawlQueue;
        // all links that have been expanded but not
        // processed (used only if crawler is in synchronous mode)

    private transient int numLinksTested;
        // number of links tested by shouldVisit()
    private transient int numPagesVisited;
        // number of pages passed to visit()
    private transient int numPagesLeft;
        // all links that have been expanded but not processed
        // == crawlQueue.size ()

    // FIX: convert to immutable linked lists
    private transient Vector crawlListeners;
        // list of CrawlListeners
    private transient Vector linkListeners;
        // list of LinkListeners

    private transient Hashtable visitedPages;
        // visited pages (a set of URLs)

    private transient RobotExclusion robotExclusion;
        // robot exclusion cache

    /**
     * Make a new Crawler.
     */
    public Crawler () {
        addClassifier (new StandardClassifier());
        init ();
    }

    /*
     * Initialize the transient fields of the crawler.
     */
    private void init () {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector ();
        linkListeners = new Vector ();

        visitedPages = new Hashtable ();
        robotExclusion = new RobotExclusion (getName ());
    }

    /*
     * Write a Crawler to an output stream.
     */
    //#ifdef JDK1.1
    private void writeObject (ObjectOutputStream out)
            throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i=0; i<roots.length; ++i)
                rootHrefs[i] = roots[i].getURL().toString();
        }
        else
            rootHrefs = null;

        out.defaultWriteObject ();

        rootHrefs = null;
    }
    //#endif JDK1.1

    /*
     * Read a Crawler from an input stream.
     */
    //#ifdef JDK1.1
    private void readObject (ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject ();

        if (rootHrefs != null) {
            roots = new Link [rootHrefs.length];
            for (int i=0; i<rootHrefs.length; ++i)
                roots[i] = new Link (rootHrefs[i]);
        }
        else
            roots = null;

        domain = useStandard (WEB, domain);
        domain = useStandard (SERVER, domain);
        domain = useStandard (SUBTREE, domain);
        type = useStandard (HYPERLINKS, type);
        type = useStandard (HYPERLINKS_AND_IMAGES, type);
        type = useStandard (ALL_LINKS, type);

        init ();

        if (linkPredicate != null)
            linkPredicate.connected (this);
        if (pagePredicate != null)
            pagePredicate.connected (this);
        if (action != null)
            action.connected (this);
    }

    private static String[] useStandard (String[] standard, String[] s) {
        if (s == null || standard == null || standard == s)
            return s;
        if (s.length != standard.length)
            return s;
        for (int i=0; i<s.length; ++i)
            if (!s[i].equals (standard[i]))
                return s;
        return standard;
    }
    //#endif JDK1.1

    /**
     * Start crawling.  Returns either when the crawl is done, or
     * when pause() or stop() is called.  Because this method implements the
     * java.lang.Runnable interface, a crawler can be run in a
     * background thread.
     */
    public void run () {
        crawledRoots = roots;

        if (state == CrawlEvent.STOPPED)
            clear ();

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            // give each root a default priority based on its position in the array
            float priority = 0;
            float increment = 1.0f/crawledRoots.length;
            for (int i=0; i<crawledRoots.length; ++i) {
                crawledRoots[i].setPriority (priority);
                priority += increment;
            }
            submit (crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent (state);

        synchronized (crawlQueue) {
            Timer timer = new CrawlTimer (this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0)
                timer.set (timeout*1000, false);

            int nWorms = Math.max (dp.getMaxThreads (), 1);
            worms = new Worm[nWorms];
            for (int i=0; i<nWorms; ++i) {
                worms[i] = new Worm (this, i);
                worms[i].start ();
            }

            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // ran out of links to crawl
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent (state);
                    }
                    else if (synchronous) {
                        // Synchronous mode.
                        // Main thread calls process() on each link
                        // in crawlQueue, in priority order.
                        Link link = (Link)crawlQueue.getMin ();
                        if (link.getStatus () == LinkEvent.DOWNLOADED)
                            process (link);
                        else
                            crawlQueue.wait ();
                    }
                    else
                        // Asynchronous crawling.
                        // Main thread does nothing but wait, while
                        // background threads call process().
                        crawlQueue.wait ();
                }
            } catch (InterruptedException e) {}

            timer.cancel ();

            for (int i=0; i<worms.length; ++i)
                worms[i].die ();

            if (state == CrawlEvent.PAUSED) {
                // put partly-processed links back in fetchQueue
                synchronized (fetchQueue) {
                    for (int i=0; i<worms.length; ++i)
                        if (worms[i].link != null)
                            fetchQueue.put (worms[i].link);
                }
            }

            worms = null;
        }
    }

    /**
     * Initialize the crawler for a fresh crawl.  Clears the crawling queue
     * and sets all crawling statistics to 0.  Stops the crawler
     * if it is currently running.
     */
    public void clear () {
        stop ();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited ();
        if (crawledRoots != null)
            for (int i=0; i < crawledRoots.length; ++i)
                crawledRoots[i].disconnect ();
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent (state);
    }

    /**
     * Pause the crawl in progress.  If the crawler is running, then
     * it finishes processing the current page, then returns.  The queues
     * remain as-is, so calling run() again will resume the crawl exactly
     * where it left off.  pause() can be called from any thread.
     */
    public void pause () {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify ();
            }
            sendCrawlEvent (state);
        }
    }

    /**
     * Stop the crawl in progress.  If the crawler is running, then
     * it finishes processing the current page, then returns.
     * Empties the crawling queue.
     */
    public void stop () {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.STOPPED;
                    fetchQueue.clear ();
                    crawlQueue.clear ();
                    numPagesLeft = 0;
                    crawlQueue.notify ();
                }
            }
            sendCrawlEvent (state);
        }
    }

    /*
     * Timeout the crawl in progress.  Used internally by
     * the CrawlTimer.
     */
    void timedOut () {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.TIMED_OUT;
                    fetchQueue.clear ();
                    crawlQueue.clear ();
                    numPagesLeft = 0;
                    crawlQueue.notify ();
                }
            }
            sendCrawlEvent (state);
        }
    }

    /**
     * Get state of crawler.
     * @return one of CrawlEvent.STARTED, CrawlEvent.PAUSED,
     *         CrawlEvent.STOPPED, or CrawlEvent.CLEARED.
     */
    public int getState () {
        return state;
    }
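The class comment above outlines the intended usage pattern: subclass Crawler, override shouldVisit() and visit(), set a root, and call run(). A minimal sketch of that pattern follows. It is not part of the original file; the class name PrintCrawler and the root URL are made up for illustration, and the shouldVisit(Link)/visit(Page) signatures and the setRoot(Link) variant are assumptions drawn from the Javadoc, since those methods lie outside this excerpt (Link.getURL() is taken from the serialization code above, while Page.getURL() is assumed).

import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

// Hypothetical subclass, sketched from the class Javadoc above.
public class PrintCrawler extends Crawler {

    // Assumed override: approve only links whose URL uses the http: scheme.
    public boolean shouldVisit (Link l) {
        return l.getURL ().getProtocol ().equals ("http");
    }

    // Assumed override: user-defined processing for each downloaded page.
    public void visit (Page p) {
        System.out.println (p.getURL ());
    }

    public static void main (String[] args) throws Exception {
        PrintCrawler crawler = new PrintCrawler ();
        crawler.setRoot (new Link ("http://www.example.com/")); // setRoot(), per the class Javadoc
        crawler.setLinkType (Crawler.HYPERLINKS);               // follow ordinary hyperlinks only
        crawler.run ();                                         // blocks until the crawl is done
    }
}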
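Because run() blocks until the crawl finishes or pause()/stop() is called, and Crawler implements Runnable, a front end can drive the crawl from a worker thread and control it with the methods above. A rough sketch of that control flow, reusing the hypothetical PrintCrawler; the sleep interval and thread handling are illustrative only:

import websphinx.Crawler;
import websphinx.Link;

public class CrawlController {
    public static void main (String[] args) throws Exception {
        Crawler crawler = new PrintCrawler ();                 // hypothetical subclass sketched above
        crawler.setRoot (new Link ("http://www.example.com/"));

        Thread background = new Thread (crawler);              // run() blocks, so run it off the main thread
        background.start ();

        Thread.sleep (10 * 1000);                               // let the crawl proceed for a while
        crawler.pause ();                                       // finishes the current page and keeps the queues
        background.join ();                                     // run() returns once the crawler is paused

        new Thread (crawler).start ();                          // calling run() again resumes where it left off
    }
}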