📄 crawler.java
/**
 * Callback for visiting a page.  Default version does nothing.
 *
 * @param page Page retrieved by the crawler
 */
public void visit (Page page) {
}

/**
 * Callback for testing whether a link should be traversed.
 * Default version returns true for all links.  Override this method
 * for more interesting behavior.
 *
 * @param l Link encountered by the crawler
 * @return true if link should be followed, false if it should be ignored.
 */
public boolean shouldVisit (Link l) {
    return true;
}

/**
 * Expand the crawl from a page.  The default implementation of this
 * method tests every link on the page using shouldVisit(), and
 * submit()s the links that are approved.  A subclass may want to override
 * this method if it's inconvenient to consider the links individually
 * with shouldVisit().
 * @param page Page to expand
 */
public void expand (Page page) {
    // examine each link on the page
    Link[] links = page.getLinks();

    if (links != null && links.length > 0) {
        // give each link a default priority based on its page
        // and position on page
        float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
        float increment = 1.0f/links.length;

        for (int i=0; i<links.length; ++i) {
            Link l = links[i];

            // set default download parameters
            l.setPriority (priority);
            priority += increment;
            l.setDownloadParameters (dp);

            ++numLinksTested;
            if (ignoreVisitedLinks && visited (l))
                // FIX: use atomic test-and-set
                // FIX: set l.page somehow?
                sendLinkEvent (l, LinkEvent.ALREADY_VISITED);
            else if (!((type == null || l.hasAnyLabels (type))
                       && (domain == null || l.hasAnyLabels (domain))
                       && (linkPredicate == null || linkPredicate.shouldVisit (l))
                       && shouldVisit (l)))
                sendLinkEvent (l, LinkEvent.SKIPPED);
            else if (page.getDepth() >= maxDepth)
                sendLinkEvent (l, LinkEvent.TOO_DEEP);
            else
                submit (l);
        }
    }
}

/*
 * Crawl statistics
 */

/**
 * Get number of pages visited.
 * @return number of pages passed to visit() so far in this crawl
 */
public int getPagesVisited() {
    return numPagesVisited;
}

/**
 * Get number of links tested.
 * @return number of links passed to shouldVisit() so far in this crawl
 */
public int getLinksTested() {
    return numLinksTested;
}

/**
 * Get number of pages left to be visited.
 * @return number of links approved by shouldVisit() but not yet visited
 */
public int getPagesLeft() {
    return numPagesLeft;
}

/**
 * Get number of threads currently working.
 * @return number of threads downloading pages
 */
public int getActiveThreads () {
    Worm[] w = worms;
    if (w == null)
        return 0;

    int n = 0;
    for (int i=0; i<w.length; ++i)
        if (w[i] != null && w[i].link != null)
            ++n;
    return n;
}

/*
 * Crawler parameters
 */

/**
 * Get human-readable name of crawler.  Default value is the
 * class name, e.g., "Crawler".  Useful for identifying the crawler in a
 * user interface; also used as the default User-agent for identifying
 * the crawler to a remote Web server.  (The User-agent can be
 * changed independently of the crawler name with setDownloadParameters().)
 * @return human-readable name of crawler
 */
public String getName () {
    return name;
}

/**
 * Set human-readable name of crawler.
 * @param name new name for crawler
 */
public void setName (String name) {
    this.name = name;
}

/**
 * Convert the crawler to a String.
 * @return Human-readable name of crawler.
 */
public String toString () {
    return getName ();
}

/**
 * Get starting points of crawl as an array of Link objects.
 * @return array of Links from which crawler will start its next crawl.
 */
public Link[] getRoots () {
    if (roots == null)
        return new Link[0];

    Link[] result = new Link[roots.length];
    System.arraycopy (roots, 0, result, 0, roots.length);
    return result;
}

/**
 * Get roots of last crawl.  May differ from getRoots()
 * if new roots have been set.
 * @return array of Links from which crawler started its last crawl,
 * or null if the crawler was cleared.
 */
public Link[] getCrawledRoots () {
    if (crawledRoots == null)
        return null;

    Link[] result = new Link[crawledRoots.length];
    System.arraycopy (crawledRoots, 0, result, 0, crawledRoots.length);
    return result;
}

/**
 * Get starting points of crawl as a String of newline-delimited URLs.
 * @return URLs where crawler will start, separated by newlines.
 */
public String getRootHrefs () {
    StringBuffer buf = new StringBuffer ();
    if (roots != null) {
        for (int i=0; i<roots.length; ++i) {
            if (buf.length() > 0)
                buf.append ('\n');
            buf.append (roots[i].getURL().toExternalForm());
        }
    }
    return buf.toString ();
}

/**
 * Set starting points of crawl as a string of whitespace-delimited URLs.
 * @param hrefs URLs of starting points, separated by space, \t, or \n
 * @exception java.net.MalformedURLException if any of the URLs is invalid,
 * leaving starting points unchanged
 */
public void setRootHrefs (String hrefs) throws MalformedURLException {
    Vector v = new Vector ();
    StringTokenizer tok = new StringTokenizer (hrefs);
    while (tok.hasMoreElements ())
        v.addElement (new Link (tok.nextToken()));
    roots = new Link[v.size()];
    v.copyInto (roots);
}

/**
 * Set starting point of crawl as a single Link.
 * @param link starting point
 */
public void setRoot (Link link) {
    roots = new Link[1];
    roots[0] = link;
}

/**
 * Set starting points of crawl as an array of Links.
 * @param links starting points
 */
public void setRoots (Link[] links) {
    roots = new Link[links.length];
    System.arraycopy (links, 0, roots, 0, links.length);
}

/**
 * Add a root to the existing set of roots.
 * @param link starting point to add
 */
public void addRoot (Link link) {
    if (roots == null)
        setRoot (link);
    else {
        Link newroots[] = new Link[roots.length+1];
        System.arraycopy (roots, 0, newroots, 0, roots.length);
        newroots[newroots.length-1] = link;
        roots = newroots;
    }
}

/**
 * Get crawl domain.  Default value is WEB.
 * @return WEB, SERVER, or SUBTREE.
 */
public String[] getDomain () {
    return domain;
}

/**
 * Set crawl domain.
 * @param domain one of WEB, SERVER, or SUBTREE.
 */
public void setDomain (String[] domain) {
    this.domain = domain;
}

/**
 * Get legal link types to crawl.  Default value is HYPERLINKS.
 * @return HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
 */
public String[] getLinkType () {
    return type;
}

/**
 * Set legal link types to crawl.
 * @param type one of HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
 */
public void setLinkType (String[] type) {
    this.type = type;
}

/**
 * Get depth-first search flag.  Default value is true.
 * @return true if search is depth-first, false if search is breadth-first.
 */
public boolean getDepthFirst() {
    return depthFirst;
}

/**
 * Set depth-first search flag.  If neither depth-first nor breadth-first
 * is desired, then override shouldVisit() to set a custom priority on
 * each link.
 * @param useDFS true if search should be depth-first, false if search should be breadth-first.
 */
public void setDepthFirst(boolean useDFS) {
    depthFirst = useDFS;
}

/**
 * Get synchronous flag.  Default value is false.
 * @return true if crawler must visit the pages in priority order; false if crawler can visit
 * pages in any order.
 */
public boolean getSynchronous() {
    return synchronous;
}

/**
 * Set synchronous flag.
 * @param f true if crawler must visit the pages in priority order; false if crawler can visit
 * pages in any order.
 */
public void setSynchronous(boolean f) {
    synchronous = f;
}

/**
 * Get ignore-visited-links flag.  Default value is true.
 * @return true if search skips links whose URLs have already been visited
 * (or queued for visiting).
 */
public boolean getIgnoreVisitedLinks() {
    return ignoreVisitedLinks;
}

/**
 * Set ignore-visited-links flag.
 * @param f true if search skips links whose URLs have already been visited
 * (or queued for visiting).
 */
public void setIgnoreVisitedLinks(boolean f) {
    ignoreVisitedLinks = f;
}

/**
 * Get maximum depth.  Default value is 5.
 * @return maximum depth of crawl, in hops from starting point.
 */
public int getMaxDepth() {
    return maxDepth;
}

/**
 * Set maximum depth.
 * @param maxDepth maximum depth of crawl, in hops from starting point
 */
public void setMaxDepth(int maxDepth) {
    this.maxDepth = maxDepth;
}

/**
 * Get download parameters (such as number of threads, timeouts, maximum
 * page size, etc.)
 */
public DownloadParameters getDownloadParameters() {
    return dp;
}

/**
 * Set download parameters (such as number of threads, timeouts, maximum
 * page size, etc.)
 * @param dp Download parameters
 */
public void setDownloadParameters(DownloadParameters dp) {
    this.dp = dp;
}

/**
 * Set link predicate.  This is an alternative way to
 * specify the links to walk.  If the link predicate is
 * non-null, then only links that satisfy
 * the link predicate AND shouldVisit() are crawled.
 * @param pred Link predicate
 */
public void setLinkPredicate (LinkPredicate pred) {
    if (pred == linkPredicate
        || (pred != null && pred.equals (linkPredicate)))
        return;
    if (linkPredicate != null)
        linkPredicate.disconnected (this);
    linkPredicate = pred;
    if (linkPredicate != null)
        linkPredicate.connected (this);
}

/**
 * Get link predicate.
 * @return current link predicate
 */
public LinkPredicate getLinkPredicate () {
    return linkPredicate;
}

/**
 * Set page predicate.  This is a way to filter the pages
 * passed to visit().  If the page predicate is
 * non-null, then only pages that satisfy it are passed to visit().
 * @param pred Page predicate
 */
public void setPagePredicate (PagePredicate pred) {
    if (pred == pagePredicate
        || (pred != null && pred.equals (pagePredicate)))
        return;
    if (pagePredicate != null)
        pagePredicate.disconnected (this);
    pagePredicate = pred;
    if (pagePredicate != null)
        pagePredicate.connected (this);
}

/**
 * Get page predicate.
 * @return current page predicate
 */
public PagePredicate getPagePredicate () {
    return pagePredicate;
}

/**
 * Set the action.  This is an alternative way to specify
 * an action performed on every page.  If act is non-null,
 * then every page passed to visit() is also passed to this
 * action.
 * @param act Action
 */
public void setAction (Action act) {
    if (act == action
        || (act != null && act.equals (action)))
        return;
    if (action != null)
        action.disconnected (this);
    action = act;
    if (action != null)
        action.connected (this);
}

/**
 * Get action.
 * @return current action
 */
public Action getAction () {
    return action;
}

/*
 * Link queue management
 */

/**
 * Puts a link into the crawling queue.  If the crawler is running, the
 * link will eventually be retrieved and passed to visit().
 * @param link Link to put in queue
 */
public void submit (Link link) {
    markVisited (link); // FIX: need atomic test-and-set of visited flag
    synchronized (crawlQueue) {
        synchronized (fetchQueue) {
            crawlQueue.put (link);
            ++numPagesLeft;
            fetchQueue.put (link);
            fetchQueue.notifyAll (); // wake up worms
        }
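
The methods above are the crawler's extension points: override visit() to process each downloaded page and shouldVisit() to filter links, then configure the roots, depth, and search order through the setters. Below is a minimal sketch of such a subclass, assuming only the Crawler/Link/Page API shown in this listing; the SameHostCrawler class name, the example.com root URL, the same-host rule, and the run() call used to start the crawl are illustrative assumptions and are not part of this excerpt.

// A minimal sketch, assuming the Crawler/Link/Page API shown above.
// SameHostCrawler, the example.com root, and run() are illustrative assumptions.
import java.net.MalformedURLException;

public class SameHostCrawler extends Crawler {

    // Process each retrieved page; here we only report its depth.
    public void visit (Page page) {
        System.out.println ("visited a page at depth " + page.getDepth ());
    }

    // Follow only links that stay on the assumed starting host.
    public boolean shouldVisit (Link l) {
        return "example.com".equals (l.getURL ().getHost ());
    }

    public static void main (String[] args) throws MalformedURLException {
        SameHostCrawler crawler = new SameHostCrawler ();
        crawler.setRoot (new Link ("http://example.com/")); // starting point (assumed URL)
        crawler.setMaxDepth (3);              // at most 3 hops from the root
        crawler.setDepthFirst (false);        // breadth-first link priorities in expand()
        crawler.setIgnoreVisitedLinks (true); // skip URLs already queued or visited
        crawler.run ();                       // assumed entry point; not shown in this excerpt
    }
}

With ignoreVisitedLinks left at its default of true, expand() and submit() together aim to queue each URL at most once, subject to the race condition flagged by the FIX comments about an atomic test-and-set of the visited flag.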