📄 directoryspider.java
字号:
package ir.webutils;

import java.util.*;
import java.net.*;
import java.io.*;

/**
 * Spider that limits itself to the directory it started in.
 *
 * @author Ted Wild and Ray Mooney
 */
public class DirectorySpider extends Spider {

    /** The starting URL given via "-u"; crawling is restricted to its directory subtree. */
    static URL firstURL;

    /**
     * Gets links from the page that are in or below the starting
     * directory. Links to other hosts, or to paths outside the start
     * directory, are removed from the returned list.
     *
     * @param page The page whose out-links are filtered.
     * @return The links on <code>page</code> that are in or below the
     *         directory of the first page.
     */
    public List getNewLinks(HTMLPage page) {
        // Raw List/ListIterator retained to stay compatible with the
        // (pre-generics) Spider superclass signature.
        List links = page.getOutLinks();
        URL url = page.getLink().getURL();
        // Hoisted out of the loop: the start directory never changes
        // while iterating, so compute it once.
        String startDirectory = getDirectory(firstURL);
        ListIterator iterator = links.listIterator();
        while (iterator.hasNext()) {
            Link link = (Link) iterator.next();
            if (!url.getHost().equals(link.getURL().getHost()))
                iterator.remove();                       // different host
            else if (!link.getURL().getPath().startsWith(startDirectory))
                iterator.remove();                       // outside start directory
        }
        return links;
    }

    /**
     * Sets the initial URL from the "-u" argument, then calls the
     * corresponding superclass method.
     *
     * @param value The value of the "-u" command line argument.
     */
    protected void handleUCommandLineOption(String value) {
        try {
            firstURL = new URL(value);
        }
        catch (MalformedURLException e) {
            // Errors belong on stderr, not stdout.
            System.err.println(e.toString());
            System.exit(-1);
        }
        super.handleUCommandLineOption(value);
    }

    /**
     * Returns the directory portion of a URL path. If the final path
     * segment looks like a file name (contains a '.'), it is stripped;
     * otherwise the whole path is treated as a directory.
     *
     * @param u The URL whose path is examined.
     * @return The directory part of <code>u</code>'s path, without a
     *         trailing slash when a file segment was stripped.
     */
    private String getDirectory(URL u) {
        String directory = u.getPath();
        int lastSlash = directory.lastIndexOf('/');
        // Only treat the path as a file if the LAST segment contains a dot.
        // The previous check matched a dot anywhere in the path, which
        // wrongly truncated directories such as "/docs.v2/guide", and the
        // unguarded substring(0, lastIndexOf("/")) could throw when the
        // path contained a dot but no slash.
        if (lastSlash >= 0 && directory.indexOf('.', lastSlash + 1) != -1)
            directory = directory.substring(0, lastSlash);
        return directory;
    }

    /**
     * Spider the web according to the following command options,
     * but only below the start URL directory.
     * <ul>
     * <li>-safe : Check for and obey robots.txt and robots META tag
     * directives.</li>
     * <li>-d &lt;directory&gt; : Store indexed files in &lt;directory&gt;.</li>
     * <li>-c &lt;count&gt; : Store at most &lt;count&gt; files.</li>
     * <li>-u &lt;url&gt; : Start at &lt;url&gt;.</li>
     * <li>-slow : Pause briefly before getting a page. This can be
     * useful when debugging.</li>
     * </ul>
     */
    public static void main(String[] args) {
        new DirectorySpider().go(args);
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -