📄 sitespider.java
字号:
package ir.webutils;

import java.util.*;
import java.net.*;
import java.io.*;

/**
 * A spider that limits itself to a given site.
 *
 * @author Ray Mooney
 */
public class SiteSpider extends Spider {

  /**
   * Gets links from the given page that are on the same host as the
   * page.
   *
   * <p>The list obtained from <code>page.getOutLinks()</code> is filtered
   * in place (off-host links are removed through the iterator) and that
   * same list object is returned.
   *
   * <p>Hosts are compared ignoring case, because host names are
   * case-insensitive: a link to <code>WWW.Example.com</code> found on a
   * page served from <code>www.example.com</code> is kept.
   *
   * @param page the page whose out-links are to be filtered.
   * @return A list of links on <code>page</code> that have the same
   * host as the URL of <code>page</code>.
   */
  public List getNewLinks(HTMLPage page) {
    List links = page.getOutLinks();
    URL url = page.getLink().getURL();
    ListIterator iterator = links.listIterator();
    while (iterator.hasNext()) {
      Link link = (Link) iterator.next();
      String linkHost = link.getURL().getHost();
      // Host names are case-insensitive, so a plain equals() would
      // wrongly drop same-site links that differ only in letter case.
      if (!url.getHost().equalsIgnoreCase(linkHost)) {
        iterator.remove();
      }
    }
    return links;
  }

  /**
   * Spider the web according to the following command options,
   * but stay within the given site (same URL host).
   * <ul>
   * <li>-safe : Check for and obey robots.txt and robots META tag
   * directives.</li>
   * <li>-d &lt;directory&gt; : Store indexed files in &lt;directory&gt;.</li>
   * <li>-c &lt;count&gt; : Store at most &lt;count&gt; files.</li>
   * <li>-u &lt;url&gt; : Start at &lt;url&gt;.</li>
   * <li>-slow : Pause briefly before getting a page. This can be
   * useful when debugging.</li>
   * </ul>
   *
   * @param args the command-line options listed above; they are passed
   * unchanged to <code>go</code>.
   */
  public static void main(String[] args) {
    new SiteSpider().go(args);
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -