📄 SafeHTMLPageRetriever.java
package ir.webutils;

import java.util.*;
import java.net.*;

/**
 * Keeps track of Robot Exclusion information. Clients can use this
 * class to ensure that they do not access pages prohibited either by
 * the Robots Exclusion Protocol or Robots META tags.
 *
 * @author Ted Wild & Ray Mooney
 */
public final class SafeHTMLPageRetriever extends HTMLPageRetriever {

  private Set disallowed;
  private String currentSite;

  public SafeHTMLPageRetriever() {
    disallowed = new RobotExclusionSet();
    currentSite = "";
  }

  /**
   * Tries to download the given web page. Throws
   * <code>PathDisallowedException</code> if access to the page is
   * prohibited. Also updates Robots Exclusion information based on
   * the new page.
   *
   * @param link The link to the page to download.
   *
   * @return The web page specified by the link's URL.
   *
   * @throws PathDisallowedException If <code>link</code> is
   * disallowed by a robots.txt file or Robots META tag.
   */
  public HTMLPage getHTMLPage(Link link) throws PathDisallowedException {

    // check to make sure access to link is not disallowed
    // (e.g. because of a NOFOLLOW)
    if (disallowed.contains(link.getURL()))
      throw new PathDisallowedException("Access disallowed: " + link);

    // if the URL is for a different site, update the robots.txt information
    if (!currentSite.equals(getSite(link.getURL()))) {
      currentSite = getSite(link.getURL());
      disallowed = new RobotExclusionSet(currentSite);
    }

    // currentSite and disallowed are now up to date for this URL;
    // check to make sure this path is not prohibited by robots.txt
    if (disallowed.contains(link.getURL().getPath()))
      throw new PathDisallowedException("Access disallowed: " + link);

    String page = WebPage.getWebPage(link.getURL());
    RobotsMetaTagParser metaInf = new RobotsMetaTagParser(link.getURL(), page);

    // check for Robots META tags and add new rules
    disallowed.addAll(getPaths(metaInf.parseMetaTags()));

    return new SafeHTMLPage(link, page, metaInf.index());
  }

  // The "site" is the host and port of the URL. This
  // information can be found by stripping any user information
  // off the authority (the part of the URL between the protocol
  // and the path).
  private String getSite(URL url) {
    String site = url.getAuthority();

    if (site.indexOf("@") != -1)
      return site.substring(site.indexOf("@") + 1);
    else
      return site;
  }

  // Convert links into paths so that the RobotExclusionSet will
  // handle them appropriately.
  private List getPaths(List links) {
    List paths = new LinkedList();

    for (Iterator i = links.iterator(); i.hasNext(); )
      paths.add(((Link) i.next()).getURL().getPath());

    return paths;
  }
}
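A minimal usage sketch, not part of the original file: a crawler-style driver might call getHTMLPage on each queued link and catch PathDisallowedException to skip prohibited pages. The SafeRetrieverDemo class, the seed URL, and the assumption that Link can be constructed from a URL string are all hypothetical; only SafeHTMLPageRetriever, HTMLPageRetriever, Link, HTMLPage, and PathDisallowedException come from ir.webutils.

package ir.webutils;

import java.util.*;

// Hypothetical driver showing how a client might use SafeHTMLPageRetriever.
// The crawl queue and seed URL below are illustrative assumptions.
public class SafeRetrieverDemo {

  public static void main(String[] args) {
    HTMLPageRetriever retriever = new SafeHTMLPageRetriever();

    List frontier = new LinkedList();
    // Assumption: Link accepts a URL string; adjust if the constructor differs.
    frontier.add(new Link("http://www.example.com/index.html"));

    while (!frontier.isEmpty()) {
      Link link = (Link) frontier.remove(0);
      try {
        // getHTMLPage refreshes robots.txt rules whenever the site changes
        // and throws if the requested path is disallowed.
        HTMLPage page = retriever.getHTMLPage(link);
        System.out.println("Fetched: " + link);
        // ...process the page text and extract out-links here...
      } catch (PathDisallowedException e) {
        // Skip pages prohibited by robots.txt or Robots META tags.
        System.out.println("Skipped (disallowed): " + link);
      }
    }
  }
}

The raw (pre-generics) collection types mirror the style of the original class; with a newer Java release the frontier would normally be a List<Link>.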