// NoRobots - implements the Robot Exclusion Standard
//
// Copyright (C)1996,1998 by Jef Poskanzer <jef@acme.com>.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
/**
 * Moved to the net.matuschek.spider package by Daniel Matuschek,
 * who made some minimal modifications to use HttpTool for the
 * retrieval of robots.txt.
 */
package net.matuschek.spider;
import java.io.*;
import java.net.*;
import java.util.*;
import org.apache.log4j.Category;
import net.matuschek.http.*;
/**
 * Implements the Robot Exclusion Standard.
 * <P>
 * The basic idea of the Robot Exclusion Standard is that each web server
 * can set up a single file called "/robots.txt" which contains pathnames
 * that robots should not look at.
 * See <A HREF="http://www.robotstxt.org/wc/norobots.html">the full spec</A>
 * for details.
 * <P>
 * Using this class is very simple - you create the object using your
 * robot's name and the HttpTool used to retrieve the data, and then you
 * call ok() on each URL. For efficiency, the class caches entries for
 * servers you have visited recently.
 * <P>
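 * A minimal usage sketch (the agent name and URL below are illustrative,
 * not part of the original code):
 * <PRE>
 * HttpTool tool = new HttpTool();
 * tool.setAgentName("MyRobot");
 * NoRobots robots = new NoRobots("MyRobot", tool);
 * // new URL(...) may throw MalformedURLException
 * URL url = new URL("http://example.com/private/page.html");
 * if (robots.ok(url)) {
 *     // allowed by robots.txt - safe to fetch
 * }
 * </PRE>
 * <P>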
 * @author cn
 * @version 0.1
 */
public class NoRobots {

    Category log = Category.getInstance(getClass().getName());

    // The file with the robot rules in it.
    private static final String robotFile = "/robots.txt";

    // The name of this robot.
    private String robotName;

    // A table of all the servers we have visited recently.
    private Hashtable servers = new net.matuschek.util.LruHashtable(500);

    // Tool to get /robots.txt.
    private HttpTool httpTool;

    // If true, skip the robots.txt check entirely.
    private boolean ignore = false;
    /**
     * Constructor.
     * @param robotName the name of the robot
     * @param inhttpTool the HttpTool instance for downloading the robotFile
     */
    public NoRobots(String robotName, HttpTool inhttpTool) {
        this.robotName = robotName;
        this.httpTool = inhttpTool;
        /*
        this.httpTool = new HttpTool();
        httpTool.setAgentName(inhttpTool.getAgentName());
        try {
            httpTool.setProxy(inhttpTool.getProxy());
        } catch (HttpException e) {
            // ignore
        }
        */
    }
    /**
     * Checks whether it is ok for this robot to fetch this URL. Reads the
     * information in the robots.txt file on this host. If a robots.txt
     * file is there and it disallows the robot from retrieving the
     * requested URL, the method returns false.
     * @param url the URL we want to retrieve
     * @return boolean true if allowed to retrieve the URL, false otherwise
     */
    public boolean ok(URL url) {
        // If ignore is set to true, then this check always returns true.
        if (ignore) {
            return true;
        }
        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port == -1) {
            // No explicit port in the URL; use the protocol's default
            // (80 for http) so the cache key and robots.txt URL match.
            port = url.getDefaultPort();
            if (port == -1) {
                port = 80;
            }
        }
        String file = url.getFile();
        Vector disallows = getDisallows(protocol, host, port);
        Enumeration en = disallows.elements();
        while (en.hasMoreElements()) {
            String pattern = (String) en.nextElement();
            if (file.startsWith(pattern))
                return false;
        }
        return true;
    }
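    /*
     * Illustrative example (hypothetical host and paths): if the cached
     * disallow list for http://example.com:80 contains "/private", then
     * ok() returns false for http://example.com/private/data.html
     * (prefix match) and true for http://example.com/public/data.html.
     */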
    /**
     * Method getDisallows.
     * Gets the disallows list for the given server. If it is not already
     * in the servers hash table, we fetch it, parse it, and save it.
     * @param protocol the URL scheme, e.g. "http"
     * @param host the host name
     * @param port the port number
     * @return Vector the list of disallowed path prefixes for this robot
     */
    private Vector getDisallows(String protocol, String host, int port) {
        String key = protocol + "://" + host + ":" + port;
        Vector disallows = (Vector) servers.get(key);
        if (disallows != null)
            return disallows;
        disallows = new Vector();
        try {
            URL robotUrl = new URL(protocol, host, port, robotFile);
            try {
                // Get the document.
                log.debug("Retrieving robot file '" + robotUrl + "'.");
                httpTool.setReferer("-");
                String robotsFile = "";
                try {
                    HttpDoc doc =
                        httpTool.retrieveDocument(
                            robotUrl,
                            HttpConstants.GET,
                            "");
                    // old source: if (doc.isOk()) {
                    if (doc != null && doc.isOk()) {
                        robotsFile = new String(doc.getContent());
                    }
                } catch (HttpException e) {
                    // Ignore HTTP errors.
                    log.info("Cannot read robots.txt: " + e.getMessage());
                }
                BufferedReader robotReader =
                    new BufferedReader(new StringReader(robotsFile));
                boolean userAgentIsMe = false;
                while (true) {
                    String line = robotReader.readLine();
                    if (line == null)
                        break;
                    line = line.trim();
                    // Completely ignore lines that are just a comment - they
                    // don't even separate records.
                    if (line.startsWith("#"))
                        continue;
                    // Trim off any other comments.
                    int cmt = line.indexOf('#');
                    if (cmt != -1)
                        line = line.substring(0, cmt).trim();
                    if (line.length() == 0)
                        userAgentIsMe = false;
                    else if (line.toLowerCase().startsWith("user-agent:")) {
                        if (!userAgentIsMe) {
                            String value = line.substring(11).trim();
                            if (match(value, robotName))
                                userAgentIsMe = true;
                        }
                    } else if (line.toLowerCase().startsWith("disallow:")) {
                        if (userAgentIsMe) {
                            String value = line.substring(9).trim();
                            disallows.addElement(value);
                        }
                    }
                }
            } catch (IOException ignore) {
            }
        } catch (MalformedURLException ignore) {
        }
        servers.put(key, disallows);
        return disallows;
    }
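    /*
     * Illustrative example (hypothetical robots.txt content): given
     *
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /tmp/
     *
     * the parser above matches the "*" record for any robot name, and
     * getDisallows() returns a Vector containing "/cgi-bin/" and "/tmp/".
     */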
    /**
     * Method match.
     * Checks whether a string matches a given wildcard pattern.
     * Only does ? and *, and multiple patterns separated by |.
     * @param pattern the wildcard pattern, e.g. "web*|*crawler"
     * @param string the string to test against the pattern
     * @return boolean
     */
    protected static boolean match(String pattern, String string) {
        for (int p = 0;; ++p) {
            for (int s = 0;; ++p, ++s) {
                boolean sEnd = (s >= string.length());
                boolean pEnd =
                    (p >= pattern.length() || pattern.charAt(p) == '|');
                if (sEnd && pEnd)
                    return true;
                if (sEnd || pEnd)
                    break;
                if (pattern.charAt(p) == '?')
                    continue;
                if (pattern.charAt(p) == '*') {
                    int i;
                    ++p;
                    for (i = string.length(); i >= s; --i)
                        if (match(pattern.substring(p),
                                  string.substring(i))) /* not quite right */
                            return true;
                    break;
                }
                if (pattern.charAt(p) != string.charAt(s))
                    break;
            }
            // This alternative failed; try the next |-separated one.
            p = pattern.indexOf('|', p);
            if (p == -1)
                return false;
        }
    }
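    /*
     * Illustrative examples of the matcher above (all hypothetical
     * robot names):
     *   match("WebCrawler|Scooter", "Scooter")  -> true  ('|' alternatives)
     *   match("web*", "webcrawler")             -> true  ('*' matches any run)
     *   match("r?bot", "robot")                 -> true  ('?' matches one char)
     *   match("googlebot", "mybot")             -> false
     * Note that matching is case-sensitive.
     */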
    /**
     * Method getIgnore.
     * Tells whether the robot exclusion standard is ignored.
     * @return boolean true if the check on robots.txt is not done
     */
    public boolean getIgnore() {
        return ignore;
    }

    /**
     * Method setIgnore.
     * Sets whether the robot exclusion standard is ignored.
     * @param ignore if ignore is true then the robot exclusion standard is
     * ignored
     */
    public void setIgnore(boolean ignore) {
        this.ignore = ignore;
    }
    /**
     * This method finishes the HttpTool.
     */
    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
    }
}