
norobots.java

The source code of a real web crawler. Read it carefully and write up your own notes on what you learn.
Language: Java
// NoRobots - implements the Robot Exclusion Standard
//
// Copyright (C)1996,1998 by Jef Poskanzer <jef@acme.com>.  
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.

/**
 * Moved to the net.matuschek.spider package by Daniel Matuschek;
 * did some minimal modifications to use HttpTool for retrieval of robots.txt.
 */

package net.matuschek.spider;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.log4j.Category;

import net.matuschek.http.*;

/** 
 * Implements the Robot Exclusion Standard.
 * <P>
 * The basic idea of the Robot Exclusion Standard is that each web server
 * can set up a single file called "/robots.txt" which contains pathnames
 * that robots should not look at. 
 * See <A HREF="http://www.robotstxt.org/wc/norobots.html">the full spec</A>
 * for details.
 * Using this class is very simple - you create the object using your robot's
 * name and the HttpTool used to retrieve the data, and then you call ok() on
 * each URL.  For efficiency, the class caches entries for servers you've
 * visited recently.
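 * <p>
 * A minimal usage sketch (the HttpTool setup below is an assumption shown
 * only for illustration):
 * <pre>
 *   HttpTool tool = new HttpTool();
 *   tool.setAgentName("MyRobot");
 *   NoRobots robots = new NoRobots("MyRobot", tool);
 *   URL url = new URL("http://www.example.com/private/page.html");
 *   if (robots.ok(url)) {
 *       // allowed to fetch this URL according to the host's /robots.txt
 *   }
 * </pre>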
 * <p>
 * @author cn
 * @version 0.1
 */
public class NoRobots {

	Category log = Category.getInstance(getClass().getName());
		
	// The file with the robot rules in it.
	private static final String robotFile = "/robots.txt";

	// The name of this robot.
	private String robotName;

	// A table of all the servers we have visited recently.
	private Hashtable servers = new net.matuschek.util.LruHashtable(500);

	// tool to get /robots.txt
	private HttpTool httpTool;
	private boolean ignore = false;

	/**
	 * Constructor.
	 * @param robotName the name of the robot
	 * @param inhttpTool the HttpTool instance for downloading the robotFile
	 */
	public NoRobots(String robotName, HttpTool inhttpTool) {
		this.robotName = robotName;
		this.httpTool = inhttpTool;
		/*
		this.httpTool = new HttpTool();
		httpTool.setAgentName(inhttpTool.getAgentName());
		try{
			httpTool.setProxy(inhttpTool.getProxy());
		} catch (HttpException e){
			// ignore
		}
		*/	
	}
	
	/**
	 * Check whether it is OK for this robot to fetch the given URL. Reads the
	 * information in the robots.txt file on the URL's host. If a robots.txt
	 * file is present and it disallows this robot from retrieving the
	 * requested URL, the method returns false.
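	 * For example (illustrative): if the host's robots.txt disallows the path
	 * "/private" for this robot, then ok() returns false for
	 * http://host/private/index.html, because the URL's file part starts with
	 * the disallowed prefix.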
	 * @param url the url we want to retrieve
	 * @return boolean true if allowed to retrieve the URL, false otherwise
	 */
	public boolean ok(URL url) {
		// if ignore is set to true, then this check returns true
		if (ignore) {
			return true;
		}

		String protocol = url.getProtocol();
		String host = url.getHost();
		int port = url.getPort();
		if (port == -1) {
			port = 80;
		}

		String file = url.getFile();

		Vector disallows = getDisallows(protocol, host, port);
		Enumeration en = disallows.elements();
		while (en.hasMoreElements()) {
			String pattern = (String) en.nextElement();
			if (file.startsWith(pattern))
				return false;
		}
		return true;
	}

	/**
	 * Method getDisallows.
	 * Get the disallows list for the given server.  If it's not already in 
	 * the servers hash table, we fetch it, parse it, and save it.
	 * @param protocol
	 * @param host
	 * @param port
	 * @return Vector
	 */
	private Vector getDisallows(String protocol, String host, int port) {
		String key = protocol + "://" + host + ":" + port;
		Vector disallows = (Vector) servers.get(key);
		if (disallows != null)
			return disallows;

		disallows = new Vector();
		try {
			URL robotUrl = new URL(protocol, host, port, robotFile);
			try {

				// get document
				log.debug("Retrieving robot file '" + robotUrl + "'.");
				httpTool.setReferer("-");
				String robotsFile = "";
				try {
					HttpDoc doc =
						httpTool.retrieveDocument(
							robotUrl,
							HttpConstants.GET,
							"");
					//old source if (doc.isOk()) {
					if (doc != null && doc.isOk()) {
						robotsFile = new String(doc.getContent());
					}
				} catch (HttpException e) {
					// ignore HTTP errors
					log.info("Cannot read robots.txt: " + e.getMessage());
				}

				BufferedReader robotReader =
					new BufferedReader(new StringReader(robotsFile));
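				// Parse the retrieved file line by line. For illustration, a
				// typical robots.txt record looks like:
				//
				//   User-agent: SomeRobot
				//   Disallow: /private
				//   Disallow: /tmp
				//
				// A blank line ends a record and '#' starts a comment; Disallow
				// values are collected only while the most recent User-agent
				// line matches this robot's name (or the wildcard "*").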
				boolean userAgentIsMe = false;
				while (true) {
					String line = robotReader.readLine();
					if (line == null)
						break;
					line = line.trim();

					// Completely ignore lines that are just a comment - they
					// don't even separate records.
					if (line.startsWith("#"))
						continue;

					// Trim off any other comments.
					int cmt = line.indexOf('#');
					if (cmt != -1)
						line = line.substring(0, cmt).trim();

					if (line.length() == 0)
						userAgentIsMe = false;
					else if (line.toLowerCase().startsWith("user-agent:")) {
						if (!userAgentIsMe) {
							String value = line.substring(11).trim();
							if (match(value, robotName))
								userAgentIsMe = true;
						}
					} else if (line.toLowerCase().startsWith("disallow:")) {
						if (userAgentIsMe) {
							String value = line.substring(9).trim();
							disallows.addElement(value);
						}
					}
				}
			} catch (IOException ignore) {
			}
		} catch (MalformedURLException ignore) {
		}

		servers.put(key, disallows);
		return disallows;
	}

	/**
	 * Method match.
	 * Checks whether a string matches a given wildcard pattern.
	 * Only does ? and *, and multiple patterns separated by |.
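	 * <p>
	 * Illustrative examples: match("WebFrag*", "WebFrag/1.0") returns true,
	 * match("foo|bar", "bar") returns true, and match("a?c", "abc") returns
	 * true.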
	 * @param pattern
	 * @param string
	 * @return boolean
	 */
	protected static boolean match(String pattern, String string) {
		for (int p = 0;; ++p) {
			for (int s = 0;; ++p, ++s) {
				boolean sEnd = (s >= string.length());
				boolean pEnd =
					(p >= pattern.length() || pattern.charAt(p) == '|');
				if (sEnd && pEnd)
					return true;
				if (sEnd || pEnd)
					break;
				if (pattern.charAt(p) == '?')
					continue;
				if (pattern.charAt(p) == '*') {
					int i;
					++p;
					for (i = string.length(); i >= s; --i)
						if (match(pattern.substring(p),
							string.substring(i))) /* not quite right */
							return true;
					break;
				}
				if (pattern.charAt(p) != string.charAt(s))
					break;
			}
			p = pattern.indexOf('|', p);
			if (p == -1)
				return false;
		}
	}

	/**
	 * Method getIgnore.
	 * Tells whether the robot exclusion standard is ignored.
	 * @return boolean true if the check on robots.txt is not done
	 */
	public boolean getIgnore() {
		return ignore;
	}

	/**
	 * Method setIgnore.
	 * Sets whether the robot exclusion standard is ignored.
	 * @param ignore if ignore is true then the robot exclusion standard is 
	 * ignored
	 */
	public void setIgnore(boolean ignore) {
		this.ignore = ignore;
	}

	/**
	 * This method finishes the HttpTool.
	 */
	public void finish() {
		if (httpTool != null) {
			httpTool.finish();
		}
	}
}
