webrobot.java

Source code for a real web crawler. Read it carefully and write up what you learn from it.
package net.matuschek.spider;

/**
 * This class implements a web robot that does a search through
 * the web starting from a given start document up to a given 
 * search depth.
 * 
 * @author Daniel Matuschek / Oliver Schmidt 
 * @version $Revision: 1.35 $
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;

import org.apache.log4j.Category;
import org.w3c.dom.Element;

public class WebRobot implements Runnable, Cloneable {

	/** the name of the robot */
	private final static String ROBOT_NAME = "JoBo";

	/** the default agent name */
	private final static String AGENT_NAME = 
          ROBOT_NAME+"/1.4 (http://www.matuschek.net/jobo.html)";

	/** the robot exception handler */
	protected RobotExceptionHandler exceptionHandler = 
          new DefaultRobotExceptionHandler();

	/** default maximal search depth */
	private final static int DEFAULT_DEPTH = 10;

	/** the URL where the robot walk starts from */
	protected URL startURL = null;

	/** the host and directory where retrieval started from */
	protected String startDir = "";

	/** maximal search depth */
	protected int maxDepth = DEFAULT_DEPTH;

	/** is it allowed to walk to other hosts than the starting host? */
	protected boolean walkToOtherHosts = false;

	/** DocManager will store or process retrieved documents */
	protected HttpDocManager docManager;

	/** HttpTool will be used to retrieve documents from a web server */
	protected HttpTool httpTool = new HttpTool();

	/** Log4J category for logging */
	protected Category log;

	/** Referer used to retrieve the first document */
	protected String startReferer = "-";

	/** test for robots.txt */
	protected NoRobots robCheck;

	/** current tasks */
	protected TaskList todo = null;

	/** a list of all URLs we got already */
	protected TaskList visited = null;
	
	/** ignore settings in /robots.txt ? */
	protected boolean ignoreRobotsTxt = false;

	/** number of seconds to sleep after every retrieved document */
	protected int sleepTime = 1;

	/** fill out forms */
	protected FormFiller formFiller = new FormFiller();

	/** these URLs can be visited more than once */
	protected Vector visitMany = new Vector();

	/** for callback to the user interface **/
	protected WebRobotCallback webRobotCallback = null;

	/** should we stop robot operation ? **/
	protected boolean stopIt = false;

	/** to check if it is allowed to travel to a given URL **/
	protected URLCheck urlCheck = null;

	/** should the robot suspend the current walk() **/
	protected boolean sleep;

	/** list of allowed URLs (even if walkToOtherHosts is false) **/
	protected Vector allowedURLs = new Vector();

	/** allow travelling the whole host ? */
	protected boolean allowWholeHost = true;

	/** 
	 * maximum document age in seconds, negative value means
	 * no limit 
	 */
	protected long maxDocumentAge = -1; // no limit

	/** 
	 * allow travelling to all subdomains of the start host ? 
	 * @see #setAllowWholeDomain(boolean)
	 */
	protected boolean allowWholeDomain = true;

	/** 
	 * do more flexible tests if the new URL is on the same host
	 * @see #basicURLCheck(URL)
	 */
	protected boolean flexibleHostCheck = false;

	/**
	 * FilterChain to filter the document before storing it
	 */
	protected FilterChain filters = null;

	/**
	 * don't retrieve pages again that are already stored in the DocManager
	 */
	protected boolean allowCaching = true;
	
	/**
	 * Check for documents with the same content
	 */
	protected boolean duplicateCheck = false;
	
	/**
	 * initializes the robot with the default implementation 
	 * of the TaskList interface
	 * 
	 * @param expectedDocumentCount the expected number of documents
	 */
	public WebRobot(int expectedDocumentCount) {
		log = Category.getInstance(getClass().getName());
		content2UrlMap = new HashMap(expectedDocumentCount);
		registerVisitedList(new HashedMemoryTaskList(false,
					expectedDocumentCount));
		registerToDoList(new HashedMemoryTaskList(true,
					expectedDocumentCount));
		this.expectedDocumentCount = expectedDocumentCount;
		this.setAgentName(AGENT_NAME);
	}

	/**
	 * initializes the robot with the default implementation of the TaskList
	 * interface
	 */
	public WebRobot() {
		this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
	}
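	/*
	 * Illustrative usage sketch (not part of the original source; the URL is an
	 * assumption). A robot is created with the default task lists, pointed at a
	 * start URL and then started via run(), since WebRobot implements Runnable.
	 *
	 *   WebRobot robot = new WebRobot();
	 *   robot.setStartURL(new URL("http://www.example.com/"));
	 *   robot.setMaxDepth(2);              // follow links at most two levels deep
	 *   robot.setWalkToOtherHosts(false);  // stay on the start host
	 *   robot.run();
	 */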
	
	/**
	 * Sets the implementation class for the backend task list storage.
	 * WebRobot uses the TaskList interface to store future tasks.
	 *
	 * If you want to use your own TaskList implementation, just call
	 * this method.
	 * 
	 * @param todo TaskList to be used for the "to do" list
	 */
	public void registerToDoList(TaskList todo) {
		this.todo = todo;
	}

	/**
	 * Sets the implementation class for the backend task list storage.
	 * WebRobot uses the TaskList interface to store URLs that have
	 * been retrieved before.
	 *
	 * If you want to use your own TaskList implementation, just call
	 * this method.
	 * 
	 * @param visited TaskList to be used for the list of visited URLs
	 */
	public void registerVisitedList(TaskList visited) {
		this.visited = visited;
	}
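	/*
	 * Illustrative sketch (not part of the original source): plugging in custom
	 * TaskList implementations for the "to do" and "visited" lists, e.g. a
	 * hypothetical DatabaseTaskList that persists tasks instead of keeping them
	 * in memory.
	 *
	 *   TaskList todoList = new DatabaseTaskList();     // hypothetical class
	 *   TaskList visitedList = new DatabaseTaskList();  // hypothetical class
	 *   robot.registerToDoList(todoList);
	 *   robot.registerVisitedList(visitedList);
	 */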

	/**
	 * @return the start URL for this robot
	 */
	public URL getStartURL() {
		return startURL;
	}

	/**
	 * Sets the start URL for this robot
	 * @param startURL the start URL
	 */
	public void setStartURL(URL startURL) {
		String path = startURL.getPath();
		this.startURL = startURL;

		// is it a directory ?
		if (path.endsWith("/")) {
			this.startDir = startURL.getHost() + path;
		} else {
			int pos = path.lastIndexOf("/");
			if (pos < 0) {
				// this happens for URLs without a path
				this.startDir = startURL.getHost() + "/";
			} else {
				this.startDir = startURL.getHost() + path.substring(0, pos + 1);
			}
		}
	}
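	/*
	 * Examples of how startDir is derived by the code above ("host.example" is
	 * just a placeholder host):
	 *
	 *   setStartURL(new URL("http://host.example/docs/"))           -> startDir = "host.example/docs/"
	 *   setStartURL(new URL("http://host.example/docs/index.html")) -> startDir = "host.example/docs/"
	 *   setStartURL(new URL("http://host.example"))                 -> startDir = "host.example/"
	 */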

	/**
	 * @return the maximal allowed search depth
	 */
	public int getMaxDepth() {
		return maxDepth;
	}

	/**
	 * sets the maximal search depth
	 * @param maxDepth
	 */
	public void setMaxDepth(int maxDepth) {
		this.maxDepth = maxDepth;
	}

	/**
	 * Get the value of bandwidth of the used HttpTool
	 * @return value of bandwidth.
	 */
	public int getBandwidth() {
		return httpTool.getBandwidth();
	}

	/**
	 * Set the value of bandwidth of the used HttpTool
	 * @param bandwidth Value to assign to bandwidth.
	 */
	public void setBandwidth(int bandwidth) {
		httpTool.setBandwidth(bandwidth);
	}

	/**
	 * gets the WalkToOtherHost status
	 * @return true if the Robot is allowed to travel to other
	 * hosts than the start host, false otherwise
	 */
	public boolean getWalkToOtherHosts() {
		return walkToOtherHosts;
	}

	/**
	 * sets the WalkToOtherHosts status
	 * @param walkToOtherHosts true if the Robot is allowed to travel to other
	 * hosts than the start host, false otherwise
	 */
	public void setWalkToOtherHosts(boolean walkToOtherHosts) {
		this.walkToOtherHosts = walkToOtherHosts;
	}

	/**
	 * gets the AllowWholeHost value
	 * @return true if the Robot is allowed to travel to the whole 
	 * host where it started from, false otherwise. If false, it is only
	 * allowed to travel to URLs below the start URL
	 */
	public boolean getAllowWholeHost() {
		return allowWholeHost;
	}

	/**
	 * sets the AllowWholeHost status
	 * @param allowWholeHost if true, the Robot is allowed to
	 * travel to the whole host where it started from. Otherwise it is only
	 * allowed to travel to URLs below the start URL.
	 */
	public void setAllowWholeHost(boolean allowWholeHost) {
		this.allowWholeHost = allowWholeHost;
	}

	/**
	 * Gets the AllowWholeDomain value.
	 * @return true if the Robot is allowed to travel to the whole 
	 * domain of the start host, false otherwise. 
	 * @see #setAllowWholeDomain(boolean)
	 */
	public boolean getAllowWholeDomain() {
		return allowWholeDomain;
	}

	/**
	 * Sets the AllowWholeDomain status
	 * @param allowWholeDomain if true, the Robot is allowed to travel
	 * to all hosts in the same domain as the starting host. E.g. if you
	 * start at www.apache.org, it is also allowed to travel to
	 * jakarta.apache.org, xml.apache.org ...
	 */
	public void setAllowWholeDomain(boolean allowWholeDomain) {
		this.allowWholeDomain = allowWholeDomain;
	}
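	/*
	 * Illustrative sketch (not part of the original source): typical scope
	 * settings. With the configuration below, a robot started at
	 * http://www.apache.org/ may fetch URLs on www.apache.org and on other hosts
	 * of the apache.org domain (e.g. jakarta.apache.org), but not on unrelated
	 * hosts.
	 *
	 *   robot.setWalkToOtherHosts(false);
	 *   robot.setAllowWholeHost(true);
	 *   robot.setAllowWholeDomain(true);
	 */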

	/**
	 * Gets the state of flexible host checking (enabled or disabled).
	 *
	 * To find out if a new URL is on the same host, the robot usually
	 * compares the host parts of both URLs. Some web servers have an inconsistent
	 * addressing scheme and use both the hostnames www.domain.com and domain.com.
	 * With flexible host check enabled, the robot will consider both
	 * hosts as equal.
	 *
	 * @return true, if flexible host checking is enabled
	 */
	public boolean getFlexibleHostCheck() {
		return flexibleHostCheck;
	}

	/**
	 * Defines if the host test should be more flexible.
	 *
	 * To find out if a new URL is on the same host, the robot usually
	 * compares the host parts of both URLs. Some web servers have an inconsistent
	 * addressing scheme and use both the hostnames www.domain.com and domain.com.
	 * With flexible host check enabled, the robot will consider both
	 * hosts as equal.
	 *
	 * @param flexibleHostCheck set this true, to enable flexible host checking
	 * (disabled by default)
	 */
	public void setFlexibleHostCheck(boolean flexibleHostCheck) {
		this.flexibleHostCheck = flexibleHostCheck;
	}
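	/*
	 * Illustrative sketch (not part of the original source): with flexible host
	 * checking enabled, a crawl started at http://domain.com/ will also accept
	 * links pointing to http://www.domain.com/, because both names are treated
	 * as the same host.
	 *
	 *   robot.setFlexibleHostCheck(true);
	 */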

	/**
	 * Gets the AllowCaching value.
	 * @return true if the Robot is allowed to cache documents in the
	 * docManager
	 * @see #setAllowCaching(boolean)
	 */
	public boolean getAllowCaching() {
		return allowCaching;
	}

	/**
	 * Sets the AllowCaching status
	 *
	 * @param allowCaching if true, the Robot is allowed to use
	 * cached documents. That means it will first try to get the document
	 * from the docManager cache and will only retrieve it if it is
	 * not found in the cache. If the cache returns a document, the robot
	 * will NEVER retrieve it again. Therefore, expiration mechanisms have
	 * to be included in the HttpDocManager method retrieveFromCache.
	 * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
	 */
	public void setAllowCaching(boolean allowCaching) {
		this.allowCaching = allowCaching;
	}

	/**
	 * @return the document manager of this robot
	 * @see HttpDocManager
	 */
	public HttpDocManager getDocManager() {
		return docManager;
	}

	/**
	 * Sets the document manager for this robot <br />
	 * Without a document manager, the robot will travel through the web but
	 * will not do anything with the retrieved documents (it simply forgets
	 * them).
	 * A document manager can store them, extract information or 
	 * whatever you like. 
	 * There can be only one document manager, but you are free to combine
	 * functionalities of available document managers in a new object (e.g.
	 * to store the document and extract meta information).
	 * @param docManager
	 */
	public void setDocManager(HttpDocManager docManager) {
		this.docManager = docManager;
	}
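	/*
	 * Illustrative sketch (not part of the original source): wiring in a document
	 * manager. MyDocManager is a hypothetical class implementing the
	 * HttpDocManager interface; it could store each retrieved document, build an
	 * index, or extract metadata. With allowCaching enabled, documents found in
	 * the manager's cache are not retrieved again.
	 *
	 *   HttpDocManager manager = new MyDocManager();  // hypothetical implementation
	 *   robot.setDocManager(manager);
	 *   robot.setAllowCaching(true);
	 */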

	/**
	 * Sets the CookieManager used by the HttpTool
	 * By default a MemoryCookieManager will be used, but you can
	 * use this method to use your own CookieManager implementation.
	 *
	 * @param cm an object that implements the CookieManager interface
	 */
	public void setCookieManager(CookieManager cm) {
		httpTool.setCookieManager(cm);
	}
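	/*
	 * Illustrative sketch (not part of the original source): replacing the default
	 * cookie handling. The no-argument MemoryCookieManager constructor is an
	 * assumption; any class implementing the CookieManager interface can be used.
	 *
	 *   CookieManager cookies = new MemoryCookieManager();
	 *   robot.setCookieManager(cookies);
	 */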

	/**
	 * Gets the CookieManager used by the HttpTool
	 *
	 * @return the CookieManager that will be used by the HttpTool
	 */
	public CookieManager getCookieManager() {
		return httpTool.getCookieManager();
	}

	/**
	 * Sets the DownloadRuleSet
	 * @param rules the download rule set to use
	 */
	public void setDownloadRuleSet(DownloadRuleSet rules) {
		httpTool.setDownloadRuleSet(rules);
	}

	/**
	 * Sets the URLCheck for this robot
	 * @param check
	 */
	public void setURLCheck(URLCheck check) {
		this.urlCheck = check;
	}

	/** 
	 *  sets a proxy to use 
	 *  @param proxyDescr the Proxy definition in the format host:port
	 */
	public void setProxy(String proxyDescr) throws HttpException {
		httpTool.setProxy(proxyDescr);
	}
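	/*
	 * Illustrative sketch (not part of the original source; the proxy address is
	 * an assumption). The proxy is given in the host:port format described above,
	 * and setProxy may throw an HttpException for an invalid value.
	 *
	 *   robot.setProxy("proxy.example.com:8080");
	 */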

	/**
	 * @return the current proxy setting in the format host:port
	 */
	public String getProxy() {
		return httpTool.getProxy();
	}

	/**
	 * @return the Referer setting for the first HTTP request
	 */
	public String getStartReferer() {
		return startReferer;
	}

	/**
	 * sets the Referer setting for the first HTTP request
	 * @param startReferer a URL (e.g. http://www.matuschek.net)
	 */
	public void setStartReferer(String startReferer) {
		this.startReferer = startReferer;
	}