package net.matuschek.spider;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;
import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;
import org.apache.log4j.Category;
import org.w3c.dom.Element;
/**
* This class implements a web robot that does a search through
* the web starting from a given start document up to a given
* search depth.
*
* @author Daniel Matuschek / Oliver Schmidt
* @version $Revision: 1.35 $
*/
public class WebRobot implements Runnable, Cloneable {
/** the name of the robot */
private final static String ROBOT_NAME = "JoBo";
/** the default agent name */
private final static String AGENT_NAME =
ROBOT_NAME+"/1.4 (http://www.matuschek.net/jobo.html)";
/** the robot exception handler*/
protected RobotExceptionHandler exceptionHandler =
new DefaultRobotExceptionHandler();
/** default maximal search depth */
private final static int DEFAULT_DEPTH = 10;
/** the URL where the robot's walk starts from */
protected URL startURL = null;
/** the host and directory where retrieval started from */
protected String startDir = "";
/** maximal search depth */
protected int maxDepth = DEFAULT_DEPTH;
/** is it allowed to walk to other hosts than the starting host? */
protected boolean walkToOtherHosts = false;
/** DocManager will store or process retrieved documents */
protected HttpDocManager docManager;
/** HttpTool will be used to retrieve documents from a web server */
protected HttpTool httpTool = new HttpTool();
/** Log4J category for logging */
protected Category log;
/** Referer used to retrieve the first document */
protected String startReferer = "-";
/** test for robots.txt */
protected NoRobots robCheck;
/** current tasks */
protected TaskList todo = null;
/** a list of all URLs we got already */
protected TaskList visited = null;
/** ignore settings in /robots.txt? */
protected boolean ignoreRobotsTxt = false;
/** sleep this number of seconds after every retrieved document */
protected int sleepTime = 1;
/** fill out forms */
protected FormFiller formFiller = new FormFiller();
/** these URLs can be visited more than once */
protected Vector visitMany = new Vector();
/** for callback to the user interface **/
protected WebRobotCallback webRobotCallback = null;
/** should we stop robot operation? **/
protected boolean stopIt = false;
/** to check if it is allowed to travel to a given URL **/
protected URLCheck urlCheck = null;
/** should the robot suspend the current walk() **/
protected boolean sleep;
/** list of allowed URLs (even if walkToOtherHosts is false) **/
protected Vector allowedURLs = new Vector();
/** allow travelling the whole host? */
protected boolean allowWholeHost = true;
/**
* maximum document age in seconds, negative value means
* no limit
*/
protected long maxDocumentAge = -1; // no limit
/**
* allow travelling to all subdomains of the start host?
* @see #setAllowWholeDomain(boolean)
*/
protected boolean allowWholeDomain = true;
/**
* do a more flexible test of whether a new URL is on the same host
* @see #basicURLCheck(URL)
*/
protected boolean flexibleHostCheck = false;
/**
* FilterChain to filter the document before storing it
*/
protected FilterChain filters = null;
/**
* don't retrieve pages again that are already stored in the DocManager
*/
protected boolean allowCaching = true;
/**
* Check for documents with the same content
*/
protected boolean duplicateCheck = false;
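/**
* default expected count of documents; used by the constructor to size
* internal data structures (the concrete value here is an assumption)
*/
private final static int DEFAULT_EXPECTED_DOCUMENT_COUNT = 10000;
/** expected count of documents */
protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
/** maps document content to the URL it was first retrieved from (for the duplicate check) */
protected HashMap content2UrlMap;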
/**
* initializes the robot with the default implementation
* of the TaskList interface
*
* @param expectedDocumentCount the expected number of documents
*/
public WebRobot(int expectedDocumentCount) {
log = Category.getInstance(getClass().getName());
content2UrlMap = new HashMap(expectedDocumentCount);
registerVisitedList(new HashedMemoryTaskList(false,
expectedDocumentCount));
registerToDoList(new HashedMemoryTaskList(true,
expectedDocumentCount));
this.expectedDocumentCount = expectedDocumentCount;
this.setAgentName(AGENT_NAME);
}
/**
* initializes the robot with the default implementation of the TaskList
* interface
*/
public WebRobot() {
this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
}
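/*
* A minimal usage sketch (myDocManager stands for any HttpDocManager
* implementation; it is an assumption, not a class of this package):
*
*   WebRobot robot = new WebRobot();
*   robot.setStartURL(new URL("http://www.matuschek.net/"));
*   robot.setMaxDepth(2);
*   robot.setDocManager(myDocManager);
*   robot.run(); // or hand the robot to a new Thread, it is a Runnable
*/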
/**
* Sets the implementation class for the backend task list storage.
* WebRobot uses the TaskList interface to store future tasks.
*
* If you want to use your own TaskList implementation, just call
* this method.
*
* @param todo TaskList to be used for the "to do" list
*/
public void registerToDoList(TaskList todo) {
this.todo = todo;
}
/**
* Sets the implementation class for the backend task list storage.
* WebRobot uses the TaskList interface to store URLs that have
* been retrieved before.
*
* If you want to use your own TaskList implementation, just call
* this method.
*
* @param visited TaskList to be used for the list of visited URLs
*/
public void registerVisitedList(TaskList visited) {
this.visited = visited;
}
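/*
* Sketch of plugging in a custom backend (DiskTaskList is a hypothetical
* TaskList implementation, not part of this package):
*
*   WebRobot robot = new WebRobot();
*   robot.registerToDoList(new DiskTaskList());
*   robot.registerVisitedList(new DiskTaskList());
*/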
/**
* @return the start URL for this robot
*/
public URL getStartURL() {
return startURL;
}
/**
* Sets the start URL for this robot
* @param startURL the start URL
*/
public void setStartURL(URL startURL) {
String path = startURL.getPath();
this.startURL = startURL;
// is it a directory ?
if (path.endsWith("/")) {
this.startDir = startURL.getHost() + path;
} else {
int pos = path.lastIndexOf("/");
if (pos < 0) {
// this happens for URLs without a path
this.startDir = startURL.getHost() + "/";
} else {
this.startDir = startURL.getHost() + path.substring(0, pos + 1);
}
}
}
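/*
* Illustration of the startDir values derived above:
*   http://host.example/docs/index.html -> host.example/docs/
*   http://host.example/docs/           -> host.example/docs/
*   http://host.example                 -> host.example/
*/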
/**
* @return the maximal allowed search depth
*/
public int getMaxDepth() {
return maxDepth;
}
/**
* sets the maximal search depth
* @param maxDepth the maximal search depth
*/
public void setMaxDepth(int maxDepth) {
this.maxDepth = maxDepth;
}
/**
* Gets the value of bandwidth of the used HttpTool
* @return value of bandwidth.
*/
public int getBandwidth() {
return httpTool.getBandwidth();
}
/**
* Sets the value of bandwidth of the used HttpTool
* @param bandwidth Value to assign to bandwidth.
*/
public void setBandwidth(int bandwidth) {
httpTool.setBandwidth(bandwidth);
}
/**
* gets the WalkToOtherHost status
* @return true if the Robot is allowed to travel to other
* hosts than the start host, false otherwise
*/
public boolean getWalkToOtherHosts() {
return walkToOtherHosts;
}
/**
* sets the WalkToOtherHosts status
* @param walkToOtherHosts true if the Robot is allowed to travel to other
* hosts than the start host, false otherwise
*/
public void setWalkToOtherHosts(boolean walkToOtherHosts) {
this.walkToOtherHosts = walkToOtherHosts;
}
/**
* gets the AllowWholeHost value
* @return true if the Robot is allowed to travel to the whole
* host where it started from, false otherwise. If false, it is only
* allowed to travel to URLs below the start URL
*/
public boolean getAllowWholeHost() {
return allowWholeHost;
}
/**
* sets the AllowWholeHost status
* @param allowWholeHost if true, the Robot is allowed to
* travel to the whole host where it started from. Otherwise it is only
* allowed to travel to URLs below the start URL.
*/
public void setAllowWholeHost(boolean allowWholeHost) {
this.allowWholeHost = allowWholeHost;
}
/**
* Gets the AllowWholeDomain value.
* @return true if the Robot is allowed to travel to the whole
* domain of the start host, false otherwise.
* @see #setAllowWholeDomain(boolean)
*/
public boolean getAllowWholeDomain() {
return allowWholeDomain;
}
/**
* Sets the AllowWholeDomain status
* @param allowWholeDomain if true, the Robot is allowed to travel
* to all hosts in the same domain as the starting host. E.g. if you
* start at www.apache.org, it is also allowed to travel to
* jakarta.apache.org, xml.apache.org ...
*/
public void setAllowWholeDomain(boolean allowWholeDomain) {
this.allowWholeDomain = allowWholeDomain;
}
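/*
* How the scope settings interact, summarizing the setter contracts above:
*   walkToOtherHosts = true -> any host may be visited
*   allowWholeDomain = true -> all hosts in the start domain, e.g.
*                              jakarta.apache.org when starting at www.apache.org
*   allowWholeHost = true   -> all URLs on the start host
*   all of them false       -> only URLs below the start URL
*/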
/**
* Gets the state of flexible host checking (enabled or disabled).
*
* To find out if a new URL is on the same host, the robot usually
* compares the host parts of both. Some web servers have an inconsistent
* addressing scheme and use both www.domain.com and domain.com as hostnames.
* With flexible host check enabled, the robot will consider both
* hosts as equal.
*
* @return true, if flexible host checking is enabled
*/
public boolean getFlexibleHostCheck() {
return flexibleHostCheck;
}
/**
* Defines if the host test should be more flexible.
*
* To find out if a new URL is on the same host, the robot usually
* compares the host parts of both. Some web servers have an inconsistent
* addressing scheme and use both www.domain.com and domain.com as hostnames.
* With flexible host check enabled, the robot will consider both
* hosts as equal.
*
* @param flexibleHostCheck set this to true to enable flexible host checking
* (disabled by default)
*/
public void setFlexibleHostCheck(boolean flexibleHostCheck) {
this.flexibleHostCheck = flexibleHostCheck;
}
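/*
* Illustration: with flexible host checking enabled, the robot treats
* www.domain.com and domain.com as the same host (see basicURLCheck(URL)
* for the actual comparison).
*/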
/**
* Gets the AllowCaching value.
* @return true if the Robot is allowed to cache documents in the
* docManager
* @see #setAllowCaching(boolean)
*/
public boolean getAllowCaching() {
return allowCaching;
}
/**
* Sets the AllowCaching status
*
* @param allowCaching if true, the Robot is allowed to use
* cached documents. That means it will first try to get the document
* from the docManager cache and will only retrieve it if it is
* not found in the cache. If the cache returns a document, the robot
* will NEVER retrieve it again. Therefore, expiration mechanisms have
* to be included in the HttpDocManager method retrieveFromCache.
* @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
*/
public void setAllowCaching(boolean allowCaching) {
this.allowCaching = allowCaching;
}
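/*
* Since a cache hit is never retrieved again, expiration belongs into the
* HttpDocManager. A sketch of such a lookup (lookup() and isExpired() are
* hypothetical helpers, not part of the interface):
*
*   public HttpDoc retrieveFromCache(java.net.URL url) {
*       HttpDoc doc = lookup(url);          // hypothetical cache lookup
*       if (doc == null || isExpired(doc)) {
*           return null;                    // forces a fresh download
*       }
*       return doc;
*   }
*/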
/**
* @return the document manager of this robot
* @see HttpDocManager
*/
public HttpDocManager getDocManager() {
return docManager;
}
/**
* Sets the document manager for this robot <br />
* Without a document manager, the robot will travel through the web but
* won't do anything with the retrieved documents (it simply forgets
* them).
* A document manager can store them, extract information or
* whatever you like.
* There can be only one document manager, but you are free to combine
* functionalities of available document managers in a new object (e.g.
* to store the document and extract meta information).
* @param docManager the document manager to use
*/
public void setDocManager(HttpDocManager docManager) {
this.docManager = docManager;
}
/**
* Sets the CookieManager used by the HttpTool
* By default a MemoryCookieManager will be used, but you can
* use this method to use your own CookieManager implementation.
*
* @param cm an object that implements the CookieManager interface
*/
public void setCookieManager(CookieManager cm) {
httpTool.setCookieManager(cm);
}
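/*
* Example of replacing the default cookie handling (MyCookieManager is a
* hypothetical CookieManager implementation):
*
*   robot.setCookieManager(new MyCookieManager());
*/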
/**
* Gets the CookieManager used by the HttpTool
*
* @return the CookieManager that will be used by the HttpTool
*/
public CookieManager getCookieManager() {
return httpTool.getCookieManager();
}
/**
* Sets the DownloadRuleSet
* @param rules the download rule set to use
*/
public void setDownloadRuleSet(DownloadRuleSet rules) {
httpTool.setDownloadRuleSet(rules);
}
/**
* Sets the URLCheck for this robot
* @param check the URLCheck to use
*/
public void setURLCheck(URLCheck check) {
this.urlCheck = check;
}
/**
* sets a proxy to use
* @param proxyDescr the Proxy definition in the format host:port
*/
public void setProxy(String proxyDescr) throws HttpException {
httpTool.setProxy(proxyDescr);
}
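/*
* Example, using the host:port format described above (the proxy address
* is hypothetical):
*
*   robot.setProxy("proxy.example.com:3128");
*/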
/**
* @return the current proxy setting in the format host:port
*/
public String getProxy() {
return httpTool.getProxy();
}
/**
* @return the Referer setting for the first HTTP request
*/
public String getStartReferer() {
return startReferer;
}
/**
* sets the Referer setting for the first HTTP request
* @param startReferer a URL (e.g. http://www.matuschek.net)
*/
public void setStartReferer(String startReferer) {
this.startReferer = startReferer;
}