webrobot.java
}
/**
* Should the robots.txt Robot Exclusion protocol be ignored?
* @param ignoreRobotsTxt if set to true, the robot will ignore
* the settings of the /robots.txt file on the web server.
* <b>Know what you are doing if you change this setting!</b>
*/
public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
this.ignoreRobotsTxt = ignoreRobotsTxt; // keep the field in sync for getIgnoreRobotsTxt()
robCheck.setIgnore(ignoreRobotsTxt);
}
/**
* @return the sleep time setting in seconds
*/
public int getSleepTime() {
return sleepTime;
}
/**
* Sets the sleep time.<br />
* After every retrieved document the robot will wait this long
* before getting the next document. This limits the load
* on the server.
* @param sleepTime wait time in seconds
*/
public void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
/**
* Sets the From: HTTP header.<br />
* This should be a valid email address. It is not needed for the robot,
* but you should set it, because it allows the administrator of the
* web server to contact you if the robot is doing things they don't want.
* @param fromAddress an RFC 822 email address
*/
public void setFromAddress(String fromAddress) {
httpTool.setFromAddress(fromAddress);
}
/**
* Sets the list of form handlers.
* @param handlers a Vector of form handler objects
* @see net.matuschek.html.FormHandler
*/
public void setFormHandlers(Vector handlers) {
formFiller.setFormHandlers(handlers);
// keep the flag consistent if the handler list is replaced or cleared
hasFormHandlers = (handlers != null && handlers.size() > 0);
}
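/*
* Usage sketch for form handlers. The handler itself ("myFormHandler")
* is hypothetical and would be constructed and configured through the
* net.matuschek.html.FormHandler API, which is not shown in this file:
*
*   Vector handlers = new Vector();
*   handlers.add(myFormHandler); // hypothetical, pre-configured handler
*   robot.setFormHandlers(handlers);
*/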
/**
* @return the list of form handlers
* @see net.matuschek.html.FormHandler
*/
public Vector getFormHandlers() {
return formFiller.getFormHandlers();
}
/**
* Gets the name of the "User-Agent" header that the robot will use
* @return the user agent name
*/
public String getAgentName() {
if (httpTool != null) {
return httpTool.getAgentName();
} else {
return null;
}
}
/**
* Sets the User-Agent name for this robot and re-creates the
* robots.txt checker under that name.
* @param name a name for this robot
* (e.g. "Mozilla 4.0 (compatible; Robot)")
*/
public void setAgentName(String name) {
httpTool.setAgentName(name);
// robCheck = new NoRobots(ROBOT_NAME, httpTool);
robCheck = new NoRobots(name, httpTool);
}
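/*
* Typical polite-robot configuration, using only setters defined in
* this class; the "robot" variable is assumed to be an already
* constructed WebRobot:
*
*   robot.setAgentName("Mozilla 4.0 (compatible; Robot)");
*   robot.setFromAddress("admin@example.org"); // lets server admins reach you
*   robot.setSleepTime(2);                     // 2 seconds between requests
*   robot.setTimeout(30);                      // abort a download after 30 s
*/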
/**
* Gets the timeout of the used HttpTool for reading data, in seconds.
* @return the socket timeout value, or -1 if no HttpTool is set
* @see #setTimeout(int)
*/
public int getTimeout() {
if (httpTool != null) {
return httpTool.getTimeout();
} else {
return -1;
}
}
/**
* Sets the timeout for getting data. If the HttpTool cannot read data
* from a remote web server within this number of seconds, it will stop
* the download of the current file.
* @param timeout timeout in seconds
*/
public void setTimeout(int timeout) {
httpTool.setTimeout(timeout);
}
/**
* Gets the NTLM authorization of the robot.
* @return the NTLMAuthorization, or null if no HttpTool is set
*/
public NTLMAuthorization getNtlmAuthorization() {
if (httpTool != null) {
return httpTool.getNtlmAuthorization();
} else {
return null;
}
}
/**
* Sets an NTLM authorization for this robot.
* @param ntlmAuthorization the NTLM authorization for this robot
*/
public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
httpTool.setNtlmAuthorization(ntlmAuthorization);
}
/**
* Gets the setting of the IgnoreRobotsTxt property
* @return true if robots.txt will be ignored, false otherwise
*/
public boolean getIgnoreRobotsTxt() {
return ignoreRobotsTxt;
}
/**
* Gets a vector of URLs that can be visited more than once.
* @return a vector containing URLs formatted as Strings
*/
public Vector getVisitMany() {
return visitMany;
}
/** Sets the vector of URLs that can be visited more than once. */
public void setVisitMany(Vector visitMany) {
this.visitMany = visitMany;
}
/** Sets the callback object of the used HttpTool. */
public void setHttpToolCallback(HttpToolCallback callback) {
httpTool.setCallback(callback);
}
/** @return the WebRobotCallback of this robot */
public WebRobotCallback getWebRobotCallback() {
return webRobotCallback;
}
/** Sets the WebRobotCallback that will be informed about the robot's progress. */
public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
this.webRobotCallback = webRobotCallback;
}
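/*
* Sketch of a progress-monitoring callback. Only the three methods
* actually invoked by this class are shown (see walkTree() and
* finishThreads()); their signatures are inferred from those call
* sites, and the WebRobotCallback interface may declare more:
*
*   robot.setWebRobotCallback(new WebRobotCallback() {
*       public void webRobotSleeping(boolean sleeping) {
*           System.out.println("sleeping: " + sleeping);
*       }
*       public void webRobotUpdateQueueStatus(int queueSize) {
*           System.out.println("queue: " + queueSize);
*       }
*       public void webRobotDone() {
*           System.out.println("done");
*       }
*   });
*/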
/**
* Sets the sleep status for this robot. If a WebRobot is set to sleep
* after run() has been started, it will pause after retrieving the
* current document until setSleep(false) is called.
*/
public void setSleep(boolean sleep) {
this.sleep = sleep;
}
/**
* Is the robot sleeping?
* @return true if the robot is sleeping, false otherwise
*/
public boolean isSleeping() {
return this.sleep;
}
/**
* Sets the list of allowed URLs.
* @param allowed a Vector containing Strings. A URL is allowed
* if it begins with one of the strings in this vector.
*/
public void setAllowedURLs(Vector allowed) {
this.allowedURLs = allowed;
}
/**
* Gets the list of allowed URLs
* @return a Vector containing Strings
* @see #setAllowedURLs(Vector)
*/
public Vector getAllowedURLs() {
return this.allowedURLs;
}
/**
* Enables/disables cookies.
* @param enable if true, HTTP cookies will be enabled; if false,
* the robot will not use cookies
*/
public void setEnableCookies(boolean enable) {
httpTool.setEnableCookies(enable);
}
/**
* Get the status of the cookie engine
* @return true, if HTTP cookies are enabled, false otherwise
*/
public boolean getEnableCookies() {
return httpTool.getEnableCookies();
}
/**
* Sets the maximum age of documents to retrieve, in seconds.
* @param maxAge the maximum document age in seconds;
* a negative value means no limit.
*/
public void setMaxDocumentAge(long maxAge) {
this.maxDocumentAge = maxAge;
}
/**
* Gets the maximum age of documents to retrieve
* @return maximum document age (in seconds), negative value means
* no limit.
*/
public long getMaxDocumentAge() {
return this.maxDocumentAge;
}
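/*
* Example: accept cached copies up to one day old and re-fetch anything
* older; a negative value disables the age check entirely:
*
*   robot.setMaxDocumentAge(24L * 60 * 60); // one day, in seconds
*   robot.setMaxDocumentAge(-1);            // or: no limit
*/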
/**
* Sets a FilterChain. If the WebRobot uses a FilterChain it will
* pass every retrieved document through this FilterChain before
* storing it.
*
* @param filters a FilterChain to use for filtering HttpDocs
*/
public void setFilters(FilterChain filters) {
this.filters = filters;
}
/**
* Delete all cookies
*/
public void clearCookies() {
httpTool.clearCookies();
}
/**
* thread run() method, simply calls work()
* @see #work()
*/
public void run() {
work();
}
/**
* Does the actual work: travels through the web using the configured
* parameters and retrieves documents.
*/
public void work() {
RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
todo.add(task);
walkTree();
// ok, we did it, clean up dynamic data (the visited vector)
cleanUp();
log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh + " NoRefresh=" + countNoRefresh);
}
/**
* Stops the current robot run.
* Note that this will not abort the current download; the robot stops
* after the current download has finished.
*/
public void stopRobot() {
stopIt = true;
}
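/*
* Since this class provides run(), it can be driven from its own thread
* and stopped gracefully from another one. A minimal sketch (how the
* start URL and depth are configured is not shown in this fragment):
*
*   Thread spider = new Thread(robot);
*   spider.start();
*   // ... later, from another thread:
*   robot.stopRobot(); // stops after the current download finishes
*   spider.join();
*/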
/**
* Holds information about the memory status.
* @see #handleMemoryError(OutOfMemoryError)
*/
private int memoryLevel = 0;
/** Can new tasks be added? (may depend on memoryLevel) */
protected boolean activatedNewTasks = true;
/** Are visited URLs collected? (may depend on memoryLevel) */
protected boolean activatedUrlHistory = true;
/** Are visited contents collected? (may depend on memoryLevel) */
protected boolean activatedContentHistory = true;
/** memory buffer of 200 KB to be freed in case of urgent memory needs */
private byte[] memoryBuffer = new byte[200 * 1024];
/**
* Main work loop: processes the todo list until it is empty
* or stopRobot() has been called.
*/
public void walkTree() {
while ((todo.size() > 0) && (!stopIt)) {
RobotTask task;
synchronized(visited) {
task = todo.removeFirst();
if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
log.debug("already visited: " + task.getUrl());
continue;
}
if (activatedUrlHistory) {
visited.add(task);
}
}
boolean repeat = true;
while (repeat) {
try {
retrieveURL(task);
repeat = false;
} catch (OutOfMemoryError memoryError) {
handleMemoryError(memoryError);
}
}
// sleep, if sleep is set to true
while (sleep) {
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(true);
}
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
// ignored; re-check the sleep flag
}
}
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(false);
}
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotUpdateQueueStatus(todo.size());
}
spawnThread();
}
// callback
if (webRobotCallback != null) {
finishThreads();
}
}
/**
* Implements OutOfMemoryError handling strategies.
* The action taken depends on memoryLevel.
* @param memoryError the error that triggered the handling
* @throws OutOfMemoryError if no recovery strategy is left
*/
protected void handleMemoryError(OutOfMemoryError memoryError)
throws OutOfMemoryError {
memoryLevel++;
log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
switch (memoryLevel) {
case 1:
// don't remember visited URLs and contents any more
// and try it again
visited.clear(); activatedUrlHistory = false;
content2UrlMap.clear(); activatedContentHistory = false;
System.gc();
break;
case 2:
// stop adding new Tasks, just process todo-list.
// free memory buffer
// and try it again
activatedNewTasks = false;
memoryBuffer = null;
System.gc();
break;
case 3:
// there is nothing we can do any more.
// throw exception to stop robot
throw memoryError;
default :
// Should never be reached.
if (memoryBuffer != null) {
// avoid removal of memoryBuffer by compiler
System.err.println(memoryBuffer[0]);
}
throw memoryError;
}
}
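/*
* Note: the memoryBuffer field above implements the "memory ballast"
* pattern. A generic sketch of the idea, independent of this class
* (crawl() is a hypothetical memory-hungry operation):
*
*   byte[] ballast = new byte[200 * 1024]; // reserve headroom up front
*   try {
*       crawl();
*   } catch (OutOfMemoryError e) {
*       ballast = null;                    // release the reserve...
*       System.gc();                       // ...so recovery code has room to run
*   }
*/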
/**
* Calls webRobotDone and finishes the docManager if
* executed in the main thread.
*/
protected void finishThreads() {
webRobotCallback.webRobotDone();
if (docManager != null) {
docManager.finish();
}
}
/**
* Start subThreads for spidering.
* WARNING: Should only be implemented and used for local
* spidering purposes!
*/
protected synchronized void spawnThread() {
}
/** counter for calls of retrieveURL */
protected int iteration = 0;
/**
* Retrieves the next URL, saves it, extracts all included links and
* adds those links to the task list.
* @param task the task to retrieve; the method does nothing if this is null
*/
public void retrieveURL(RobotTask task) {
if (task == null) {
log.debug("Empty task found, ignoring");
return;
}
long now = System.currentTimeMillis();
updateProgressInfo();
URL u = task.getUrl();
String urlString = u.toString();
String referer = task.getReferer();
int depth = task.getMaxDepth();
if (depth < 0) {
log.info("Max search depth reached");
return;
}
// we may need this additional check even if we
// tested it during adding to the tasks list
if (!isAllowed(u)) {
log.info("Url '" + u + "' filtered out.");
return;
}
if (u.getFile().equals("")) {
try {
urlString = urlString + "/";
u = new URL(urlString);
// fix for double retrieved files
task.setUrl(u);
} catch (MalformedURLException e) {
log.error("URL not well formed: " + e.toString());
// use exception handler to handle exception
exceptionHandler.handleException(this, u, e);
return;
}
}
log.info("retrieving " + urlString);
httpTool.setReferer(referer);
HttpDoc doc = null;
Vector links = null;
boolean cached = false;