📄 webrobot.java
字号:
// look in the cache first, but only for static pages
boolean reScan = true;
if ((docManager != null && allowCaching)
&& (task.getMethod() == HttpConstants.GET)
&& (task.getParamString() == null)) {
doc = docManager.retrieveFromCache(u);
/* if (doc != null) {
try {
links = ((UrlCollector) docManager).retrieveLinks(doc);
} catch (IOException e) {
log.info("Could not get links for " + u + ": " + e.getMessage());
links = null;
}
}*/
if (doc != null) {
countCache++;
long lastRetrieved = doc.getDateAsMilliSeconds();
double ageInSeconds = (now - lastRetrieved) / 1000;
if (ageInSeconds < 0) {
log.warn("DocumentAge < 0!");
}
reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
if (reScan) {
long lastModified = doc.getLastModifiedAsMilliSeconds();
Date lastModifiedDate = new Date(lastModified);
httpTool.setIfModifiedSince(lastModifiedDate);
}
} else {
httpTool.setIfModifiedSince(null);
}
}
// if not found in cache, retrieve from the web page
if (reScan) {
HttpDoc newDoc;
boolean error = false;
try {
if (u.getProtocol().equalsIgnoreCase("file")) {
// retrieve from file
newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
} else {
// retrieve from Web
newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
if (newDoc != null) {
newDoc.setDate(now);
}
sleepNow();
}
if (newDoc!= null && !newDoc.isNotModified()) {
if (!(newDoc.isOk() || newDoc.isRedirect())) {
error = true;
}
} else {
// (newDoc == null || newDoc.isNotModified()) && doc != null
// -> Not modified
// -> refresh time stamp
if (doc != null) {
doc.setDate(now);
doc.setCached(false);
newDoc = null;
}
}
} catch (HttpException hex) {
error = true; newDoc = null;
}
if (error) {
int retry = task.retry();
if (retry <= maxRetries) {
synchronized(visited) {
todo.add(task);
visited.remove(task);
}
log.info("Adding " + u + " for retry no. " + retry);
return;
} else {
doc = docManager.retrieveFromCache(u);
if (doc == null) {
log.warn("Unsuccessfull retries for " + u);
return;
} else {
long docDate = doc.getDateAsMilliSeconds();
long age = (now - docDate);
age /= 1000;
if (expirationAge < 0 || age < expirationAge) {
newDoc = doc;
cached = true;
log.info("Cached document not expired: " + u);
} else {
log.warn("Cached document expired: " + u);
docManager.removeDocument(u);
return;
}
}
}
}
if (newDoc != null) {
countWeb++;
doc = newDoc;
links = null; // force recalculation of links
countRefresh++;
} else {
cached = true;
countNoRefresh++;
}
} else {
cached = true;
log.debug("Page " + u + " retrieved from cache");
}
// Add it to the visited vector
// needs to be synchronized with todo-list
// visited.add(task);
// got a NULL document, that doc was not retrieved
// usually, it was not downloaded because a rule didn't allow
// to download it
if (doc == null) {
log.info("not downloaded " + u);
return;
}
// Duplicate check
String duplicate=null;
if (duplicateCheck) {
duplicate = getContentVisitedURL(doc);
if (duplicate != null) {
log.info("URLs with same content found: " + urlString + " = " + duplicate);
} else {
try {
duplicate = docManager.findDuplicate(doc);
if (duplicate != null) {
log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
}
} catch (IOException e) {
e.printStackTrace();
}
}
if (duplicate != null) {
String pureDuplicate = removeParameters(duplicate);
String pureUrl = removeParameters(urlString);
if (!pureUrl.equals(pureDuplicate) && !cached) {
// different url not yet stored -> store it
try {
// retrieve links from original
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
if (linksDoc != null) {
doc.setLinks(linksDoc.getLinks());
}
docManager.storeDocument(doc);
} catch (Exception e) {
e.printStackTrace();
}
}
RobotTask newTask;
try {
newTask = createRobotTask(new URL(duplicate), depth, referer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
addTask(newTask);
}
} catch (MalformedURLException e) {
e.printStackTrace(); // Can't happen
}
return;
}
}
// was it an UnAuthorized document ?
if (doc.isUnauthorized()) {
log.info("got HTTP Unauthorized for URL " + u);
}
if (doc.isOk() || cached) {
// callback
if (webRobotCallback != null) {
int contentLength=0;
if (doc.getContent() != null) { contentLength=doc.getContent().length; }
webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
}
// extract links
try {
if (doc.isHTML() && (depth > 0)) {
// solving encoding problem
// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
HtmlDocument htmlDoc = null;
HttpHeader contentTypeHeader = doc.getHeader("Content-type");
if (contentTypeHeader != null) {
String contentType = contentTypeHeader.getValue();
int index = contentType.toLowerCase().indexOf("charset=");
if (index > 0) {
htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
// add links
// this depth-check is critical!
// otherwise far too many RobotTasks will be created
// this will cause a premature OutOfMemoryException!
if (depth > 0) {
if (duplicate != null) {
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
doc.setLinks(linksDoc.getLinks());
} else if (cached) {
}
if (links == null) {
links = htmlDoc.getLinks();
doc.setLinks(links);
}
if (duplicate == null) {
HashSet checkedLinks = new HashSet();
for (int i = 0; i < links.size(); i++) {
URL link = (URL) links.elementAt(i);
log.info("Link: "+link);
// check already here for duplicate links to avoid expensive
// creation of RobotTasks
if (!checkedLinks.contains(link)) {
checkedLinks.add(link);
String myReferer = u.toString();
if (u.getUserInfo() != null) {
// remove userinfo from referer
int endindex = myReferer.indexOf("@")+1;
myReferer = "http://"+ myReferer.substring(endindex);
}
RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
// bad workaround to retrieve images first
if (newTask.urlString.endsWith(".jpg")) {
addTaskAtStart(newTask);
} else {
addTask(newTask);
}
}
}
}
}
}
if (hasFormHandlers) {
// add forms
Vector forms = htmlDoc.getElements("form");
for (int i = 0; i < forms.size(); i++) {
ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
if (eurl != null) {
RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
newTask.setParamString(eurl.getParams());
newTask.setMethod(eurl.getRequestMethod());
addTask(newTask);
}
}
}
}
// catch any occuring error to keep on processing
} catch (OutOfMemoryError e) {
throw e;
} catch (Throwable e){
log.error("Unexpected error while extraction links from url '" + u + "':"+e);
e.printStackTrace();
// continue processing
}
// filter and store the document
if ((docManager != null)) {
try {
if (filters != null) {
doc = filters.process(doc);
} else {
log.debug("No filters defined");
}
if (isProcessingAllowed(doc)) {
docManager.processDocument(doc);
} else {
String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
doc.setContent("Not for indexing".getBytes());
doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
}
try {
docManager.storeDocument(doc);
} catch (Exception e) {
log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
}
if (activatedContentHistory && duplicate==null) {
setContentVisitedURL(doc, urlString);
}
} catch (DocManagerException e1) {
log.error("could not process document: " + e1.getMessage());
exceptionHandler.handleException(this, u, e1);
} catch (FilterException e2) {
log.error(e2.getMessage());
}
}
} else {
// it was NOT a 200 return code !
if (doc.isRedirect()) {
String ref = doc.getLocation();
log.info("Got redirect to " + ref);
try {
URL u2 = new URL(u, ref);
// is it on another host ?
// On a redirect, browsers use the old Referer instead of the
// URL that got this redirect
// Therefore we do not use u.toString as Referer but the old Referer
RobotTask newTask = createRobotTask(u2, depth - 1, referer);
// it will be inserted at the beginning of the vector !
addTaskAtStart(newTask);
} catch (MalformedURLException e) {
// ignore this URL
}
// handle other values
} else if (doc.isNotFound()) {
// the document was not found
exceptionHandler.handleException(this, u, new HttpException("Document not found"));
} else if (doc.isUnauthorized()) {
// the document was not found
exceptionHandler.handleException(
this,
u,
new HttpException("No authorization for the document."));
} else {
// an other error occured.
exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code "+doc.getHttpCode()+")."));
}
}
}
/**
 * Inform about spidering progress.
 * <p>
 * This implementation has an empty body and therefore reports nothing.
 * According to the original note, an implementation may use iteration,
 * startTime, countCache, countWeb, countRefresh and countNoRefresh
 * to build its progress report.
 * NOTE(review): presumably meant to be overridden by subclasses - confirm.
 */
public void updateProgressInfo() {
// intentionally empty: no progress reporting in this implementation
}
/**
 * Pauses the calling thread for {@code sleepTime} seconds.
 * <p>
 * Nothing happens when {@code sleepTime} is not positive. Otherwise the
 * pause is executed while holding this object's monitor, and the registered
 * {@code webRobotCallback} (if any) is notified with
 * {@code webRobotSleeping(true)} before the pause and
 * {@code webRobotSleeping(false)} after it, even when the pause ends
 * abnormally.
 * <p>
 * If the thread is interrupted while sleeping, the pause ends early and the
 * thread's interrupt status is restored, so code further up the call stack
 * can still detect the interruption.
 */
public void sleepNow() {
    if (sleepTime <= 0) {
        return;
    }
    synchronized (this) {
        if (webRobotCallback != null) {
            webRobotCallback.webRobotSleeping(true);
        }
        try {
            // 1000L forces long arithmetic so that a large sleepTime
            // cannot overflow while being converted to milliseconds
            Thread.sleep(sleepTime * 1000L);
        } catch (InterruptedException e) {
            // do not swallow the interruption: restore the flag for callers
            Thread.currentThread().interrupt();
        } finally {
            // always report the end of the pause, whatever ended it
            if (webRobotCallback != null) {
                webRobotCallback.webRobotSleeping(false);
            }
        }
    }
}
/**
 * Retrieves a file from the local file system and wraps it in an HttpDoc.
 * <p>
 * The file name is derived from the URL: without a host, one leading
 * "/" or "\" is removed from the URL's file part; with a host, the name
 * becomes "//host" followed by the file part. The document's HTTP code is
 * set to "not found" when the file does not exist, to "ok" (with content)
 * when the file is newer than {@code ifModifiedSince}, and to
 * "not modified" (without content) otherwise.
 *
 * @param url the url of the file to retrieve
 * @param ifModifiedSince only load the content if the file was modified
 *        after this date; {@code null} is treated as time 0
 * @return HttpDoc containing the content and mime type; for a missing file
 *         the document is returned right after its HTTP code is set
 *         (no last-modified, date or URL)
 * @throws HttpException if any exception occurs while resolving or reading
 *         the file; the original exception is attached as the cause
 */
private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
    HttpDoc doc = new HttpDoc();
    try {
        String host = url.getHost();
        String filename = url.getFile();
        if ((host == null) || (host.equals(""))) {
            // local file
            // remove leading / or \
            // NOTE(review): this turns an absolute path such as /home/a.html
            // into the relative path home/a.html; it looks intended for
            // Windows-style paths like /C:/dir/a.html - confirm
            if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
                filename = filename.substring(1);
            }
        } else {
            // file on another host: build a //host/path style name
            filename = "//" + host + filename;
        }

        // get the mimetype and put in the http header
        String mimetypestr = getMimeTypeForFilename(filename);
        if (mimetypestr != null) {
            HttpHeader header = new HttpHeader("content-type", mimetypestr);
            doc.addHeader(header);
        }

        // get the content from the file
        File file = new File(filename);
        if (!file.exists()) {
            doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
            return doc;
        }

        long fileLastModified = file.lastModified();
        long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
        if (fileLastModified > ifModifiedSinceTime) {
            // file changed since the given date: load it
            byte[] content = readFileToByteArray(file);
            doc.setContent(content);
            doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
        } else {
            // unchanged: report "not modified" without reading the content
            doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
        }
        doc.setLastModified(fileLastModified);
        doc.setDate(System.currentTimeMillis());
        doc.setURL(url);
        return doc;
    } catch (Exception e) {
        // keep a usable message even for exceptions that carry none
        String message = e.getMessage();
        HttpException wrapped = new HttpException(message != null ? message : e.toString());
        try {
            // preserve the original exception for diagnostics
            wrapped.initCause(e);
        } catch (IllegalStateException ignored) {
            // the cause was already fixed by the constructor; nothing to add
        }
        throw wrapped;
    }
}
/**
 * Get the Mime type for the given filename.
 * <p>
 * Only HTML files are recognised: a name ending in ".html" or ".htm"
 * yields "text/html". The extension is compared case-insensitively, so
 * "INDEX.HTM" or "Page.Html" are recognised as well.
 *
 * @param filename name or path of the file; may be {@code null}
 * @return "text/html" for HTML file names, {@code null} for every other
 *         name and for a {@code null} filename
 */
protected String getMimeTypeForFilename(String filename) {
    if (filename == null) {
        return null;
    }
    final String[] htmlExtensions = { ".html", ".htm" };
    for (int i = 0; i < htmlExtensions.length; i++) {
        String ext = htmlExtensions[i];
        // regionMatches returns false for a negative offset, so names
        // shorter than the extension are rejected without extra checks
        int offset = filename.length() - ext.length();
        if (filename.regionMatches(true, offset, ext, 0, ext.length())) {
            return "text/html";
        }
    }
    return null;
}
/**
 * Discards temporary crawl data: sets the stop flag back to
 * {@code false} and empties both the visited and the todo collection.
 */
protected void cleanUp() {
    // reset the stop request
    this.stopIt = false;

    // forget every task seen or queued so far
    this.visited.clear();
    this.todo.clear();
}
/**
 * Adds a new task to the todo list, but only after some checks.
 * <p>
 * The task is handed to {@code todo.add} only if
 * {@code taskAddAllowed(task)} accepts it and {@code activatedNewTasks}
 * is set; otherwise the task is silently dropped.
 *
 * @param task the task to add
 */
protected void addTask(RobotTask task) {
    boolean rejected = !taskAddAllowed(task) || !activatedNewTasks;
    if (rejected) {
        return;
    }
    todo.add(task);
}
/**
 * Adds a new task at the beginning of the todo list.
 * <p>
 * The same conditions as for adding a regular task apply: the task must
 * pass {@code taskAddAllowed(task)} and {@code activatedNewTasks} must be
 * set, otherwise nothing is added.
 *
 * @param task the task to put in front of the list
 * @see #addTask(RobotTask)
 */
protected void addTaskAtStart(RobotTask task) {
    if (!taskAddAllowed(task)) {
        return;
    }
    if (!activatedNewTasks) {
        return;
    }
    todo.addAtStart(task);
}
/**
* Checks if a tasks should be added to the task list
* @param robotTask
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -