📄 webrobot.java

📁 一个Java的网络爬虫
💻 JAVA
📖 第 1 页 / 共 4 页
字号:

		// look in the cache first, but only for static pages
		boolean reScan = true;
		if ((docManager != null && allowCaching)
			&& (task.getMethod() == HttpConstants.GET)
			&& (task.getParamString() == null)) {
			doc = docManager.retrieveFromCache(u);
/*			if (doc != null) {
				try {
					links = ((UrlCollector) docManager).retrieveLinks(doc);
				} catch (IOException e) {
					log.info("Could not get links for " + u + ": " + e.getMessage());
					links = null;
				} 
			}*/
			
			if (doc != null) {
				countCache++;
				long lastRetrieved = doc.getDateAsMilliSeconds();
				double ageInSeconds = (now - lastRetrieved) / 1000;
				if (ageInSeconds < 0) {
					log.warn("DocumentAge < 0!");
				}
				reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
				if (reScan) {
					long lastModified = doc.getLastModifiedAsMilliSeconds();
					Date lastModifiedDate = new Date(lastModified);
					httpTool.setIfModifiedSince(lastModifiedDate);
				}
			} else {
				httpTool.setIfModifiedSince(null);
			}
		}

		// if not found in cache, retrieve from the web page
		if (reScan) {
			HttpDoc newDoc;
			boolean error = false;
			try {
				if (u.getProtocol().equalsIgnoreCase("file")) {
					// retrieve from file
					newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
				} else {
					// retrieve from Web
					newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
					if (newDoc != null) {
						newDoc.setDate(now);
					}
					sleepNow();
				}
				
				if (newDoc!= null && !newDoc.isNotModified()) {
					if (!(newDoc.isOk() || newDoc.isRedirect())) {
						error = true;
					}
				} else {
					// (newDoc == null || newDoc.isNotModified()) && doc != null 
					// -> Not modified
					// -> refresh time stamp
					if (doc != null) {
						doc.setDate(now);
						doc.setCached(false);
						newDoc = null;
					}
				}
			} catch (HttpException hex) {
				error = true; newDoc = null;
			}
			if (error) {
				int retry = task.retry();
				if (retry <= maxRetries) {
					synchronized(visited) {
						todo.add(task);
						visited.remove(task);
					}
					log.info("Adding " + u + " for retry no. " + retry);
					return;
				} else {
					doc = docManager.retrieveFromCache(u);
					if (doc == null) {
						log.warn("Unsuccessfull retries for " + u);
						return;
					} else {
						long docDate = doc.getDateAsMilliSeconds();
						long age = (now - docDate);
						age /= 1000;
						if (expirationAge < 0 || age < expirationAge) {
							newDoc = doc;
							cached = true;
							log.info("Cached document not expired: " + u);
						} else {
							log.warn("Cached document expired: " + u);
							docManager.removeDocument(u);
							return;
						}
					}
				}
			}
			
			if (newDoc != null) {
				countWeb++;
				doc = newDoc;
				links = null; // force recalculation of links
				countRefresh++;
			} else {
				cached = true;
				countNoRefresh++;
			}
		} else {
			cached = true;
			log.debug("Page " + u + " retrieved from cache");
		}

		// Add it to the visited vector
		// needs to be synchronized with todo-list
//		visited.add(task); 
		
		// got a NULL document, that doc was not retrieved
		// usually, it was not downloaded because a rule didn't allow
		// to download it
		if (doc == null) {
			log.info("not downloaded " + u);
			return;
		}

		// Duplicate check
		String duplicate=null;
		if (duplicateCheck) {
			duplicate = getContentVisitedURL(doc);
			if (duplicate != null) {
				log.info("URLs with same content found: " + urlString + " = " + duplicate);
			} else {	
				try {
					duplicate = docManager.findDuplicate(doc);
					if (duplicate != null) {
						log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
					}
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			
			if (duplicate != null) {
				String pureDuplicate = removeParameters(duplicate);
				String pureUrl = removeParameters(urlString);
				if (!pureUrl.equals(pureDuplicate) && !cached) {
					// different url not yet stored -> store it
					try {
						// retrieve links from original
						HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
						if (linksDoc != null) {		
							doc.setLinks(linksDoc.getLinks());
						}
						docManager.storeDocument(doc);
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
				RobotTask newTask;
				try {
					newTask = createRobotTask(new URL(duplicate), depth, referer);
					// check already here for visited tasks to save memory
					if (!visited.contains(newTask)) {
						addTask(newTask);
					}
				} catch (MalformedURLException e) {
					e.printStackTrace(); // Can't happen
				}
				return;
			} 
		}

		// was it an UnAuthorized document ?
		if (doc.isUnauthorized()) {
			log.info("got HTTP Unauthorized for URL " + u);
		}

		if (doc.isOk() || cached) {
			// callback
			if (webRobotCallback != null) {
				int contentLength=0;
				if (doc.getContent() != null) { contentLength=doc.getContent().length; }
				webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
			}

			// extract links
			try {
				if (doc.isHTML() && (depth > 0)) {
					// solving encoding problem
					// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
					HtmlDocument htmlDoc = null;
					HttpHeader contentTypeHeader = doc.getHeader("Content-type");
					if (contentTypeHeader != null) {
						String contentType = contentTypeHeader.getValue();
						int index = contentType.toLowerCase().indexOf("charset=");
						if (index > 0) {
							htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
						} else {
							htmlDoc = new HtmlDocument(u, doc.getContent());
						}
					} else {
						htmlDoc = new HtmlDocument(u, doc.getContent());
					}
	
					// add links
					
					// this depth-check is critical!
					// otherwise far too many RobotTasks will be created
					// this will cause a premature OutOfMemoryException!
					if (depth > 0) {
						if (duplicate != null) {
							HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
							doc.setLinks(linksDoc.getLinks());
						} else if (cached) {
						} 
						if (links == null) {
							links = htmlDoc.getLinks();
							doc.setLinks(links);
						}
						if (duplicate == null) {
							HashSet checkedLinks = new HashSet();
							for (int i = 0; i < links.size(); i++) {
								URL link = (URL) links.elementAt(i);
								log.info("Link: "+link);
								// check already here for duplicate links to avoid expensive
								// creation of RobotTasks
								if (!checkedLinks.contains(link)) {
									checkedLinks.add(link);
									String myReferer = u.toString();
									if (u.getUserInfo() != null) {
										// remove userinfo from referer
										int endindex = myReferer.indexOf("@")+1;
										myReferer = "http://"+ myReferer.substring(endindex);
									}
									
									RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
									// check already here for visited tasks to save memory
									if (!visited.contains(newTask)) {
										// bad workaround to retrieve images first
										if (newTask.urlString.endsWith(".jpg")) {
											addTaskAtStart(newTask);
										} else {
											addTask(newTask);
										}
									}
								}
							}
						}
					}
					
					if (hasFormHandlers) {
						// add forms
						Vector forms = htmlDoc.getElements("form");
						for (int i = 0; i < forms.size(); i++) {
							ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
							if (eurl != null) {
								RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
								newTask.setParamString(eurl.getParams());
								newTask.setMethod(eurl.getRequestMethod());
								addTask(newTask);
							}
						}
					}
	
				}
			// catch any occuring error to keep on processing
			} catch (OutOfMemoryError e) {
				throw e;
			} catch (Throwable e){
				log.error("Unexpected error while extraction links from url '" + u + "':"+e);
				e.printStackTrace();
				// continue processing
			}

			// filter and store the document
			if ((docManager != null)) {
				try {
					if (filters != null) {
						doc = filters.process(doc);
					} else {
						log.debug("No filters defined");
					}
					
					if (isProcessingAllowed(doc)) {
						docManager.processDocument(doc);
					} else	{
						String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
						doc.setContent("Not for indexing".getBytes());
						doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
					} 
					
					try {
						docManager.storeDocument(doc);
					} catch (Exception e) {
						log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
					}
					if (activatedContentHistory && duplicate==null) {
						setContentVisitedURL(doc, urlString);
					}
				} catch (DocManagerException e1) {
					log.error("could not process document: " + e1.getMessage());
					exceptionHandler.handleException(this, u, e1);
				} catch (FilterException e2) {
					log.error(e2.getMessage());
				}
			}

		} else {
			// it was NOT a 200 return code !

			if (doc.isRedirect()) {
				String ref = doc.getLocation();
				log.info("Got redirect to " + ref);

				try {
					URL u2 = new URL(u, ref);
					// is it on another host ?

					// On a redirect, browsers use the old Referer instead of the
					// URL that got this redirect
					// Therefore we do not use u.toString as Referer but the old Referer
					RobotTask newTask = createRobotTask(u2, depth - 1, referer);

					// it will be inserted at the beginning of the vector !
					addTaskAtStart(newTask);
				} catch (MalformedURLException e) {
					// ignore this URL
				}
				// handle other values
			} else if (doc.isNotFound()) {
				// the document was not found
				exceptionHandler.handleException(this, u, new HttpException("Document not found"));
			} else if (doc.isUnauthorized()) {
				// the server answered with HTTP Unauthorized
				exceptionHandler.handleException(
					this,
					u,
					new HttpException("No authorization for the document."));
			} else {
				// an other error occured.
				exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code "+doc.getHttpCode()+")."));
			}
		}
	}

	/**
	 * Inform about spidering progress.
	 * May use iteration, startTime,
	 * countCache, countWeb, countRefresh, countNoRefresh
	 */
	public void updateProgressInfo() {
		// Deliberately empty: this implementation reports nothing.
		// NOTE(review): presumably meant to be overridden by subclasses that
		// want to display progress - confirm against the rest of the project.
	}

	/**
	 * sleep for sleepTime seconds.
	 */
	public void sleepNow() {
		// a non-positive sleepTime disables the pause entirely
		if (sleepTime <= 0) {
			return;
		}

		// The pause is taken while holding this object's monitor.
		synchronized(this) {
			// tell the callback (if any) that the pause starts
			if (webRobotCallback != null) {
				webRobotCallback.webRobotSleeping(true);
			}

			try {
				// sleepTime is in seconds; 1000L forces long arithmetic so a
				// large value cannot overflow before it reaches Thread.sleep
				Thread.sleep(sleepTime * 1000L);
			} catch (InterruptedException e) {
				// Thread.sleep clears the interrupt status when it throws.
				// Set it again so code further up the call stack can still
				// notice the interruption instead of losing it silently.
				Thread.currentThread().interrupt();
			}

			// tell the callback (if any) that the pause is over,
			// also when the sleep was cut short by an interrupt
			if (webRobotCallback != null) {
				webRobotCallback.webRobotSleeping(false);
			}
		}
	}

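	/**
	 * Retrieves a file from the local file system.
	 * @param url the URL of the file to retrieve
	 * @param ifModifiedSince the content is read only if the file was
	 *        modified after this date; <code>null</code> is treated as time 0
	 * @return HttpDoc holding an HTTP-style status code and, where
	 *         applicable, the content and MIME type of the file
	 * @throws HttpException if any exception occurs while accessing the file
	 */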
	private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
		HttpDoc doc = new HttpDoc();

		try {
			String host = url.getHost();
			String filename = url.getFile();
			if ((host == null) || (host.equals(""))) {
				// local file: strip one leading / or \ from the path
				if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
					filename = filename.substring(1);
				}
			} else {
				// a host is given: build a //host/path style name
				filename = "//" + host + filename;
			}

			// get the mimetype and put it in the http header
			// (done before the existence check, so a "not found" document
			// carries the header as well)
			String mimetypestr = getMimeTypeForFilename(filename);
			if (mimetypestr != null) {
				HttpHeader header = new HttpHeader("content-type", mimetypestr);
				doc.addHeader(header);
			}

			// a missing file is reported through the status code, not an exception
			File file = new File(filename);
			if (!file.exists()) {
				doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
				return doc;
			}

			// read the content only if the file is newer than ifModifiedSince;
			// a null date counts as time 0
			long fileLastModified = file.lastModified();
			long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
			if (fileLastModified > ifModifiedSinceTime) {
				byte[] content = readFileToByteArray(file);
				doc.setContent(content);
				doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
			} else {
				doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
			}
			doc.setLastModified(fileLastModified);
			doc.setDate(System.currentTimeMillis());
			doc.setURL(url);

			return doc;
		} catch (Exception e) {
			// Some exceptions (e.g. NullPointerException) have no message;
			// fall back to toString() so the HttpException is never blank.
			String reason = e.getMessage();
			if (reason == null) {
				reason = e.toString();
			}
			HttpException wrapped = new HttpException(reason);
			// Only the message constructor of HttpException is used in this
			// file, so the original exception is attached through initCause
			// to keep its stack trace available to whoever handles it.
			try {
				wrapped.initCause(e);
			} catch (IllegalStateException ignored) {
				// the constructor already fixed the cause; the message
				// above still carries the reason
			}
			throw wrapped;
		}
	}

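	/**
	 * Gets the MIME type for the given filename.
	 * @param filename name of the file whose MIME type is wanted
	 * @return "text/html" for names ending in .html or .htm,
	 *         <code>null</code> for everything else
	 */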
	protected String getMimeTypeForFilename(String filename) {
		// without a name there is no type to report
		if (filename == null) {
			return null;
		}

		// File extensions are matched without regard to case, so names such
		// as INDEX.HTML or page.Htm are recognised as HTML too.
		String[] htmlSuffixes = { ".html", ".htm" };
		for (int i = 0; i < htmlSuffixes.length; i++) {
			String suffix = htmlSuffixes[i];
			int offset = filename.length() - suffix.length();
			// offset < 0 means the name is shorter than the suffix
			if (offset >= 0 && filename.regionMatches(true, offset, suffix, 0, suffix.length())) {
				return "text/html";
			}
		}

		// every other extension is unknown
		return null;
	}
	
	/** 
	 * Clean up temporary data
	 */
	protected void cleanUp() {
		// put the stopIt flag back to false
		// NOTE(review): presumably so that a later run is not stopped
		// right away - confirm where stopIt is read
		stopIt = false;
		// empty the visited and the todo collections
		visited.clear();
		todo.clear();
	}

	/**
	 * Adds a new task to the task list, but only if
	 * <code>taskAddAllowed(task)</code> accepts it and the
	 * <code>activatedNewTasks</code> flag is set.
	 */
	protected void addTask(RobotTask task) {
		// taskAddAllowed is consulted first; the activatedNewTasks switch is
		// only looked at when the task itself is acceptable
		boolean rejected = !taskAddAllowed(task) || !activatedNewTasks;
		if (rejected) {
			return;
		}

		// both checks passed: append to the todo list
		todo.add(task);
	}

	/**
	 * Adds a new task at the beginning of the task list.
	 * @see #addTask(RobotTask)
	 */
	protected void addTaskAtStart(RobotTask task) {
		// the task itself must be acceptable ...
		if (!taskAddAllowed(task)) {
			return;
		}
		// ... and adding new tasks must be switched on
		if (!activatedNewTasks) {
			return;
		}

		// put the task in front of everything already waiting
		todo.addAtStart(task);
	}

	/**
	 * Checks if a tasks should be added to the task list
	 * @param robotTask
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -