
📄 webrobot.java

📁 Source code of a real web crawler. Read it carefully and write down what you learn from it.
💻 JAVA
📖 Page 1 of 4
	}

	/**
	 * Should the robot ignore the robots.txt Robot Exclusion protocol?
	 * @param ignoreRobotsTxt if set to true, the robot will ignore
	 * the settings of the /robots.txt file on the webserver
	 * <b>Know what you are doing if you change this setting</b>
	 */
	public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
		// keep the field in sync so that getIgnoreRobotsTxt() reflects the change
		this.ignoreRobotsTxt = ignoreRobotsTxt;
		robCheck.setIgnore(ignoreRobotsTxt);
	}

	/** 
	 * @return the sleeptime setting
	 */
	public int getSleepTime() {
		return sleepTime;
	}

	/**
	 * Sets the sleep time.<br />
	 * After every retrieved document the robot will wait this long
	 * before getting the next document. This limits the load
	 * placed on the server.
	 * @param sleepTime wait time in seconds
	 */
	public void setSleepTime(int sleepTime) {
		this.sleepTime = sleepTime;
	}

	/**
	 * Sets the From: HTTP header.<br />
	 * This should be a valid email address. It is not needed by the robot,
	 * but you should set it, because the administrator of the web server
	 * can contact you if the robot is doing things that they don't want
	 * @param fromAddress an RFC 822 email address
	 */
	public void setFromAddress(String fromAddress) {
		httpTool.setFromAddress(fromAddress);
	}

	/**
	 * sets the list of form handlers
	 * @see net.matuschek.html.FormHandler for more 
	 * information about form handlers
	 */
	public void setFormHandlers(Vector handlers) {
		formFiller.setFormHandlers(handlers);
		// also reset the flag when an empty list replaces a non-empty one
		hasFormHandlers = (handlers != null && handlers.size() > 0);
	}

	/**
	 * @return the list of form handlers
	 * @see net.matuschek.html.FormHandler for more information 
	 * about form handlers
	 */
	public Vector getFormHandlers() {
		return formFiller.getFormHandlers();
	}

	/**
	 * Gets the name of the "User-Agent" header that the robot will use
	 * @return the user agent name 
	 */
	public String getAgentName() {
		if (httpTool != null) {
			return httpTool.getAgentName();
		} else {
			return null;
		}
	}

	/**
	 * Sets the User-Agent name for this robot and re-creates the
	 * robots.txt checker under that name
	 * @param name a name for this robot 
	 * (e.g. "Mozilla 4.0 (compatible; Robot)")
	 */
	public void setAgentName(String name) {
		httpTool.setAgentName(name);
		robCheck = new NoRobots(name, httpTool);
	}
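
	/*
	 * Hedged usage sketch, not part of the original source: shows how the
	 * setters above combine into a polite crawler configuration. The no-arg
	 * WebRobot constructor and the setStartURL(URL) setter are assumptions;
	 * they do not appear in this excerpt.
	 */
	private static void politeConfigurationSketch() throws Exception {
		WebRobot robot = new WebRobot();
		robot.setAgentName("MyCrawler/1.0 (compatible; Robot)"); // User-Agent header
		robot.setFromAddress("crawler-admin@example.org");       // From: header (RFC 822)
		robot.setSleepTime(2);           // pause 2 seconds between documents
		robot.setIgnoreRobotsTxt(false); // honor the Robot Exclusion protocol
		robot.setStartURL(new URL("http://www.example.org/"));   // assumed setter
		robot.run();                     // run() simply calls work(), see below
	}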

	/**
	 * Gets the timeout (in seconds) of the underlying HttpTool
	 * @return the value of socketTimeout, or -1 if no HttpTool is set
	 * @see #setTimeout(int)
	 */
	public int getTimeout() {
		if (httpTool != null) {
			return httpTool.getTimeout();
		} else {
			return -1;
		}
	}

	/**
	 * Sets the timeout for getting data. If HttpTool can't read data from a
	 * remote web server after this number of seconds it will stop the download
	 * of the current file
	 * @param timeout Timeout in seconds
	 */
	public void setTimeout(int timeout) {
		httpTool.setTimeout(timeout);
	}

	/**
	 * Gets the ntlmAuthorization of the robot
	 * @return the ntlmAuthorization, or null if no HttpTool is set
	 */
	public NTLMAuthorization getNtlmAuthorization() {
		if (httpTool != null) {
			return httpTool.getNtlmAuthorization();
		} else {
			return null;
		}
	}

	/**
	 * Sets an ntlmAuthorization for this robot
	 * @param ntlmAuthorization the NTLM authorization for this robot 
	 */
	public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
		httpTool.setNtlmAuthorization(ntlmAuthorization);
	}

	/**
	 * Gets the setting of the IgnoreRobotsTxt property
	 * @return true if robots.txt will be ignored, false otherwise
	 */
	public boolean getIgnoreRobotsTxt() {
		return ignoreRobotsTxt;
	}

	/**
	 * Gets a vector of URLs that can be visited more than once
	 * @return a vector containing URLs formatted as Strings
	 */
	public Vector getVisitMany() {
		return visitMany;
	}

	public void setVisitMany(Vector visitMany) {
		this.visitMany = visitMany;
	}

	public void setHttpToolCallback(HttpToolCallback callback) {
		httpTool.setCallback(callback);
	}

	public WebRobotCallback getWebRobotCallback() {
		return webRobotCallback;
	}

	public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
		this.webRobotCallback = webRobotCallback;
	}
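
	/*
	 * Hedged sketch, not part of the original source: a logging callback
	 * covering the three WebRobotCallback methods this excerpt invokes
	 * (webRobotSleeping, webRobotUpdateQueueStatus, webRobotDone). The
	 * interface may declare further methods that a real implementation
	 * would also have to provide.
	 */
	static class LoggingCallbackSketch implements WebRobotCallback {
		public void webRobotSleeping(boolean sleeping) {
			System.out.println(sleeping ? "robot paused" : "robot resumed");
		}
		public void webRobotUpdateQueueStatus(int queueSize) {
			System.out.println("tasks left in queue: " + queueSize);
		}
		public void webRobotDone() {
			System.out.println("crawl finished");
		}
	}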

	/**
	 * Sets the sleep status for this robot. If a WebRobot is set to sleep
	 * after run() has started, it will pause after retrieving the current
	 * document and wait until setSleep(false) is called
	 */
	public void setSleep(boolean sleep) {
		this.sleep = sleep;
	}

	/**
	 * Is the robot sleeping?
	 */
	public boolean isSleeping() {
		return this.sleep;
	}

	/** 
	 * Set the list of allowed URLs
	 * @param allowed a Vector containing Strings; a URL is allowed
	 * if it begins with one of the strings in this vector
	 */
	public void setAllowedURLs(Vector allowed) {
		this.allowedURLs = allowed;
	}

	/**
	 * Gets the list of allowed URLs
	 * @return a Vector containing Strings
	 * @see #setAllowedURLs(Vector)
	 */
	public Vector getAllowedURLs() {
		return this.allowedURLs;
	}
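
	/*
	 * Hedged sketch, not part of the original source: a prefix-based allow
	 * check matching the behavior described for setAllowedURLs. The real
	 * isAllowed used by retrieveURL below is not part of this excerpt and
	 * may differ.
	 */
	private boolean isAllowedSketch(URL url) {
		if (allowedURLs == null || allowedURLs.isEmpty()) {
			return true; // no restriction configured
		}
		String urlString = url.toString();
		for (int i = 0; i < allowedURLs.size(); i++) {
			if (urlString.startsWith((String) allowedURLs.elementAt(i))) {
				return true; // URL begins with an allowed prefix
			}
		}
		return false;
	}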
	
	/**
	 * Enable/disable cookies
	 * @param enable if true, HTTP cookies will be enabled, if false
	 * the robot will not use cookies
	 */
	public void setEnableCookies(boolean enable) {
		httpTool.setEnableCookies(enable);
	}

	/**
	 * Get the status of the cookie engine
	 * @return true, if HTTP cookies are enabled, false otherwise
	 */
	public boolean getEnableCookies() {
		return httpTool.getEnableCookies();
	}

	/** 
	 * Set the maximum age of documents to retrieve to this number
	 * of seconds
	 * @param maxAge integer value of the maximum document age 
	 * (in seconds), negative value means no limit.
	 */
	public void setMaxDocumentAge(long maxAge) {
		this.maxDocumentAge = maxAge;
	}
	


	/**
	 * Gets the maximum age of documents to retrieve
	 * @return maximum document age (in seconds), negative value means 
	 * no limit.
	 */
	public long getMaxDocumentAge() {
		return this.maxDocumentAge;
	}

	/**
	 * Sets a FilterChain. If the WebRobot uses a FilterChain it will
	 * pass every retrieved document through the FilterChain before
	 * storing it
	 *
	 * @param filters a FilterChain to use for filtering HttpDocs
	 */
	public void setFilters(FilterChain filters) {
		this.filters = filters;
	}

	/**
	 * Delete all cookies
	 */
	public void clearCookies() {
		httpTool.clearCookies();
	}

	/**
	 * thread run() method, simply calls work()
	 * @see #work()
	 */
	public void run() {
		work();
	}

	/**
	 * Does the actual work: travels through the web using the
	 * configured parameters and retrieves documents
	 */
	public void work() {
		RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
		todo.add(task);
		walkTree();
		// done; clean up dynamic data (the visited vector)
		cleanUp();
		log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh+ " NoRefresh=" + countNoRefresh);
	}

	/**
	 * Stop the current robot run.
	 * Note that this will not abort the current download, but stop after
	 * the current download has finished
	 */
	public void stopRobot() {
		stopIt = true;
	}

	/**
	 * Holds information about memory status.
	 * @see #handleMemoryError(OutOfMemoryError)
	 */
	private int memoryLevel = 0;
	
	/** Can new tasks be added? (may depend on memoryLevel) */
	protected boolean activatedNewTasks = true;
	
	/** Are visited URLs collected? (may depend on memoryLevel) */
	protected boolean activatedUrlHistory = true;
	
	/** Are visited contents collected? (may depend on memoryLevel) */
	protected boolean activatedContentHistory = true;
	
	/** memory buffer of 200 KB to be freed in case of urgent memory needs */
	private byte memoryBuffer[] = new byte[200 * 1024];

	/**
	 * Main worklist loop: processes tasks from the todo list until it is
	 * empty or the robot has been stopped
	 */
	public void walkTree() {
		while ((todo.size() > 0) && (!stopIt)) {
			RobotTask task;
			synchronized(visited) {
				task = todo.removeFirst();
				if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
					log.debug("already visited: " + task.getUrl());
					continue;
				}
				if (activatedUrlHistory) {
					visited.add(task);
				}
			}
			
			boolean repeat = true;
			while (repeat) {
				try {
					retrieveURL(task);
					repeat = false;
				} catch (OutOfMemoryError memoryError) {
					handleMemoryError(memoryError); 
				}
			}

			// sleep, if sleep is set to true
			while (sleep) {
				// callback
				if (webRobotCallback != null) {
					webRobotCallback.webRobotSleeping(true);
				}

				try {
					Thread.sleep(1000);
				} catch (InterruptedException e) {
					// ignore and re-check the sleep flag
				}
			}

			// callback
			if (webRobotCallback != null) {
				webRobotCallback.webRobotSleeping(false);
			}

			// callback
			if (webRobotCallback != null) {
				webRobotCallback.webRobotUpdateQueueStatus(todo.size());
			}
			spawnThread();
		}

		// callback
		if (webRobotCallback != null) {
			finishThreads();
		}
	}

	/**
	 * Implements OutOfMemoryError handling strategies; the action taken
	 * depends on the current memoryLevel
	 * @param memoryError the error that triggered this handler
	 * @throws OutOfMemoryError if no recovery strategy is left
	 */
	protected void handleMemoryError(OutOfMemoryError memoryError)
		throws OutOfMemoryError {
		memoryLevel++;
		log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
		switch (memoryLevel) {
			case 1:
				// don't remember visited URLs and contents any more
				// and try it again
				visited.clear(); activatedUrlHistory = false;
				content2UrlMap.clear(); activatedContentHistory = false;
				System.gc();
				break;
			case 2:
				// stop adding new Tasks, just process todo-list.
				// free memory buffer 
				// and try it again 
				activatedNewTasks = false;
				memoryBuffer = null;
				System.gc();
				break;
			case 3:
				// there is nothing we can do any more.
				// throw exception to stop robot
				throw memoryError;
			default:
				// Should never be reached.
				if (memoryBuffer != null) {
					// avoid removal of memoryBuffer by compiler
					System.err.println(memoryBuffer[0]);
				}
				throw memoryError;
		}
	}
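
	/*
	 * Hedged sketch, not part of the original source: the memoryBuffer field
	 * above is a "ballast" reserve. Preallocating it means that when an
	 * OutOfMemoryError strikes, dropping the reference frees about 200 KB so
	 * the recovery code itself has room to allocate. In miniature:
	 */
	private static void ballastPatternSketch(Runnable riskyWork) {
		byte[] ballast = new byte[200 * 1024]; // emergency reserve
		try {
			riskyWork.run();
		} catch (OutOfMemoryError oom) {
			ballast = null; // release the reserve so recovery can allocate
			System.gc();
			System.err.println("recovered from OutOfMemoryError, degrading gracefully");
		}
	}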

	/**
	 * calls webRobotDone and finishes docManager if 
	 * executed in mainThread
	 */
	protected void finishThreads() {
		webRobotCallback.webRobotDone();
		if (docManager != null) {
			docManager.finish();
		}
	}
	
	/**
	 * Start subThreads for spidering.
	 * WARNING: Should only be implemented and used for local
	 * spidering purposes!
	 */
	protected synchronized void spawnThread() {
	}
	
	/** counter for calls of retrieveURL */
	protected int iteration = 0;
	
	/**
	 * retrieve the next URL, save it, extract all included links and
	 * add those links to the tasks list
	 * @param task the task to retrieve; the method does nothing if this is null
	 */
	public void retrieveURL(RobotTask task) {
		if (task == null) {
			log.debug("Empty task found, ignoring");
			return;
		}
		
		long now = System.currentTimeMillis();

		updateProgressInfo();

		URL u = task.getUrl();
		String urlString = u.toString();
		String referer = task.getReferer();
		int depth = task.getMaxDepth();

		if (depth < 0) {
			log.info("Max search depth reached");
			return;
		}

		// we may need this additional check even though we
		// tested it when adding to the task list
		if (!isAllowed(u)) {
			log.info("Url '" + u + "' filtered out.");
			return;
		}

		if (u.getFile().equals("")) {
			try {
				urlString = urlString + "/";
				u = new URL(urlString);
				// fix for double retrieved files
				task.setUrl(u);
			} catch (MalformedURLException e) {
				log.error("URL not well formed: " + e.toString());
				// use exception handler to handle exception
				exceptionHandler.handleException(this, u, e);
				return;
			}
		}

		log.info("retrieving " + urlString);
		httpTool.setReferer(referer);

		HttpDoc doc = null;
		Vector links = null;
		boolean cached = false;
