📄 webrobot.java

📁 一个Java的网络爬虫
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
	 * @return true if this tasks can be added to the task list,
	 * false otherwise
	 */
	protected boolean taskAddAllowed(RobotTask task) {
		// a missing task can never be queued
		if (task == null) {
			log.info("Null task not allowed");
			return false;
		}

		// the target URL has to pass the URL checks and the task must not
		// already be contained in the todo list
		return isAllowed(task.getUrl()) && !todo.contains(task);
	}

	/**
	 * Decides whether the robot may travel to the given URL.
	 * The URL has to pass the basic URL check, then the optional URLCheck
	 * (only consulted if one is set) and finally the robots.txt check.
	 * @param u the URL to test
	 * @return true if traveling to this URL is allowed, false otherwise
	 */
	protected boolean isAllowed(URL u) {

		// the basic checks come first, nothing else is asked if they fail
		if (!basicURLCheck(u)) {
			return false;
		}

		// an URLCheck, if present, may reject the URL
		if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
			log.debug("not allowed by URLCheck:" + u);
			return false;
		}

		// the robots.txt check has the last word
		boolean robotsOk = robCheck.ok(u);
		if (!robotsOk) {
			log.debug("not allowed by robots.txt:" + u);
		}
		return robotsOk;
	}
	
	/**
	 * Decides whether the given document may be processed.
	 * The document URL is tested against the URLCheck (if one is set) and
	 * the HTTP headers against the DownloadRuleSet of the HttpTool (if it
	 * has one).
	 * @param doc the document to test
	 * @return true if processing of this document is allowed
	 */
	protected boolean isProcessingAllowed(HttpDoc doc) {
		URL u = doc.getURL();

		boolean rejectedByUrlCheck =
			(urlCheck != null) && (!urlCheck.checkURLForProcessing(u));
		if (rejectedByUrlCheck) {
			log.debug("processing not allowed by URLCheck:" + u);
			return false;
		}

		// without a rule set there is nothing left that could forbid processing
		DownloadRuleSet rules = httpTool.getDownloadRuleSet();
		if (rules == null || rules.processAllowed(doc.getHttpHeaders())) {
			return true;
		}

		log.debug("processing not allowed by DownloadRuleSet:" + u);
		return false;
	}

	/**
	 * Basic URL allow check
	 * it is allowed to walk to a new URL if <ul>
	 *  <li>WalkToOtherHost is true. In this case there will be no additional
	 *      tests.</li>
	 *  <li>The new URL is located below the start URL, e.g. is the start URL
	 *      is http://localhost/test, the URL http://localhost/test/index.html
	 *      is allowed, but http://localhost/ is not allowed.</li>
	 *  <li>AllowWholeHost is true and the new URL is located on the same host
	 *      as the start URL.</li>
	 *  <li>FlexibleHostCheck is true and the host part of the current URL
	 *      is equal to the host part of the start URL modulo the prefix "www."
	 *      </li>
	 *  <li>AllowWholeDomain is true and the host of the new URL is the domain
	 *      of the start host (see getDomain) or a sub domain of it. The match
	 *      is done on a "." boundary, so for the domain "example.com" the host
	 *      "www.example.com" is allowed, but "evil-example.com" is not.</li>
	 *  <li>The URL starts with a string in the "AllowedURLs" list.</li>
	 * </ul>
	 *
	 * @param currURL the URL to test
	 * @return true if one of the rules above allows the URL, false otherwise
	 */
	protected boolean basicURLCheck(URL currURL) {
		// host + path without protocol, port and query; used for prefix tests
		String currURLStr = currURL.getHost() + currURL.getPath();
		String currHost = currURL.getHost().toLowerCase();
		String startHost = startURL.getHost().toLowerCase();

		// no more checks, if walkToOtherHosts is true
		if (walkToOtherHosts) {
			return true;
		}

		// new URL below start URL ?
		if (currURLStr.startsWith(startDir)) {
			return true;
		}

		// on the same host ?
		if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
			return true;
		}

		// on the same host with flexible test (host name with and without "www."
		if (flexibleHostCheck) {
			if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
				return true;
			}
		}

		// allow whole domain ?
		if (allowWholeDomain) {
			String startDomain = getDomain(startHost);
			// A plain endsWith(startDomain) would also accept foreign hosts
			// that only share the trailing characters (e.g. "evil-example.com"
			// for "example.com"), so require an exact match or a "." boundary.
			if (currHost.equals(startDomain) || currHost.endsWith("." + startDomain)) {
				return true;
			}
		}

		// in the list of allowed URLs ?
		for (int i = 0; i < allowedURLs.size(); i++) {
			String s = (String) allowedURLs.elementAt(i);
			if (currURLStr.startsWith(s)) {
				return true;
			}
		}
		log.debug("URL " + currURLStr + " not allowed");
		return false;
	}

	/**
	 * Strips a leading "www." (in any letter case) from a hostname.
	 * 
	 * @param hostname some hostname
	 * @return the hostname without its first four characters if it starts
	 *  with "www.", otherwise the unchanged hostname
	 */
	private String cutWWW(String hostname) {
		boolean hasWwwPrefix = hostname.toLowerCase().startsWith("www.");
		// 4 == length of "www."
		return hasWwwPrefix ? hostname.substring(4) : hostname;
	}

	/** 
	 * Gets the domain part of a given host by cutting off everything up to
	 * and including the first ".", e.g. "www.example.com" becomes
	 * "example.com".
	 *
	 * @param hostname some hostname
	 * @return the part of the hostname behind the first ".", or the
	 *  unchanged hostname if it contains no "." at all
	 */
	private String getDomain(String hostname) {
		int firstDot = hostname.indexOf(".");
		// a hostname without any dot should not happen, it is returned as is
		return (firstDot < 0) ? hostname : hostname.substring(firstDot + 1);
	}

	/**
	 * Gets the exception handler of the robot.
	 * @return RobotExceptionHandler the handler that is currently set
	 */
	public RobotExceptionHandler getExceptionHandler() {
		return this.exceptionHandler;
	}

	/**
	 * Sets the exception handler of the robot.
	 * A null argument is ignored, the current handler stays in place.
	 * @param newExceptionHandler the new exception handler
	 */
	public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
		if (newExceptionHandler == null) {
			// never replace the handler by null
			return;
		}
		this.exceptionHandler = newExceptionHandler;
	}

	/**
	 * Sets the start URL from its string form.
	 * If the string is not a valid URL, the stack trace of the
	 * MalformedURLException is printed and setStartURL is not called.
	 * @param startURL the start URL as String
	 */
	public void setStart(String startURL) {
		try {
			URL parsedStart = new URL(startURL);
			setStartURL(parsedStart);
		} catch (MalformedURLException malformed) {
			malformed.printStackTrace();
		}
	}

	/**
	 * Gets the start URL in its string form.
	 * @return String the external form of the start URL, or null if
	 *  getStartURL() returns null
	 */
	public String getStart() {
		URL start = getStartURL();
		return (start == null) ? null : start.toExternalForm();
	}

	/**
	 * Finishes the helper objects of the robot: calls finish() on
	 * httpTool, robCheck and docManager (in this order), skipping every
	 * one of them that is null.
	 */
	public void finish() {
		// HTTP tool
		if (httpTool != null) {
			httpTool.finish();
		}

		// robots.txt checker
		if (robCheck != null) {
			robCheck.finish();
		}

		// document manager
		if (docManager != null) {
			docManager.finish();
		}
	}

	/**
	 * Prints one line of the form "robot.NAME = NAME;" for every field
	 * declared in WebRobot that is neither final nor static. Each line is
	 * padded with blanks to at least 50 characters and followed by the
	 * field type as a comment. Command line arguments are ignored.
	 * NOTE(review): presumably a helper to generate field copy code.
	 *
	 * @param args ignored (a warning is printed if any are given)
	 */
	public static void main(String[] args) {
		if (args.length > 0) {
			System.err.println("Arguments will be ignored!");
		}

		Field[] fields = WebRobot.class.getDeclaredFields();
		StringBuffer line = new StringBuffer(60);
		for (int i = 0; i < fields.length; i++) {
			Field field = fields[i];
			int modifiers = field.getModifiers();
			if (Modifier.isFinal(modifiers) || Modifier.isStatic(modifiers)) {
				// constants and class level fields are skipped
				continue;
			}

			// reuse the buffer for every line
			line.delete(0, line.length());
			line.append("		robot.");
			line.append(field.getName());
			line.append(" = ");
			line.append(field.getName());
			line.append(";");
			while (line.length() < 50) {
				line.append(" ");
			}
			System.out.println(line.toString() + "// (" + field.getType().getName() + ")");
		}
	}

	/** default expected count of documents */
	private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
	
	/** expected count of documents, initialized with DEFAULT_EXPECTED_DOCUMENT_COUNT */
	protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
	 
	/**
	 * remember visited content here (md5, urlString);
	 * accessed by getContentVisitedURL / setContentVisitedURL while
	 * synchronizing on the map itself
	 */ 
	protected HashMap content2UrlMap;

	/**  counter for pages that were found in cache */
	long countCache = 0;
	
	/** counter for pages retrieved by web */
	long countWeb = 0;
	
	/** counter for pages that didn't need a refresh */
	long countNoRefresh = 0;
	
	/** counter for refreshed pages (=cache+web) */
	long countRefresh = 0;
	
	/**
	 * Looks up the URL under which the content of the given document was
	 * stored before. The MD5 value of the document content is used as key
	 * into content2UrlMap.
	 * @param doc the document whose content is looked up
	 * @return found url or null if not found
	 */
	public String getContentVisitedURL(HttpDoc doc) {
		Object md5Key = doc.getContentMD5();
		synchronized (content2UrlMap) {
			return (String) content2UrlMap.get(md5Key);
		}
	}
	
	/**
	 * Makes an URL retrievable by its content: stores the URL in
	 * content2UrlMap, using the MD5 value of the document content as key.
	 * @param doc the document whose content identifies the URL
	 * @param url the URL to remember for this content
	 */
	public void setContentVisitedURL(HttpDoc doc, String url) {
		Object md5Key = doc.getContentMD5();
		synchronized (content2UrlMap) {
			content2UrlMap.put(md5Key, url);
		}
	}
	
	/**
	 * Creates a RobotTask for the given URL after the waste parameters
	 * have been removed from it.
	 * @param url the URL of the task
	 * @param maxDepth passed on to the RobotTask constructor
	 * @param startReferer passed on to the RobotTask constructor
	 * @return the new task
	 */
	private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
		URL cleanedUrl = removeWasteParameters(url);
		return new RobotTask(cleanedUrl, maxDepth, startReferer);
	}

	/** only true if form-handlers are defined */
	boolean hasFormHandlers = false;
	
	/**
	 * list of wasteParameters (Strings with the names of URL parameters);
	 * removeWasteParameters removes these parameters from URLs
	 */
	protected Vector wasteParameters = new Vector();
	
	/** 
	 * Sets the list of wasteParameters (will be removed from URLs).
	 * The given Vector is stored as it is, no copy is made.
	 * @param wasteParameters a Vector containing the parameter names
	 *  as Strings
	 */
	public void setWasteParameters(Vector wasteParameters) {
		// keep the reference of the caller
		this.wasteParameters = wasteParameters;
	}

	/**
	 * Gets the list of wasteParameters (will be removed from URLs).
	 * The internal Vector itself is returned, not a copy.
	 * @return a Vector containing Strings
	 */
	public Vector getWasteParameters() {
		return wasteParameters;
	}

	/**
	 * Removes the wasteParameters (e.g. an ID) from an URL.
	 * If nothing had to be removed, the given URL object is returned.
	 * If the filtered string can't be parsed as URL, the stack trace is
	 * printed and the given URL is returned unchanged.
	 * @param url the URL to clean
	 * @return URL the URL without waste parameters
	 */
	public URL removeWasteParameters(URL url) {
		String urlString = url.toExternalForm();
		String newUrlString = removeParametersFromString(urlString, wasteParameters);

		// compare the content, not the references: a reference comparison
		// only works as long as the helper hands back the very same object
		if (urlString.equals(newUrlString)) {
			return url;
		}

		try {
			return new URL(newUrlString);
		} catch (MalformedURLException ex) {
			ex.printStackTrace();
			// fall back to the original URL
			return url;
		}
	}
	
	/**
	 * Removes the passed parameters from the query part of an URL string.
	 * A query parameter is removed if it has the form "name=..." and
	 * "name" is contained in wasteParameters. A fragment ("#...") behind
	 * the query is kept. A "?" that is located inside the fragment does not
	 * start a query, such URLs are returned unchanged.
	 * If nothing was removed, the passed string object itself is returned.
	 * @param urlString the URL as String (null is returned as null)
	 * @param wasteParameters Vector of parameter names (Strings), null or
	 *  empty means that nothing is removed
	 * @return String the URL string without the waste parameters
	 */
	public static String removeParametersFromString(String urlString, Vector wasteParameters) {
		if (urlString == null || wasteParameters == null || wasteParameters.size() == 0) {
			return urlString;
		}

		int questionMark = urlString.indexOf("?");
		if (questionMark <= 0) {
			// no query part at all
			return urlString;
		}

		// the fragment starts at the first "#"; a "?" behind it belongs to
		// the fragment and is no query separator
		int fragmentStart = urlString.indexOf("#");
		if (fragmentStart >= 0 && fragmentStart < questionMark) {
			return urlString;
		}

		String parameters;
		String fragment;
		if (fragmentStart < 0) {
			parameters = urlString.substring(questionMark + 1);
			fragment = "";
		} else {
			parameters = urlString.substring(questionMark + 1, fragmentStart);
			fragment = urlString.substring(fragmentStart);
		}

		StringBuffer filteredUrl = new StringBuffer(urlString.substring(0, questionMark));
		StringTokenizer tokenizer = new StringTokenizer(parameters, "&");
		// the first parameter that is kept gets "?", all others "&"
		String separator = "?";
		boolean changed = false;
		while (tokenizer.hasMoreTokens()) {
			String token = tokenizer.nextToken();
			if (startsWithWasteParameter(token, wasteParameters)) {
				changed = true;
			} else {
				filteredUrl.append(separator);
				filteredUrl.append(token);
				separator = "&";
			}
		}

		if (!changed) {
			// hand back the original object, so callers can detect "unchanged"
			return urlString;
		}
		filteredUrl.append(fragment);
		return filteredUrl.toString();
	}

	/**
	 * Tests if a single query token ("name=value") belongs to one of the
	 * waste parameters.
	 */
	private static boolean startsWithWasteParameter(String token, Vector wasteParameters) {
		for (int w = 0; w < wasteParameters.size(); w++) {
			String wasteParameter = (String) wasteParameters.elementAt(w);
			if (token.startsWith(wasteParameter + "=")) {
				return true;
			}
		}
		return false;
	}
	
	/**
	 * time of WebRobot start in milliseconds
	 * (System.currentTimeMillis() at the time this field is initialized)
	 */
	protected long startTime = System.currentTimeMillis();
	
	/** number of allowed retries for document retrieval, 0 by default */
	protected int maxRetries = 0;
	
	/**
	 * Sets the number of allowed retries for document retrieval.
	 * @param maxRetries the new number of retries
	 */
	public void setMaxRetries(int maxRetries) {
		this.maxRetries = maxRetries;
	}
	
	/**
	 * Gets the number of allowed retries for document retrieval.
	 * @return maxRetries
	 */
	public int getMaxRetries() {
		return this.maxRetries;
	}
	
	/** 
	 * expiration age of documents in cache.
	 * Documents older than expirationAge will be removed,
	 * negative value means no limit (the default is -1).
	 * NOTE(review): the unit of the age is not visible here - confirm
	 * against the code that evaluates this field.
	 */
	protected long expirationAge = -1;
	
	/**
	 * Sets the expiration age of documents in cache.
	 * Documents older than expirationAge will be removed,
	 * negative value means no limit. 
	 * @param age the new expiration age
	 */
	public void setExpirationAge(long age) {
		this.expirationAge = age;
	}
	
	/**
	 * Gets the expiration age of documents in cache.
	 * @return long the expiration age, a negative value means no limit
	 */
	public long getExpirationAge() {
		return this.expirationAge;
	}
	
	/**
	 * Cuts off the first "?" of an URL string and everything behind it.
	 * @param url the URL as String
	 * @return url without parameters, the unchanged string if it
	 *  contains no "?"
	 */
	private final static String removeParameters(String url) {
		int queryStart = url.indexOf("?");
		if (queryStart < 0) {
			return url;
		}
		return url.substring(0, queryStart);
	}
	
	/**
	 * Reads a File to a byte array.
	 * The array is sized by the file length at the time of the call and
	 * filled by reading until it is full, because a single read call is
	 * allowed to deliver fewer bytes than requested. If the end of the file
	 * is reached earlier (e.g. the file was shortened meanwhile), only the
	 * bytes that were really read are returned.
	 * @param file the file to read
	 * @return byte[] the content of the file
	 * @throws IOException if the file can't be opened or read or if it is
	 *  too large for a byte array
	 */
	protected byte[] readFileToByteArray(File file) throws IOException
	{
		long fileLength = file.length();
		if (fileLength > Integer.MAX_VALUE)
		{
			// the int cast below would overflow
			throw new IOException("File too large to read into a byte array: " + file);
		}

		FileInputStream in = null;

		try
		{
			byte[] buffer = new byte[(int) fileLength];
			in = new FileInputStream(file);

			// read until the buffer is full or the stream ends
			int offset = 0;
			while (offset < buffer.length)
			{
				int count = in.read(buffer, offset, buffer.length - offset);
				if (count < 0)
				{
					break;
				}
				offset += count;
			}

			if (offset < buffer.length)
			{
				// premature end of file: don't return zero padded content
				byte[] shortened = new byte[offset];
				System.arraycopy(buffer, 0, shortened, 0, offset);
				return shortened;
			}

			return buffer;
		}
		finally
		{
			if (in != null)
			{
				try
				{
					in.close();
				}
				catch (IOException ignored)
				{
					// the content is already read (or the read failed with its
					// own exception), a failing close must not hide that
				}
			}
		}
	}
	
}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -