⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regexpurlcheck.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
字号:
package net.matuschek.spider;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
 *************************************************/


import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.regexp.RESyntaxException;


/**
 * A {@link URLCheck} that decides about a URL by testing it against an
 * ordered list of allow/deny rules based on regular expressions.
 *
 * <p>Rules are tried in the order in which they were added; the first
 * rule that matches the URL decides the result. If no rule matches,
 * the configurable default result is returned.</p>
 * 
 * @author Daniel Matuschek
 * @version $Revision: 1.4 $
 */
public class RegExpURLCheck 
implements URLCheck
{
	/** ordered list of rules; the first matching rule decides */
	private Vector<RegExpRule> rules = null;

	/** default check result if no matching regexp was found */
	private boolean defaultResult = true;

	/** Initializes the object with an empty rule set. */
	public RegExpURLCheck() {
		rules = new Vector<RegExpRule>();
	}

	/** 
	 * <p>Initializes the object with a rule set read from a 
	 * character stream (e.g. a file).</p>
	 *
	 * <p>Every rule line of this stream has the format 
	 * <code>allow|deny expression</code>, i.e. exactly two
	 * whitespace-separated fields. The keyword is matched
	 * case-insensitively.</p>
	 *
	 * <p>The default value can be set with
	 * <code>allow|deny .</code> at the end of the file.</p>
	 *  
	 * <p>Empty lines and lines whose first non-whitespace character
	 * is "#" are ignored.</p>
	 *
	 * <p>The stream is read until its end. It is not closed by this
	 * constructor; closing it remains the caller's job.</p>
	 *
	 * @param r the character stream to read the rules from
	 * @throws IOException if reading fails, if a rule line does not 
	 * consist of exactly two fields, or if its first field is neither
	 * "allow" nor "deny"
	 * @throws org.apache.regexp.RESyntaxException if 
	 * {@link #addRule(String, boolean)} rejects an expression
	 */
	public RegExpURLCheck(Reader r) 
	throws IOException, 
	org.apache.regexp.RESyntaxException
	{
		this();

		BufferedReader reader = new BufferedReader(r);
		String line;
		int lineno = 0;

		while ((line = reader.readLine()) != null) {
			// count every physical line so that error messages
			// point to the right place in the input
			lineno++;

			// decide on the trimmed text, so that indented comments
			// are skipped exactly like indented blank lines
			String trimmed = line.trim();
			if ((trimmed.length() == 0) || trimmed.startsWith("#")) {
				continue;
			}

			StringTokenizer st = new StringTokenizer(line);
			// did we get 2 tokens ?
			if (st.countTokens() != 2) {
				throw new IOException("line "+lineno+" don't consists of 2 fields");
			}

			String allowStr = st.nextToken();
			String expression = st.nextToken();

			// allow or deny ?
			boolean allow;
			if (allowStr.equalsIgnoreCase("allow")) {
				allow = true;
			} else if (allowStr.equalsIgnoreCase("deny")) {
				allow = false;
			} else {
				throw new IOException("first token in line "+lineno+
				" has to be allow or deny");
			}

			addRule(expression, allow);
		}
	}


	/** 
	 * Sets the default result that will be returned if no matching
	 * regular expression was found.
	 * @param defaultResult the default result
	 */
	public void setDefaultResult(boolean defaultResult) {
		this.defaultResult = defaultResult;
	}

	/** 
	 * Gets the default result that will be returned if no matching
	 * regular expression was found.
	 * @return the default result
	 */
	public boolean getDefaultResult() {
		return defaultResult;
	}

	/**
	 * Gets the list of rules. The returned vector is the internal
	 * list itself, not a copy.
	 * @return a vector of RegExpRule objects
	 */
	public Vector<RegExpRule> getRules() {
		return rules;
	}

	/**
	 * Sets the list of rules. The given vector is stored as is, 
	 * it is not copied.
	 * @param rules a vector of RegExpRule objects; <code>null</code>
	 * is treated as an empty rule set
	 */
	public void setRules(Vector<RegExpRule> rules) {
		// never keep a null list: addRule and the check methods
		// dereference it without a null test
		if (rules == null) {
			this.rules = new Vector<RegExpRule>();
		} else {
			this.rules = rules;
		}
	}


	/**
	 * Appends an allow or deny rule to the end of the rule list.
	 * Only the pattern and the allow flag of the new rule are set 
	 * here; its process-allowed setting keeps whatever value a 
	 * freshly created RegExpRule has.
	 * @param regExp a String containing the regular expression
	 * @param allow allow (TRUE) or deny (FALSE)
	 * @throws RESyntaxException passed on from 
	 * <code>RegExpRule.setPattern</code>
	 */
	public void addRule(String regExp, boolean allow) 
	throws RESyntaxException
	{
		RegExpRule rule = new RegExpRule();
		rule.setPattern(regExp);
		rule.setAllow(allow);
		rules.add(rule);
	}


	/** 
	 * Checks if a given URL is allowed or denied by the rules.
	 *
	 * @param u the URL to check; its string form is matched 
	 * against the rules
	 * @return the allow flag of the first matching rule (true for
	 * an "allow" rule, false for a "deny" rule), or the default 
	 * value if no rule matched
	 * @see #setDefaultResult(boolean)
	 */
	public boolean checkURL(URL u) {
		RegExpRule rule = findFirstMatch(u.toString());
		return (rule != null) ? rule.getAllow() : defaultResult;
	}

	/** 
	 * Checks if a given URL is allowed or denied by the rules 
	 * for processing.
	 *
	 * @param u the URL to check; its string form is matched 
	 * against the rules
	 * @return the process-allowed setting of the first matching
	 * rule, or the default value if no rule matched
	 * @see #setDefaultResult(boolean)
	 */
	public boolean checkURLForProcessing(URL u) {
		RegExpRule rule = findFirstMatch(u.toString());
		return (rule != null) ? rule.getProcessAllowed() : defaultResult;
	}

	/**
	 * Finds the first rule, in list order, that matches the given
	 * URL string.
	 *
	 * @param urlStr the string form of the URL
	 * @return the first matching rule, or <code>null</code> if no
	 * rule matches
	 */
	private RegExpRule findFirstMatch(String urlStr) {
		// index-based on purpose: an iterator over the Vector would
		// fail if rules were added while a check is running
		for (int i=0; i<rules.size(); i++) {
			RegExpRule rule = rules.elementAt(i);

			if (rule.match(urlStr)) {
				return rule;
			}
		}

		return null;
	}

} // RegExpURLCheck

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -