⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 robotsfilter.java

📁 VHDL制作的ann的code
💻 JAVA
字号:
/*
 * Encog Neural Network and Bot Library for Java v1.x
 * http://www.heatonresearch.com/encog/
 * http://code.google.com/p/encog-java/
 * 
 * Copyright 2008, Heaton Research Inc., and individual contributors.
 * See the copyright.txt in the distribution for a full listing of 
 * individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.encog.bot.spider.filter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

/**
 * RobotsFilter: This filter causes the spider to skip URL's from a robots.txt
 * file.
 * 
 * The file is fetched over plain HTTP by {@link #newHost(String, String)} and
 * each "Disallow" rule that applies to this spider's user agent (or to "*") is
 * recorded as a path prefix. {@link #isExcluded(URL)} then performs a simple
 * prefix match against those paths.
 */
public class RobotsFilter implements SpiderFilter {

	/**
	 * The maximum length of a line. This class does not enforce the limit; the
	 * constant is kept because it is part of the public interface.
	 */
	public static final int MAXLINE = 80;

	/**
	 * The port used to fetch robots.txt. Kept explicit (rather than relying on
	 * the protocol default) so the resulting URL is the same as it always was.
	 */
	private static final int HTTP_PORT = 80;

	/**
	 * The character encoding used to decode robots.txt.
	 */
	private static final String ROBOTS_CHARSET = "UTF-8";

	/**
	 * The full URL of the robots.txt file.
	 */
	private URL robotURL;

	/**
	 * A list of URL's to exclude.
	 */
	private final List<String> exclude = new ArrayList<String>();

	/**
	 * Is the parser active? It can become inactive when parsing sections of the
	 * file for other user agents.
	 */
	private boolean active;

	/**
	 * True while the most recent directive was a "User-agent" line, i.e. the
	 * parser is still reading the header of a group. Consecutive "User-agent"
	 * lines share the rules that follow them.
	 */
	private boolean inAgentGroup;

	/**
	 * The user agent string we are to use, null for default.
	 */
	private String userAgent;

	/**
	 * Add the specified URL to the exclude list, ignoring duplicates.
	 * 
	 * @param str
	 *            The URL to add.
	 */
	private void add(final String str) {
		if (!this.exclude.contains(str)) {
			this.exclude.add(str);
		}
	}

	/**
	 * Returns a list of URL's to be excluded. This is the live internal list,
	 * not a copy.
	 * 
	 * @return A list of URL's to be excluded.
	 */
	public List<String> getExclude() {
		return this.exclude;
	}

	/**
	 * Returns the full URL of the robots.txt file.
	 * 
	 * @return The full URL of the robots.txt file, or null if
	 *         {@link #newHost(String, String)} has not been called yet.
	 */
	public URL getRobotFile() {
		return this.robotURL;
	}

	/**
	 * Check to see if the specified URL is to be excluded. A URL is excluded
	 * when its file part starts with any of the recorded disallowed paths.
	 * 
	 * @param url
	 *            The URL to be checked.
	 * @return Returns true if the URL should be excluded.
	 */
	public boolean isExcluded(final URL url) {
		final String file = url.getFile();
		for (final String str : this.exclude) {
			if (file.startsWith(str)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Called internally to process each line of the robots.txt file.
	 * 
	 * @param line
	 *            The line that was read in.
	 * @throws MalformedURLException
	 *             Thrown if a bad URL is found.
	 */
	private void loadLine(final String line) throws MalformedURLException {
		// Everything from '#' to the end of the line is a comment, whether
		// the '#' starts the line or follows a directive.
		final int hash = line.indexOf('#');
		final String str = (hash == -1 ? line : line.substring(0, hash))
				.trim();

		// Blank lines and lines without a "field: value" shape carry no rule.
		final int i = str.indexOf(':');
		if (i == -1) {
			return;
		}

		// Trim the field name so "Disallow : /x" is still recognised.
		final String command = str.substring(0, i).trim();
		final String rest = str.substring(i + 1).trim();

		if (command.equalsIgnoreCase("User-agent")) {
			final boolean matches = rest.equals("*")
					|| (this.userAgent != null && rest
							.equalsIgnoreCase(this.userAgent));
			// Consecutive User-agent lines form one group: the rules apply
			// if any of them matched. A User-agent line that follows rules
			// starts a fresh group.
			this.active = this.inAgentGroup ? (this.active || matches)
					: matches;
			this.inAgentGroup = true;
			return;
		}

		// Any other directive ends the group header.
		this.inAgentGroup = false;

		if (this.active && command.equalsIgnoreCase("disallow")
				&& rest.length() > 0) {
			// Resolve against the robots.txt URL so only the path is stored.
			final URL url = new URL(this.robotURL, rest);
			add(url.getFile());
		}
	}

	/**
	 * Called when a new host is to be processed. Hosts are processed one at a
	 * time. SpiderFilter classes can not be shared among hosts.
	 * 
	 * Any rules collected for a previous host are discarded before the new
	 * robots.txt is requested, so a failed download never leaves the previous
	 * host's rules in effect.
	 * 
	 * @param host
	 *            The new host.
	 * @param userAgent
	 *            The user agent being used by the spider. Leave null for
	 *            default.
	 * @throws IOException
	 *             Thrown if an I/O error occurs.
	 */
	public void newHost(final String host, final String userAgent)
			throws IOException {
		// Reset all per-host state up front.
		this.active = false;
		this.inAgentGroup = false;
		this.userAgent = userAgent;
		this.exclude.clear();

		this.robotURL = new URL("http", host, HTTP_PORT, "/robots.txt");
		final URLConnection http = this.robotURL.openConnection();
		if (userAgent != null) {
			http.setRequestProperty("User-Agent", userAgent);
		}

		final InputStream is = http.getInputStream();
		try {
			final BufferedReader r = new BufferedReader(
					new InputStreamReader(is, ROBOTS_CHARSET));
			String str;
			while ((str = r.readLine()) != null) {
				loadLine(str);
			}
		} finally {
			// Closing the underlying stream releases the whole reader chain.
			is.close();
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -