// PageDownload.java — worker thread for a multithreaded web crawler
// (originally shared online as "java下的 多线程爬虫 输入线程数目" /
//  "multithreaded crawler in Java with configurable thread count").
package crawler;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import crawler.UrlDatabase;

/**
 * One crawler worker: repeatedly pulls a URL from the shared
 * {@code UrlDatabase}, downloads the page, caches it under {@code data/},
 * extracts its links and feeds them back into the database. Several
 * instances are run concurrently, one per thread.
 */
public class PageDownload implements Runnable {

	String startUrl;               // URL currently being processed by this worker
	String searchString;           // search term (English); not used by this class
	boolean caseSensitive = false; // whether the search is case-sensitive
	boolean limitHost = true;      // whether to restrict crawling to the start host
	UrlDatabase database;          // shared URL frontier (assumed thread-safe — see UrlDatabase)
	boolean limitField = true;     // whether to filter links by limitFields substrings
	int interval = 2000;           // politeness delay between page fetches, in ms
	ArrayList<String> limitFields; // substrings a link must contain to be followed

	/**
	 * Creates a worker bound to a shared URL database.
	 *
	 * @param database shared URL frontier this worker pulls from and feeds
	 */
	public PageDownload(UrlDatabase database) {
		this.database = database;
		limitFields = new ArrayList<String>();
	}

	/** Creates a worker with no database; call {@link #setDatabase} before running. */
	public PageDownload() {
		limitFields = new ArrayList<String>();
	}

	/** Sets the shared URL database this worker operates on. */
	public void setDatabase(UrlDatabase database)
	{
		this.database = database;
	}

	/** Sets the delay between consecutive page fetches, in milliseconds. */
	public void setInterval(int interval) {
		this.interval = interval;
	}

	/** Adds substrings that links must contain to be crawled (domain filter). */
	public void setFields(ArrayList<String> fields) {
		limitFields.addAll(fields);
	}

	/** Thread entry point: runs the crawl loop until the frontier is drained. */
	public void run() {
		download(limitHost, caseSensitive);
	}

	/**
	 * Validates a candidate URL string.
	 *
	 * @param url candidate URL text
	 * @return a parsed {@link URL}, or {@code null} if it is not a valid HTTP(S) URL
	 */
	public URL verifyUrl(String url) {
		// Accept only HTTP(S) URLs. The original condition also consulted
		// startUrl, which could NPE before the first page was processed and
		// made the scheme check depend on unrelated state.
		String lower = url.toLowerCase();
		if (!lower.startsWith("http://") && !lower.startsWith("https://"))
			return null;
		try {
			return new URL(url);
		} catch (Exception e) {
			return null; // malformed URL
		}
	}

	/**
	 * Downloads the page at the given URL into a single string.
	 * Lines are concatenated without separators, matching the original
	 * behavior (the link regex does not depend on line breaks).
	 *
	 * @param pageUrl URL to fetch
	 * @return page contents, or {@code null} on any failure
	 */
	private String downloadPage(URL pageUrl) {
		// try-with-resources guarantees the stream is closed even on error;
		// the original leaked the reader whenever an exception was thrown.
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(
				pageUrl.openStream(), StandardCharsets.UTF_8))) {
			StringBuilder pageBuffer = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				pageBuffer.append(line);
			}
			return pageBuffer.toString();
		} catch (Exception e) {
			// Best-effort fetch: report the failure instead of silently
			// swallowing it as the original did.
			System.err.println("Failed to download " + pageUrl + ": " + e);
			return null;
		}
	}

	/**
	 * Strips a leading "www." from a URL's host so the database treats
	 * host variants as the same URL.
	 */
	private String removeWwwFromUrl(String url) {
		int index = url.indexOf("://www.");
		if (index != -1) {
			return url.substring(0, index + 3) + url.substring(index + 7);
		}
		return url;
	}

	/**
	 * Parses page HTML and collects the crawlable links it contains.
	 * Relative links are resolved against {@code pageUrl}; fragment-only,
	 * mailto: and javascript: links are skipped.
	 *
	 * @param pageUrl      URL the page was fetched from (base for relative links)
	 * @param pageContents raw HTML of the page
	 * @param limitHost    currently unused; retained for interface compatibility
	 * @return absolute, verified links found on the page
	 */
	public ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
			 boolean limitHost) {
		// Match <a href="..."> values. The original class ["|>] treated '|'
		// as a literal and did not stop at whitespace for unquoted hrefs;
		// [">\s] is the intended terminator set.
		Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">\\s]",
				Pattern.CASE_INSENSITIVE);
		Matcher m = p.matcher(pageContents);

		ArrayList<String> linkList = new ArrayList<String>();
		while (m.find()) {
			String link = m.group(1).trim();
			if (link.length() < 1) {
				continue;
			}
			// Skip in-page anchors, mail links and script pseudo-links.
			if (link.charAt(0) == '#'
					|| link.indexOf("mailto:") != -1
					|| link.toLowerCase().indexOf("javascript") != -1) {
				continue;
			}

			if (link.indexOf("://") == -1) {
				link = resolveRelative(pageUrl, link); // make the link absolute
			}

			// Drop any fragment. (This also made the original's later
			// indexOf("#fr=qrl") check dead code, so it has been removed.)
			int index = link.indexOf('#');
			if (index != -1) {
				link = link.substring(0, index);
			}

			link = removeWwwFromUrl(link);

			if (verifyUrl(link) == null) {
				continue;
			}
			if (!matchesLimitFields(link)) {
				continue; // outside the configured crawl domains
			}
			linkList.add(link);
		}
		return linkList;
	}

	/** Resolves a scheme-less link against the page it appeared on. */
	private String resolveRelative(URL pageUrl, String link) {
		int port = pageUrl.getPort();
		String authority = pageUrl.getHost() + (port > 0 ? ":" + port : "");
		if (link.charAt(0) == '/') {
			// Site-absolute path.
			return "http://" + authority + link;
		}
		String file = pageUrl.getFile();
		if (file.indexOf('/') == -1) {
			// Page at the site root: resolve relative to "/".
			return "http://" + authority + "/" + link;
		}
		// Resolve relative to the page's directory.
		String path = file.substring(0, file.lastIndexOf('/') + 1);
		return "http://" + authority + path + link;
	}

	/**
	 * True when no field filter is active or the link contains at least one
	 * of the configured substrings.
	 */
	private boolean matchesLimitFields(String link) {
		if (!limitField || limitFields.isEmpty()) {
			return true;
		}
		for (String field : limitFields) {
			if (link.indexOf(field) >= 0) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Maps a URL to a cache file name under the "data" directory.
	 * Characters illegal in file names are substituted reversibly:
	 * '?' -&gt; '#', '/' -&gt; '%', ':' -&gt; '$'.
	 *
	 * @param url page URL
	 * @return relative path of the cache file for the page
	 */
	public String urlToFileName(String url) {
		// File.separator keeps the path portable; the original hard-coded
		// the Windows separator "\\".
		String sanitized = url.replace('?', '#').replace('/', '%').replace(':', '$');
		return "data" + File.separator + sanitized + ".html";
	}

	/**
	 * Main crawl loop. Terminates when the frontier is drained (getUrl()
	 * returns null and getNum() reports no URLs outstanding — TODO confirm
	 * against UrlDatabase) or when the thread is interrupted.
	 *
	 * @param limithost     unused here; retained for interface compatibility
	 * @param caseSensitive unused here; retained for interface compatibility
	 * @return the links extracted from the last successfully parsed page
	 */
	public ArrayList<String> download(boolean limithost, boolean caseSensitive) {
		ArrayList<String> links = new ArrayList<String>();
		while (true)
		{
			String url = database.getUrl();
			if (url == null && database.getNum() == 0)
			{
				break; // frontier drained: stop this worker
			}
			else if (url == null)
			{
				// Frontier momentarily empty: random short backoff, then retry.
				try {
					Thread.sleep((int) (Math.random() * 1000));
				}
				catch (InterruptedException e) {
					Thread.currentThread().interrupt(); // restore interrupt status
					break;
				}
				continue;
			}
			startUrl = removeWwwFromUrl(url);

			URL verifiedUrl = verifyUrl(startUrl);
			if (verifiedUrl == null) {
				continue; // invalid URL; the original passed null to downloadPage
			}

			String pageContents = downloadPage(verifiedUrl);
			System.out.println(url);

			// Reset per iteration: the original reused the previous page's
			// links after a failed download, re-adding them to the database.
			links = new ArrayList<String>();
			if (pageContents != null && pageContents.length() > 0) {
				savePage(url, pageContents);
				links = retrieveLinks(verifiedUrl, pageContents, limitHost);
			}
			database.addUrls(links);

			// Politeness delay between fetches.
			try {
				Thread.sleep(interval);
			}
			catch (InterruptedException e) {
				Thread.currentThread().interrupt(); // restore interrupt status
				break;
			}
		}
		return links;
	}

	/** Writes page contents to its cache file, creating data/ if needed. */
	private void savePage(String url, String pageContents) {
		File myfile = new File(urlToFileName(url));
		File parent = myfile.getParentFile();
		if (parent != null) {
			parent.mkdirs(); // the original never created the data directory
		}
		try (FileOutputStream out = new FileOutputStream(myfile)) {
			out.write(pageContents.getBytes(StandardCharsets.UTF_8));
		}
		catch (IOException e)
		{
			// The original swallowed this silently; at least report it.
			System.err.println("Failed to save " + myfile + ": " + e);
		}
	}
}
