⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.java

📁 爬取网站信息
💻 JAVA
字号:
package test;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.*;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

import domain.Code;
/**
 * Reads province/city names from a text file, searches each one on
 * youbian.baidu.com, and stores the URL of every postal-code page found in
 * the result listings into the database (via {@code TableDao}).
 *
 * <p>All state is held in static fields and the class is driven from
 * {@link #main(String[])}; nothing here is synchronized.
 *
 * @author eatsun
 */
public class Spider {
	/** Prefix put in front of every relative link scraped from a result page. */
	private static final String BASE_URL = "http://youbian.baidu.com";

	/** Text that appears on a result page when a "next page" link exists. */
	private static final String NEXT_PAGE_MARKER = "><font size=3>下一页<";

	/** Start of the relative href used by the "next page" link. */
	private static final String NEXT_PAGE_HREF_PREFIX = "s?lm=0&si=youbian.baidu.com";

	/** Every postal-code page URL collected so far; filled by getAllLinks, read by dealSet. */
	public static Set<String> set = new HashSet<String>();

	/** Matches a relative postal-code page link: a slash, six digits, then ".html". */
	public static Pattern pHtml = Pattern.compile("/{1}[0-9]{6}\\.html");
	// The trailing numbers below (175, 191, 197) are the original author's
	// annotations; their meaning is not evident from this file. None of the
	// following four patterns is used inside this class.
	/** Matches an upper-case TITLE element on a single line. */
	public static Pattern pTitle = Pattern.compile("<TITLE>.+</TITLE>");//175
	/** Matches the text {@code "code": } followed by three digits. */
	public static Pattern pExist = Pattern.compile("\"code\": [0-9]{3}");//191
	/** Matches one to three digits, a dot, then seven digits. */
	public static Pattern pl = Pattern.compile("\\d{1,3}\\.\\d{7}");//197
	/**
	 * NOTE(review): the '+' characters look like leftovers from string
	 * concatenation. As a regex they are quantifiers, so the pattern still
	 * matches {@code <td width="190">...</td>}; left unchanged because other
	 * classes may rely on it.
	 */
	public static Pattern pTd = Pattern.compile("<td width=\"+190+\">.+</td>");
	//public static long timer = 0;

	/**
	 * Entry point: reads the area names, collects the postal-code links for
	 * each of them, then writes the collected links to the database.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		//timer=System.currentTimeMillis();
		System.out.println("入库开始......");
		// Area names come from the text file.
		String[] strArray = getAeraArray();
		if (strArray.length == 0) {
			// Nothing could be read; there is nothing to crawl or store.
			System.err.println("No area names could be read from d://aera.txt; nothing to do.");
			return;
		}
		// Search each area name, as if it had been typed into the site's search box.
		for (int i = 0; i < strArray.length; i++) {
			String area = strArray[i].trim();
			if (area.length() == 0) {
				// Blank entries (empty file, doubled or trailing ';') would
				// only produce a pointless request with an empty keyword.
				continue;
			}
			// NOTE(review): the one-argument encode() is deprecated and uses
			// the platform default encoding. It is kept because the encoding
			// the remote site expects cannot be determined from this file.
			String u = URLEncoder.encode(area);
			u = "http://youbian.baidu.com/s?word=" + u
					+ "&tn=baiduyb&ct=2097152&cl=0&si=youbian.baidu.com";
			getAllLinks(u);
		}
		// Everything has been gathered into the set; now persist it.
		dealSet();
		System.out.println("入库结束");
	}

	/**
	 * Reads {@code d://aera.txt}, concatenates all of its lines and splits the
	 * result on ';'.
	 *
	 * <p>The file is decoded with the platform default charset.
	 *
	 * @return the split entries, or an empty array when the file is missing or
	 *         cannot be read (the failure is printed to standard error);
	 *         never {@code null}
	 */
	public static String[] getAeraArray() {
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader("d://aera.txt"));
			StringBuilder content = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				// Lines are joined without a separator; ';' is the only delimiter.
				content.append(line);
			}
			return content.toString().split(";");
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close on every path so a failed read does not leak the file handle.
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// The content (if any) has already been read; a close
					// failure leaves nothing useful to do.
				}
			}
		}
		return new String[0];
	}

	/**
	 * Downloads the result page at {@code HTTPURL}, follows its "next page"
	 * links until there are none left, and adds every postal-code page link
	 * found on those pages to {@link #set}. For example, after searching
	 * "北京市" the set holds strings such as
	 * {@code http://youbian.baidu.com/100076.html}.
	 *
	 * <p>Prints the elapsed time to standard output when finished.
	 *
	 * @param HTTPURL absolute URL of the first result page
	 */
	public static void getAllLinks(String HTTPURL) {
		long timer = System.currentTimeMillis();

		// Remember which result pages were fetched so that a "next page" link
		// pointing back at an earlier page cannot loop forever.
		Set<String> visitedPages = new HashSet<String>();
		visitedPages.add(HTTPURL);

		String pageStr = getPageStr(HTTPURL);
		collectLinks(pageStr);

		int nextPageIndex = pageStr.indexOf(NEXT_PAGE_MARKER);
		while (nextPageIndex != -1) {
			// The href of the "next page" link is the last matching prefix
			// that occurs before the marker.
			String beforeMarker = pageStr.substring(0, nextPageIndex);
			int lastHref = beforeMarker.lastIndexOf(NEXT_PAGE_HREF_PREFIX);
			if (lastHref == -1) {
				// Marker present but no recognizable href: substring(-1)
				// would throw, so stop paging instead.
				System.err.println("Next-page marker found but no next-page link; stopping at this page.");
				break;
			}
			String nextPageUrl = BASE_URL + "/" + beforeMarker.substring(lastHref);
			if (!visitedPages.add(nextPageUrl)) {
				break;
			}
			// Fetch the next page and harvest its links, then check whether
			// that page has a "next page" link of its own.
			pageStr = getPageStr(nextPageUrl);
			collectLinks(pageStr);
			nextPageIndex = pageStr.indexOf(NEXT_PAGE_MARKER);
		}

		long outTime = System.currentTimeMillis();
		System.out.println("得到所有链接时间:" + (outTime - timer)/1000.0 + "s");
	}

	/** Adds the absolute form of every pHtml match in {@code page} to the set. */
	private static void collectLinks(String page) {
		Matcher m = pHtml.matcher(page);
		while (m.find()) {
			set.add(BASE_URL + m.group());
		}
	}

	/**
	 * Performs an HTTP GET on {@code URL} and returns the response body as a
	 * string decoded with the platform default charset.
	 *
	 * <p>A non-200 status is reported on standard error but the body is still
	 * returned. Any exception terminates the JVM via {@code System.exit(0)}.
	 * NOTE(review): exit status 0 is used even though these are failures; it
	 * is kept because wrapper scripts may depend on it.
	 *
	 * @param URL absolute URL to fetch
	 * @return the page content, or an empty string when the response has no body
	 */
	public static String getPageStr(String URL) {
		String pageStr = "";
		HttpClient httpClient = new HttpClient();
		GetMethod getMethod = new GetMethod(URL);
		// Use the library's default retry handler.
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
				new DefaultHttpMethodRetryHandler());
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: "
						+ getMethod.getStatusLine());
			}
			byte[] responseBody = getMethod.getResponseBody();
			// getResponseBody() may return null when no body is available;
			// treat that as an empty page instead of failing with an NPE.
			if (responseBody != null) {
				pageStr = new String(responseBody);
			}
		} catch (SocketException e) {
			// The message tells the operator that the connection was closed,
			// the collected set is lost, and the province must be retried.
			System.out.println("百度连接关闭!!!set集合将被销毁,重试本省!");
			System.exit(0);
		} catch (HttpException e) {
			// Fatal protocol-level problem, or a malformed response.
			System.out.println("Please check your provided http address!");
			e.printStackTrace();
			System.exit(0);
		} catch (IOException e) {
			// Network-level failure.
			e.printStackTrace();
			System.exit(0);
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(0);
		}
		finally {
			// Always hand the connection back.
			getMethod.releaseConnection();
		}
		return pageStr;
	}

	/**
	 * Writes every URL currently held in {@link #set} to the database by
	 * calling {@code TableDao.insertURL} once per entry.
	 */
	public static void dealSet() {
		TableDao td = new TableDao();
		//String outputFile = "D:/pro/" + u + ".txt";
		for (String urlString : set) {
			td.insertURL(urlString);
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -