📄 spiderdetail.java

📁 爬取网站信息
💻 JAVA
字号:
package test;

import java.io.IOException;
import java.net.SocketException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

import domain.Code;

/**
 * Takes the page URLs stored in the database one at a time, downloads each
 * page, parses it and writes the postal code, area name and streets found on
 * it back to the database.
 * <p>
 * According to the original author's notes the crawl is resumable: rows still
 * left in URL_BAIDU mean an earlier run was interrupted, and running the
 * program again continues from there.
 * <p>
 * Problems known to occur while crawling Baidu: (1) out of memory, (2) Baidu
 * closing the connection, (3) "no respond" errors. For the last two, wait a
 * few minutes and start the program again.
 *
 * @author eatsun
 */
public class SpiderDetail {
	/** Exit status used when a download fails, so callers can tell the run was aborted. */
	private static final int EXIT_FAILURE = 1;
	/** Opening tag that precedes the postal code inside a matched title. */
	private static final String TITLE_OPEN = "<TITLE>";
	/** Opening tag of a table cell holding a street name. */
	private static final String CELL_OPEN = "<td width=\"190\">";
	/** Closing tag of a table cell. */
	private static final String CELL_CLOSE = "</td>";
	/** Status text the geocoder response must contain for its coordinates to be used. */
	private static final String STATUS_OK = "\"code\": 200";
	/** How many numbers are read from a geocoder response. */
	private static final int COORDINATE_COUNT = 6;
	private static final String GEOCODE_URL_PREFIX = "http://maps.google.com/maps/geo?output=json&oe=utf-8&q=";
	// NOTE(review): the API key is embedded in the source; consider moving it to configuration.
	private static final String GEOCODE_URL_SUFFIX = "&key=ABQIAAAACbx_JGnqV-1PDzVrHj3XdhTABvwwz049_81qKrWEm99Y-pOq9hS_bCg81mYizPwn91I_OJ22BR0HcQ&callback=_xdc_._7fu2j59h8";

	// NOTE(review): kept non-final in case other classes reassign this public field - confirm and tighten.
	public static TableDao td = new TableDao();
	public static final Pattern pHtml = Pattern.compile("/{1}[0-9]{6}\\.html");
	/** Title element of a page; parsed for the postal code and the area name. */
	public static final Pattern pTitle = Pattern.compile("<TITLE>.+</TITLE>");
	/** Status field of the geocoder response. */
	public static final Pattern pExist = Pattern.compile("\"code\": [0-9]{3}");
	/** A number with one to three integer digits followed by seven decimals. */
	public static final Pattern pl = Pattern.compile("\\d{1,3}\\.\\d{7}");
	/**
	 * Table cell holding a street name. The previous expression contained a
	 * stray "+190+" which, read as quantifiers, also accepted widths such as
	 * "1900"; the fixed-offset extraction below is only correct for the exact
	 * opening tag, so the pattern is now built from that tag.
	 */
	public static final Pattern pTd = Pattern.compile(CELL_OPEN + ".+" + CELL_CLOSE);

	public static void main(String[] args) {
		System.out.println("开始向数据库中逐条写入地区和街道，并删除已经使用过的URLString");
		findDetail();
		System.out.println("地区和街道插入数据完毕");
	}

	/**
	 * Downloads a page with an HTTP GET and returns its body as text.
	 * <p>
	 * A status other than 200 is reported on standard error but the body is
	 * still returned. Any exception during the request is reported and the
	 * JVM is terminated with a non-zero exit status.
	 *
	 * @param URL address of the page to download
	 * @return the response body decoded with the platform default charset
	 */
	public static String getPageStr(String URL) {
		String pageStr = "";
		HttpClient httpClient = new HttpClient();
		GetMethod getMethod = new GetMethod(URL);
		// Use the library's default retry strategy.
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
				new DefaultHttpMethodRetryHandler());
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: "
						+ getMethod.getStatusLine());
			}
			byte[] responseBody = getMethod.getResponseBody();
			if (responseBody == null) {
				// Previously this surfaced as an anonymous NullPointerException.
				throw new IOException("No response body received from " + URL);
			}
			// NOTE(review): decodes with the platform default charset; the
			// charset declared by the server is not consulted.
			pageStr = new String(responseBody);
		} catch (SocketException e) {
			System.out.println("百度连接关闭！！！等待一会儿再去连接!");
			System.exit(EXIT_FAILURE);
		} catch (HttpException e) {
			// Fatal protocol problem: wrong protocol or a malformed response.
			System.out.println("Please check your provided http address!");
			e.printStackTrace();
			System.exit(EXIT_FAILURE);
		} catch (IOException e) {
			// Network failure.
			e.printStackTrace();
			System.exit(EXIT_FAILURE);
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(EXIT_FAILURE);
		} finally {
			getMethod.releaseConnection();
		}
		return pageStr;
	}

	/**
	 * Processes stored URLs until {@code td.getMinIdURL()} returns
	 * {@code null}: downloads the page, stores every title found on it
	 * together with the page's streets, then calls {@code td.deleteURL()}.
	 */
	public static void findDetail() {
		for (String URLString = td.getMinIdURL(); URLString != null; URLString = td
				.getMinIdURL()) {
			String pageStr = getPageStr(URLString);
			// The original author expects exactly one title per page.
			Matcher titles = pTitle.matcher(pageStr);
			while (titles.find()) {
				storePage(titles.group(), pageStr);
			}
			td.deleteURL();
		}
	}

	/**
	 * Stores the postal code described by one matched title, with coordinates
	 * when the geocoder supplies them, followed by the streets of the page.
	 * A title that cannot be parsed is reported and skipped.
	 */
	private static void storePage(String title, String pageStr) {
		// Token 0 is "<TITLE>" followed by the code, token 2 is the area name.
		String[] parts = title.split(" ");
		if (parts.length < 3) {
			System.err.println("Skipping page, unexpected title: " + title);
			return;
		}
		Integer codeNumber;
		try {
			codeNumber = Integer.valueOf(parts[0].substring(TITLE_OPEN.length()));
		} catch (NumberFormatException e) {
			System.err.println("Skipping page, no numeric code in title: " + title);
			return;
		}

		double[] box = lookupCoordinates(codeNumber);

		Code codeEntity = new Code();
		codeEntity.setCodenumber(codeNumber);
		codeEntity.setAeraName(parts[2]);
		if (box != null) {
			// Order of the six numbers is taken over unchanged from the
			// original code - presumably the geocoder's layout; TODO confirm.
			codeEntity.setLatitude(box[5]);
			codeEntity.setLongitude(box[4]);
			codeEntity.setNorth(box[0]);
			codeEntity.setSouth(box[1]);
			codeEntity.setEast(box[2]);
			codeEntity.setWest(box[3]);
		}
		td.InsertCodeTable(codeEntity);

		// Look the freshly inserted row up again to obtain its id.
		Long codeId = td.findCodeId(codeNumber);
		insertStreets(pageStr, codeId);
	}

	/**
	 * Asks the geocoder for the coordinates of a postal code.
	 *
	 * @return six numbers in the order they appear in the response, or
	 *         {@code null} when the response does not report status 200 or
	 *         holds fewer than six matching numbers
	 */
	private static double[] lookupCoordinates(Integer codeNumber) {
		// NOTE(review): the code is sent as an Integer, so a leading zero in
		// the title is not part of the query - confirm this is intended.
		String locationPageStr = getPageStr(GEOCODE_URL_PREFIX + codeNumber
				+ GEOCODE_URL_SUFFIX);

		// A response without any status field used to raise
		// IllegalStateException from Matcher.group(); treat it as "not found".
		Matcher status = pExist.matcher(locationPageStr);
		if (!status.find() || !STATUS_OK.equalsIgnoreCase(status.group())) {
			return null;
		}

		Matcher numbers = pl.matcher(locationPageStr);
		double[] values = new double[COORDINATE_COUNT];
		for (int i = 0; i < COORDINATE_COUNT; i++) {
			if (!numbers.find()) {
				// Used to end in Double.parseDouble(null) and a crash.
				System.err.println("Incomplete coordinates for code " + codeNumber);
				return null;
			}
			values[i] = Double.parseDouble(numbers.group());
		}
		return values;
	}

	/** Inserts every street cell found on the page for the given code id. */
	private static void insertStreets(String pageStr, Long codeId) {
		Matcher cells = pTd.matcher(pageStr);
		while (cells.find()) {
			String cell = cells.group();
			// The match is greedy; cut at the first closing tag.
			int endIndex = cell.indexOf(CELL_CLOSE);
			String street = cell.substring(CELL_OPEN.length(), endIndex).trim();
			td.InsertStreet(street, codeId);
		}
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -