📄 testspider.java

📁 爬取网站信息
💻 JAVA
字号:
package test;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.*;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

import domain.Code;

/**
 * Small crawler that collects postal-code detail pages from
 * youbian.baidu.com and stores what it finds through {@code TableDao}.
 *
 * <p>Flow, as driven by {@link #main(String[])}:
 * <ol>
 * <li>{@link #getAeraArray()} reads the area names from a text file;</li>
 * <li>{@link #getAllLinks(String)} runs one search per area name and adds
 * every detail-page URL it finds to {@link #set};</li>
 * <li>{@link #dealSet()} downloads every collected page, extracts the postal
 * code, area name and street cells, asks a geocoding URL for coordinates and
 * inserts the result via {@code TableDao}.</li>
 * </ol>
 *
 * <p>All state is static and nothing here is synchronized.
 */
public class TestSpider {
	// Every detail-page URL collected so far; filled by getAllLinks(), consumed by dealSet().
	public static Set<String> set = new HashSet<String>();
	// Relative link to a detail page: a slash, six digits and ".html".
	public static Pattern pHtml = Pattern.compile("/{1}[0-9]{6}\\.html");
	// The <TITLE> element of a detail page (single line, upper-case tag).
	public static Pattern pTitle = Pattern.compile("<TITLE>.+</TITLE>");//175
	// The three-digit "code" status field of the geocoder response.
	public static Pattern pExist = Pattern.compile("\"code\": [0-9]{3}");//191
	// An unsigned decimal with exactly seven fraction digits (a coordinate value).
	public static Pattern pl = Pattern.compile("\\d{1,3}\\.\\d{7}");//197
	// A table cell of width 190; dealSet() stores its text as a street.
	// NOTE(review): the '+' signs are regex quantifiers, not literal characters, and
	// ".+" is greedy, so several cells on one line yield a single match - confirm
	// against the real page markup before tightening this expression.
	public static Pattern pTd = Pattern.compile("<td width=\"+190+\">.+</td>");

	// Scheme and host every relative link is resolved against.
	private static final String BASE_URL = "http://youbian.baidu.com";
	// Markup that immediately follows the "next page" anchor's opening tag.
	private static final String NEXT_PAGE_MARKER = "><font size=3>下一页<";
	// Start of the relative URL used by paging links.
	private static final String NEXT_PAGE_HREF_PREFIX = "s?lm=0&si=youbian.baidu.com";
	// Length of the literal "<TITLE>" that precedes the postal code in a title match.
	private static final int TITLE_TAG_LENGTH = 7;
	// Length of the literal "<td width=\"190\">" that precedes the street text.
	private static final int TD_TAG_LENGTH = 16;
	// Number of coordinate values read from one geocoder response.
	private static final int COORDINATE_COUNT = 6;

	/**
	 * Runs the whole crawl: one search per area name, then processing and
	 * storing of every collected detail page.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		System.out.println("入库开始......");
		String[] strArray = getAeraArray();
		// Each area name is used as the search keyword of one query.
		for (int i = 0; i < strArray.length; i++) {
			if (strArray[i].trim().length() == 0) {
				// An empty file or a doubled separator yields an empty name;
				// searching for nothing would only waste a request.
				continue;
			}
			// The single-argument encode() uses the platform default charset.
			// NOTE(review): kept on purpose because the charset the search
			// endpoint expects is not visible here - confirm before pinning one.
			String u = URLEncoder.encode(strArray[i]);
			u = BASE_URL + "/s?word=" + u
					+ "&tn=baiduyb&ct=2097152&cl=0&si=youbian.baidu.com";
			getAllLinks(u);
		}
		// Process the links only after all searches have filled the set.
		dealSet();
		System.out.println("入库结束");
	}

	/**
	 * Reads the area names from {@code d://aera.txt}.
	 *
	 * <p>All lines of the file are concatenated without a separator and the
	 * result is split on the full-width semicolon {@code "；"}.
	 *
	 * @return the area names; an empty array (never {@code null}) when the
	 *         file is missing or cannot be read
	 */
	public static String[] getAeraArray() {
		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader("d://aera.txt"));
			StringBuilder sb = new StringBuilder();
			String str;
			while ((str = br.readLine()) != null) {
				sb.append(str);
			}
			return sb.toString().split("；");
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close in finally so a failed read does not leak the file handle.
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		// Returning an empty array lets callers loop without a null check.
		return new String[0];
	}

	/**
	 * Downloads a search result page and all of its follow-up pages, adding
	 * every detail-page link found on them to {@link #set}.
	 *
	 * <p>Paging stops when a page has no "next page" marker, when the paging
	 * link cannot be located in front of the marker, or when the paging link
	 * points to a page that was already requested during this call.
	 *
	 * @param HTTPURL absolute URL of the first result page
	 */
	public static void getAllLinks(String HTTPURL) {
		long timer = System.currentTimeMillis();

		// Pages already requested in this call; protects against paging loops.
		Set<String> requestedPages = new HashSet<String>();
		requestedPages.add(HTTPURL);

		String pageStr = getPageStr(HTTPURL);
		collectLinks(pageStr);

		int nextPageIndex = pageStr.indexOf(NEXT_PAGE_MARKER);
		while (nextPageIndex != -1) {
			// The paging URL is the last href prefix in front of the marker
			// and runs up to the marker itself.
			String beforeMarker = pageStr.substring(0, nextPageIndex);
			int lastHref = beforeMarker.lastIndexOf(NEXT_PAGE_HREF_PREFIX);
			if (lastHref == -1) {
				// Marker without a recognisable link: nothing to follow.
				break;
			}
			String nextPageURL = BASE_URL + "/" + beforeMarker.substring(lastHref);
			if (!requestedPages.add(nextPageURL)) {
				break;
			}
			pageStr = getPageStr(nextPageURL);
			collectLinks(pageStr);
			nextPageIndex = pageStr.indexOf(NEXT_PAGE_MARKER);
		}

		long outTime = System.currentTimeMillis();
		System.out.println("得到所有链接时间：" + (outTime - timer)/1000.0 + "s");
	}

	/** Adds the absolute URL of every detail-page link in {@code pageStr} to {@link #set}. */
	private static void collectLinks(String pageStr) {
		Matcher m = pHtml.matcher(pageStr);
		while (m.find()) {
			set.add(BASE_URL + m.group());
		}
	}

	/**
	 * Fetches a URL with an HTTP GET and returns the response body as text.
	 *
	 * <p>The body is decoded with the platform default charset. A non-200
	 * status is reported on {@code System.err} but the body is still returned.
	 *
	 * @param URL absolute URL to fetch
	 * @return the response body, or an empty string when the URL is rejected,
	 *         the request fails or the response has no body
	 */
	public static String getPageStr(String URL) {
		String pageStr = "";
		HttpClient httpClient = new HttpClient();
		GetMethod getMethod;
		try {
			getMethod = new GetMethod(URL);
		} catch (IllegalArgumentException e) {
			// Thrown for a URI the library cannot parse; URLs built from page
			// content must not be able to abort the whole crawl.
			System.out.println("Please check your provided http address!");
			e.printStackTrace();
			return pageStr;
		} catch (IllegalStateException e) {
			// Thrown when the protocol of the URI is not recognised.
			System.out.println("Please check your provided http address!");
			e.printStackTrace();
			return pageStr;
		}
		// Use the library's default retry policy.
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
				new DefaultHttpMethodRetryHandler());
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: "
						+ getMethod.getStatusLine());
			}
			byte[] responseBody = getMethod.getResponseBody();
			// getResponseBody() yields null when no body is available.
			if (responseBody != null) {
				pageStr = new String(responseBody);
			}
		} catch (HttpException e) {
			// Fatal protocol problem or unusable response.
			System.out.println("Please check your provided http address!");
			e.printStackTrace();
		} catch (IOException e) {
			// Network failure.
			e.printStackTrace();
		} finally {
			getMethod.releaseConnection();
		}
		return pageStr;
	}

	/**
	 * Downloads every URL in {@link #set} and stores its content.
	 *
	 * <p>For each {@code <TITLE>} match on a page, the text between
	 * {@code <TITLE>} and the first space is parsed as the postal code and the
	 * third space-separated token is taken as the area name. A code record is
	 * inserted (with coordinates when the geocoder supplies them), its id is
	 * looked up, and every street cell of the page is inserted under that id.
	 * Titles that do not have this shape are reported and skipped.
	 */
	public static void dealSet() {
		TableDao td = new TableDao();
		for (String URLString : set) {
			String pageStr = getPageStr(URLString);
			Matcher titleMatcher = pTitle.matcher(pageStr);
			while (titleMatcher.find()) {
				String[] ss = titleMatcher.group().split(" ");
				if (ss.length < 3) {
					System.err.println("Unexpected title on " + URLString
							+ ": " + titleMatcher.group());
					continue;
				}
				String code = ss[0].substring(TITLE_TAG_LENGTH);
				Integer codeNumber;
				try {
					codeNumber = Integer.valueOf(code);
				} catch (NumberFormatException e) {
					System.err.println("Unexpected postal code on " + URLString
							+ ": " + code);
					continue;
				}
				String areaName = ss[2];

				Code codeEntity = new Code();
				codeEntity.setCodenumber(codeNumber);
				codeEntity.setAeraName(areaName);

				// Query with the code text, not the parsed number, so a
				// leading zero is not lost in the lookup.
				double[] coordinates = findCoordinates(code);
				if (coordinates != null) {
					// Values are taken in the order they appear in the response.
					// NOTE(review): the mapping below mirrors the original
					// code; the response layout is not visible here.
					codeEntity.setNorth(coordinates[0]);
					codeEntity.setSouth(coordinates[1]);
					codeEntity.setEast(coordinates[2]);
					codeEntity.setWest(coordinates[3]);
					codeEntity.setLongitude(coordinates[4]);
					codeEntity.setLatitude(coordinates[5]);
				}

				td.InsertCodeTable(codeEntity);
				// Look up the id of the record just written for the street rows.
				Long codeId = td.findCodeId(codeNumber);
				insertStreets(td, pageStr, codeId);
			}
		}
	}

	/**
	 * Asks the geocoding URL for the given postal code and extracts the first
	 * six coordinate values of the response.
	 *
	 * @return six values in response order, or {@code null} when the response
	 *         does not report {@code "code": 200} or holds fewer than six
	 *         coordinate values
	 */
	private static double[] findCoordinates(String code) {
		// NOTE(review): the API key is hard-coded in the request URL; consider
		// moving it to configuration.
		String locationURL = "http://maps.google.com/maps/geo?output=json&oe=utf-8&q="+code+"&key=ABQIAAAACbx_JGnqV-1PDzVrHj3XdhTABvwwz049_81qKrWEm99Y-pOq9hS_bCg81mYizPwn91I_OJ22BR0HcQ&callback=_xdc_._7fu2j59h8";
		String locationPageStr = getPageStr(locationURL);

		Matcher mExist = pExist.matcher(locationPageStr);
		// group() may only be called after a successful find().
		if (!mExist.find()
				|| !mExist.group().equalsIgnoreCase("\"code\": 200")) {
			return null;
		}

		Matcher ml = pl.matcher(locationPageStr);
		double[] values = new double[COORDINATE_COUNT];
		int count = 0;
		while (count < COORDINATE_COUNT && ml.find()) {
			values[count] = Double.parseDouble(ml.group());
			count++;
		}
		// A partial set of values cannot be mapped to the six fields.
		return count == COORDINATE_COUNT ? values : null;
	}

	/** Inserts the trimmed text of every street cell in {@code pageStr} under {@code codeId}. */
	private static void insertStreets(TableDao td, String pageStr, Long codeId) {
		Matcher m = pTd.matcher(pageStr);
		while (m.find()) {
			String street = m.group();
			// Cut at the first closing tag because the match itself is greedy.
			int endIndex = street.indexOf("</td>");
			street = street.substring(TD_TAG_LENGTH, endIndex).trim();
			td.InsertStreet(street, codeId);
		}
	}
}
💿 文件大小 38 K
👤 上传用户 karon9999
📂 所属分类 Java编程
🏷️ 相关标签

#网站
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -