// File: testspider.java
// (Code-viewer page residue: "Font size" control label.)
package test;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.*;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import domain.Code;
/**
 * Command-line crawler that collects postal-code detail pages from youbian.baidu.com for a
 * list of area names, asks the Google Maps "geo" endpoint for six coordinate values per postal
 * code, and stores the postal code, its coordinates and its streets through {@code TableDao}.
 *
 * <p>All state is kept in static fields and nothing here is synchronized, so the class is
 * written for single-threaded use from {@link #main(String[])}.
 */
public class TestSpider {

    /** Detail-page URLs collected by {@link #getAllLinks(String)}; consumed by {@link #dealSet()}. */
    public static final Set<String> set = new HashSet<String>();

    /** Matches a relative detail-page link such as {@code /100076.html}. */
    public static final Pattern pHtml = Pattern.compile("/{1}[0-9]{6}\\.html");

    /** Matches the {@code <TITLE>} element of a detail page (single line). */
    public static final Pattern pTitle = Pattern.compile("<TITLE>.+</TITLE>");

    /** Matches the three-digit status field of the geocoding response. */
    public static final Pattern pExist = Pattern.compile("\"code\": [0-9]{3}");

    /**
     * Matches a decimal number with exactly seven fraction digits. A leading minus sign is not
     * part of the match, so a negative value would be read as positive.
     */
    public static final Pattern pl = Pattern.compile("\\d{1,3}\\.\\d{7}");

    /**
     * Matches one street cell. The quantifier is reluctant so that several cells on the same
     * line are matched one by one instead of being swallowed by a single greedy match.
     */
    public static final Pattern pTd = Pattern.compile("<td width=\"190\">.+?</td>");

    private static final String BASE_URL = "http://youbian.baidu.com";
    private static final String NEXT_PAGE_MARKER = "><font size=3>下一页<";
    private static final String NEXT_PAGE_HREF_PREFIX = "s?lm=0&si=youbian.baidu.com";
    private static final String AREA_FILE = "d://aera.txt";
    private static final String TITLE_OPEN = "<TITLE>";
    private static final String STREET_CELL_OPEN = "<td width=\"190\">";
    private static final String STREET_CELL_CLOSE = "</td>";
    private static final String GEOCODE_OK = "\"code\": 200";
    private static final int COORDINATE_COUNT = 6;
    private static final String GEOCODE_URL_PREFIX =
            "http://maps.google.com/maps/geo?output=json&oe=utf-8&q=";
    // NOTE(review): an API key is embedded in source; consider moving it to configuration.
    private static final String GEOCODE_URL_SUFFIX =
            "&key=ABQIAAAACbx_JGnqV-1PDzVrHj3XdhTABvwwz049_81qKrWEm99Y-pOq9hS_bCg81mYizPwn91I_OJ22BR0HcQ"
                    + "&callback=_xdc_._7fu2j59h8";

    /**
     * Reads the area names, collects the detail-page links for each of them and then imports
     * every collected page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("入库开始......");
        String[] areaNames = getAeraArray();
        for (int i = 0; i < areaNames.length; i++) {
            // An empty entry (empty file, or ";;" in the file) would only produce an empty search.
            if (areaNames[i].length() == 0) {
                continue;
            }
            // NOTE(review): the one-argument encode() is deprecated and uses the platform
            // default encoding; it is kept because the encoding the site expects is not
            // visible from this file.
            String query = URLEncoder.encode(areaNames[i]);
            getAllLinks(BASE_URL + "/s?word=" + query
                    + "&tn=baiduyb&ct=2097152&cl=0&si=youbian.baidu.com");
        }
        // Import only after every search result has been added to the set.
        dealSet();
        System.out.println("入库结束");
    }

    /**
     * Reads the area-name file and splits its content on {@code ';'}. Line breaks in the file
     * are dropped without inserting a separator.
     *
     * @return the area names, or an empty array when the file is missing or cannot be read
     *         (never {@code null})
     */
    public static String[] getAeraArray() {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(AREA_FILE));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString().split(";");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path, including a failed read.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return new String[0];
    }

    /**
     * Fetches a search-result page and every following "next page", adding each detail-page
     * link found to {@link #set}. Prints the elapsed time when done.
     *
     * @param HTTPURL absolute URL of the first search-result page
     */
    public static void getAllLinks(String HTTPURL) {
        long timer = System.currentTimeMillis();
        String pageStr = getPageStr(HTTPURL);
        while (true) {
            Matcher linkMatcher = pHtml.matcher(pageStr);
            while (linkMatcher.find()) {
                set.add(BASE_URL + linkMatcher.group());
            }
            int nextPageIndex = pageStr.indexOf(NEXT_PAGE_MARKER);
            if (nextPageIndex == -1) {
                break; // last page reached
            }
            // The next-page href is the last occurrence of the prefix before the marker.
            String beforeMarker = pageStr.substring(0, nextPageIndex);
            int hrefStart = beforeMarker.lastIndexOf(NEXT_PAGE_HREF_PREFIX);
            if (hrefStart == -1) {
                break; // marker present but no recognizable href: stop instead of failing
            }
            pageStr = getPageStr(BASE_URL + "/" + beforeMarker.substring(hrefStart));
        }
        long outTime = System.currentTimeMillis();
        System.out.println("得到所有链接时间:" + (outTime - timer) / 1000.0 + "s");
    }

    /**
     * Performs an HTTP GET and returns the response body decoded with the platform default
     * charset. A non-200 status is reported on {@code System.err} but the body is still read.
     *
     * @param URL absolute URL to fetch
     * @return the response body, or an empty string when the request fails or has no body
     */
    public static String getPageStr(String URL) {
        String pageStr = "";
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(URL);
        // Use the library's default retry policy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }
            byte[] responseBody = getMethod.getResponseBody();
            // A response without a body yields null; keep the empty-string default then.
            if (responseBody != null) {
                pageStr = new String(responseBody);
            }
        } catch (HttpException e) {
            // Fatal protocol problem, e.g. a bad address or a malformed response.
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            // Network failure.
            e.printStackTrace();
        } finally {
            getMethod.releaseConnection();
        }
        return pageStr;
    }

    /**
     * Fetches every URL in {@link #set} and imports the postal code described by each page
     * title found. A page whose title cannot be parsed is reported and skipped so that one
     * bad page does not abort the whole import.
     */
    public static void dealSet() {
        TableDao td = new TableDao();
        for (String pageUrl : set) {
            String pageStr = getPageStr(pageUrl);
            Matcher titleMatcher = pTitle.matcher(pageStr);
            while (titleMatcher.find()) {
                storePostalCode(td, pageUrl, pageStr, titleMatcher.group());
            }
        }
    }

    /** Parses one page title and stores the postal code, its coordinates and its streets. */
    private static void storePostalCode(TableDao td, String pageUrl, String pageStr,
            String title) {
        // The code is read from the first space-separated token (after "<TITLE>") and the
        // area name from the third token.
        String[] parts = title.split(" ");
        if (parts.length < 3) {
            System.err.println("Unexpected title format, skipping " + pageUrl + ": " + title);
            return;
        }
        String code = parts[0].substring(TITLE_OPEN.length());
        Integer codeNumber;
        try {
            codeNumber = Integer.valueOf(code);
        } catch (NumberFormatException e) {
            System.err.println("Postal code is not numeric, skipping " + pageUrl + ": " + title);
            return;
        }
        String areaName = parts[2];

        Code codeEntity = new Code();
        codeEntity.setCodenumber(codeNumber);
        codeEntity.setAeraName(areaName);
        // Query with the textual code: the Integer form drops leading zeros (010000 -> 10000).
        double[] values = lookUpCoordinates(code);
        if (values != null) {
            codeEntity.setLatitude(values[5]);
            codeEntity.setLongitude(values[4]);
            codeEntity.setNorth(values[0]);
            codeEntity.setSouth(values[1]);
            codeEntity.setEast(values[2]);
            codeEntity.setWest(values[3]);
        }
        td.InsertCodeTable(codeEntity);
        // Look up the id of the row just inserted so the streets can reference it.
        Long codeId = td.findCodeId(codeNumber);
        insertStreets(td, pageStr, codeId);
    }

    /**
     * Queries the geocoding endpoint for a postal code.
     *
     * @return the first six numbers of the response, read in the order north, south, east,
     *         west, longitude, latitude; or {@code null} when the response does not report
     *         status 200 or contains fewer than six numbers
     */
    private static double[] lookUpCoordinates(String code) {
        String response = getPageStr(GEOCODE_URL_PREFIX + code + GEOCODE_URL_SUFFIX);
        Matcher statusMatcher = pExist.matcher(response);
        // find() must succeed before group() may be called; an empty response has no status.
        if (!statusMatcher.find() || !statusMatcher.group().equalsIgnoreCase(GEOCODE_OK)) {
            return null;
        }
        Matcher numberMatcher = pl.matcher(response);
        double[] values = new double[COORDINATE_COUNT];
        int count = 0;
        while (count < COORDINATE_COUNT && numberMatcher.find()) {
            values[count++] = Double.parseDouble(numberMatcher.group());
        }
        return count == COORDINATE_COUNT ? values : null;
    }

    /** Inserts every street cell found in the page for the given postal-code id. */
    private static void insertStreets(TableDao td, String pageStr, Long codeId) {
        Matcher cellMatcher = pTd.matcher(pageStr);
        while (cellMatcher.find()) {
            String cell = cellMatcher.group();
            String street = cell
                    .substring(STREET_CELL_OPEN.length(), cell.indexOf(STREET_CELL_CLOSE))
                    .trim();
            td.InsertStreet(street, codeId);
        }
    }
}
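/*
 * Code-viewer page residue (not part of the program) - keyboard shortcuts:
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */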