// File: spiderdetail.java
// (Residue from the web code viewer this file was copied from: font-size control label.)
package test;
import java.io.IOException;
import java.net.SocketException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import domain.Code;
/**
 * Takes pending URLs out of the database one at a time, downloads each page,
 * parses the page text and stores the postcode, area name, coordinates and
 * streets it finds.
 *
 * <p>If the URL table (URL_BAIDU) still holds rows after a run, the previous
 * run was interrupted; running the program again resumes from the remaining
 * rows.
 *
 * <p>Problems that may occur while crawling Baidu:
 * <ol>
 * <li>out of memory</li>
 * <li>Baidu closes the connection</li>
 * <li>"no respond" exceptions</li>
 * </ol>
 * For the last two, wait a few minutes and start the program again.
 *
 * @author eatsun
 */
public class SpiderDetail {
    /** Data-access object used for every database read and write in this class. */
    public static TableDao td = new TableDao();

    /** Matches a link of the form {@code /NNNNNN.html} (six digits). Not used inside this class. */
    public static Pattern pHtml = Pattern.compile("/{1}[0-9]{6}\\.html");

    /** Matches the {@code <TITLE>...</TITLE>} element of a downloaded page (single line). */
    public static Pattern pTitle = Pattern.compile("<TITLE>.+</TITLE>");

    /** Matches the {@code "code": NNN} status token in the geocoder reply. */
    public static Pattern pExist = Pattern.compile("\"code\": [0-9]{3}");

    /**
     * Matches an unsigned decimal with exactly seven fraction digits.
     * NOTE(review): a leading minus sign is not captured, so negative
     * coordinates would be stored as positive values.
     */
    public static Pattern pl = Pattern.compile("\\d{1,3}\\.\\d{7}");

    /** Matches a {@code <td width="190">...</td>} cell; each match is stored as one street. */
    public static Pattern pTd = Pattern.compile("<td width=\"+190+\">.+</td>");

    /**
     * Length of the opening {@code <td width="190">} tag, i.e. the offset at
     * which the cell text starts inside a {@link #pTd} match.
     */
    private static final int TD_OPEN_TAG_LENGTH = 16;

    /**
     * Program entry point: runs {@link #findDetail()} and prints a message
     * before and after.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("开始向数据库中逐条写入地区和街道,并删除已经使用过的URLString");
        findDetail();
        System.out.println("地区和街道插入数据完毕");
    }

    /**
     * Downloads {@code URL} with an HTTP GET and returns the response body as
     * a string decoded with the platform default charset.
     *
     * <p>A status other than 200 is only reported on {@code System.err}; the
     * body is still read and returned. Any exception raised while fetching
     * terminates the JVM via {@code System.exit(0)}.
     * NOTE(review): the exit status is 0 even though this is a failure path.
     *
     * @param URL address to fetch
     * @return the response body
     */
    public static String getPageStr(String URL) {
        String pageStr = "";
        // Create the HttpClient instance.
        HttpClient httpClient = new HttpClient();
        // Create the GET method instance.
        GetMethod getMethod = new GetMethod(URL);
        // Use the default retry handler provided by the library.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            // Execute the request.
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }
            // Read the whole body into memory.
            byte[] responseBody = getMethod.getResponseBody();
            pageStr = new String(responseBody);
        } catch (SocketException e) {
            // The remote side closed the connection; the operator is told to retry later.
            System.out.println("百度连接关闭!!!等待一会儿再去连接!");
            System.exit(0);
        } catch (HttpException e) {
            // Fatal protocol-level problem: wrong protocol or a malformed response.
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
            System.exit(0);
        } catch (IOException e) {
            // Network-level failure.
            e.printStackTrace();
            System.exit(0);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(0);
        } finally {
            // Always release the connection.
            getMethod.releaseConnection();
        }
        return pageStr;
    }

    /**
     * Processes pending URLs until {@code td.getMinIdURL()} returns
     * {@code null}. For each URL the page is downloaded, every
     * {@link #pTitle} match on it is stored through
     * {@link #storePostcode(String, String)}, and then {@code td.deleteURL()}
     * is called (presumably removing the URL just processed; TableDao is not
     * visible here).
     */
    public static void findDetail() {
        while (true) {
            String URLString = td.getMinIdURL();
            if (URLString == null) {
                break;
            }
            String pageStr = getPageStr(URLString);
            // A page is expected to contain a single <TITLE>...</TITLE>.
            Matcher titleMatcher = pTitle.matcher(pageStr);
            while (titleMatcher.find()) {
                storePostcode(titleMatcher.group(), pageStr);
            }
            td.deleteURL();
        }
    }

    /**
     * Stores one postcode row, with coordinates when the geocoder supplies
     * them, followed by the streets listed on the page.
     *
     * <p>The title is split on single spaces: the first token, after the
     * seven-character {@code <TITLE>} tag, is the postcode and the third
     * token is the area name. A title that does not have this shape raises
     * {@code ArrayIndexOutOfBoundsException},
     * {@code StringIndexOutOfBoundsException} or
     * {@code NumberFormatException}, as before.
     *
     * @param title   the matched {@code <TITLE>...</TITLE>} text
     * @param pageStr full text of the page the title came from
     */
    private static void storePostcode(String title, String pageStr) {
        String[] titleParts = title.split(" ");
        String code = titleParts[0].substring(7);
        Integer codeNumber = Integer.parseInt(code);
        String areaName = titleParts[2];

        // Query with the postcode exactly as printed on the page. Concatenating the
        // parsed Integer would drop leading zeros ("010000" -> "10000").
        String locationURL = "http://maps.google.com/maps/geo?output=json&oe=utf-8&q="
                + code
                + "&key=ABQIAAAACbx_JGnqV-1PDzVrHj3XdhTABvwwz049_81qKrWEm99Y-pOq9hS_bCg81mYizPwn91I_OJ22BR0HcQ&callback=_xdc_._7fu2j59h8";
        String locationPageStr = getPageStr(locationURL);

        Code codeEntity = new Code();
        codeEntity.setCodenumber(codeNumber);
        codeEntity.setAeraName(areaName);
        fillCoordinates(codeEntity, locationPageStr);
        td.InsertCodeTable(codeEntity);

        // Look up the id of the row just inserted, then attach the streets to it.
        Long codeId = td.findCodeId(codeNumber);
        insertStreets(pageStr, codeId);
    }

    /**
     * Copies the bounding box and point from the geocoder reply into
     * {@code codeEntity}. The entity is left untouched unless the reply
     * carries {@code "code": 200} and at least six {@link #pl} matches; those
     * are read in the order north, south, east, west, longitude, latitude.
     *
     * @param codeEntity      entity to populate
     * @param locationPageStr raw geocoder reply
     */
    private static void fillCoordinates(Code codeEntity, String locationPageStr) {
        Matcher statusMatcher = pExist.matcher(locationPageStr);
        // No status token at all is treated the same as a non-200 status.
        if (!statusMatcher.find()
                || !statusMatcher.group(0).equalsIgnoreCase("\"code\": 200")) {
            return;
        }

        Matcher numberMatcher = pl.matcher(locationPageStr);
        double[] values = new double[6];
        int count = 0;
        while (count < values.length && numberMatcher.find()) {
            values[count++] = Double.parseDouble(numberMatcher.group());
        }
        if (count < values.length) {
            // Incomplete reply: store the postcode without coordinates.
            return;
        }

        double north = values[0];
        double south = values[1];
        double east = values[2];
        double west = values[3];
        double longitude = values[4];
        double latitude = values[5];
        codeEntity.setLatitude(latitude);
        codeEntity.setLongitude(longitude);
        codeEntity.setNorth(north);
        codeEntity.setSouth(south);
        codeEntity.setEast(east);
        codeEntity.setWest(west);
    }

    /**
     * Inserts one street row per {@link #pTd} match found in {@code pageStr}.
     * The street name is the trimmed text between the opening tag and the
     * first {@code </td>} of the match.
     *
     * @param pageStr full page text
     * @param codeId  id of the postcode row the streets belong to
     */
    private static void insertStreets(String pageStr, Long codeId) {
        Matcher cellMatcher = pTd.matcher(pageStr);
        while (cellMatcher.find()) {
            String cell = cellMatcher.group();
            int endIndex = cell.indexOf("</td>");
            String street = cell.substring(TD_OPEN_TAG_LENGTH, endIndex).trim();
            td.InsertStreet(street, codeId);
        }
    }
}
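/*
 * Residue from the web code viewer this file was copied from (keyboard shortcut help):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */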