📄 spider.java
字号:
package test;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.*;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import domain.Code;
/**
 * Reads a list of province/city names from a text file, searches each name on
 * youbian.baidu.com, collects the URLs of all postal-code pages found in the
 * (paginated) search results, and stores those URLs in the database.
 *
 * <p>All state is held in static fields; the class is not designed for
 * concurrent use.
 *
 * @author eatsun
 */
public class Spider {
    /** Holds every postal-code page URL collected so far (elements are Strings). */
    public static Set set = new HashSet();

    /** Matches relative links to postal-code pages, e.g. {@code /100076.html}. */
    public static Pattern pHtml = Pattern.compile("/{1}[0-9]{6}\\.html");

    // The following patterns are not used inside this class; they are kept
    // unchanged because they are public and may be referenced elsewhere.
    // The trailing numbers are the original author's markers (their meaning
    // is not evident from this file).
    public static Pattern pTitle = Pattern.compile("<TITLE>.+</TITLE>");//175
    public static Pattern pExist = Pattern.compile("\"code\": [0-9]{3}");//191
    public static Pattern pl = Pattern.compile("\\d{1,3}\\.\\d{7}");//197
    // NOTE(review): this looks like a mangled string concatenation; as written
    // "+ and 0+ act as regex quantifiers. It still matches <td width="190">,
    // so it is left byte-for-byte identical.
    public static Pattern pTd = Pattern.compile("<td width=\"+190+\">.+</td>");

    /** Scheme and host that every collected link is resolved against. */
    private static final String BASE_URL = "http://youbian.baidu.com";

    /** Markup fragment that immediately follows the "next page" href. */
    private static final String NEXT_PAGE_MARKER = "><font size=3>下一页<";

    /** Start of the relative URL used by the "next page" link. */
    private static final String NEXT_PAGE_HREF_PREFIX = "s?lm=0&si=youbian.baidu.com";

    /**
     * Entry point: crawls every area listed in the input file, then writes the
     * collected URLs to the database.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("入库开始......");
        String[] strArray = getAeraArray();
        if (strArray == null) {
            // getAeraArray() already printed the cause; without this guard the
            // loop below would fail with a NullPointerException.
            System.err.println("Could not read the area list; nothing to crawl.");
            System.exit(1);
            return;
        }
        // Query each area name as if it had been typed into the search box.
        for (int i = 0; i < strArray.length; i++) {
            String area = strArray[i].trim();
            if (area.length() == 0) {
                // An empty file or a trailing/duplicated ';' yields blank
                // entries; searching for an empty keyword is pointless.
                continue;
            }
            // NOTE(review): the one-argument encode() is deprecated and uses the
            // platform default charset. It is kept because the charset expected
            // by the site cannot be determined from this file - confirm before
            // switching to encode(String, String).
            String u = URLEncoder.encode(area);
            u = BASE_URL + "/s?word=" + u
                    + "&tn=baiduyb&ct=2097152&cl=0&si=youbian.baidu.com";
            getAllLinks(u);
        }
        // Everything has been gathered into the set; persist it now.
        dealSet();
        System.out.println("入库结束");
    }

    /**
     * Reads the area names from {@code d://aera.txt}. All lines of the file are
     * concatenated (without separators) and the result is split on {@code ';'}.
     *
     * <p>The file is decoded with the platform default charset.
     *
     * @return the area names, or {@code null} if the file is missing or cannot
     *         be read (the stack trace is printed in that case)
     */
    public static String[] getAeraArray() {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader("d://aera.txt"));
            StringBuffer sb = new StringBuffer();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString().split(";");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failed read does not leak the handle.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
        return null;
    }

    /**
     * Fetches the search-result page at the given URL, follows its "next page"
     * links until there are none left, and adds every postal-code link found on
     * those pages to {@link #set}. Prints the elapsed time when done.
     *
     * @param HTTPURL absolute URL of the first search-result page
     */
    public static void getAllLinks(String HTTPURL) {
        long timer = System.currentTimeMillis();

        String pageStr = getPageStr(HTTPURL);
        collectLinks(pageStr);

        // Remember which result pages were already requested so that a pager
        // pointing back at an earlier page cannot cause an endless loop.
        Set visitedPages = new HashSet();
        String nextPageUrl = findNextPageUrl(pageStr);
        while (nextPageUrl != null) {
            if (!visitedPages.add(nextPageUrl)) {
                System.err.println("Pagination loop detected, stopping at: "
                        + nextPageUrl);
                break;
            }
            pageStr = getPageStr(nextPageUrl);
            collectLinks(pageStr);
            nextPageUrl = findNextPageUrl(pageStr);
        }

        long outTime = System.currentTimeMillis();
        System.out.println("得到所有链接时间:" + (outTime - timer)/1000.0 + "s");
    }

    /** Adds the absolute URL of every postal-code link in {@code page} to {@link #set}. */
    private static void collectLinks(String page) {
        Matcher m = pHtml.matcher(page);
        while (m.find()) {
            // The match starts with '/', so plain concatenation is enough.
            set.add(BASE_URL + m.group());
        }
    }

    /**
     * Extracts the absolute URL of the "next page" link from {@code page}.
     *
     * @return the URL, or {@code null} if the page has no "next page" marker or
     *         the expected href prefix does not occur before the marker
     */
    private static String findNextPageUrl(String page) {
        int markerIndex = page.indexOf(NEXT_PAGE_MARKER);
        if (markerIndex == -1) {
            return null;
        }
        // The href is the last occurrence of the prefix before the marker and
        // runs right up to the marker itself.
        String beforeMarker = page.substring(0, markerIndex);
        int hrefIndex = beforeMarker.lastIndexOf(NEXT_PAGE_HREF_PREFIX);
        if (hrefIndex == -1) {
            // Unexpected markup; stop paging rather than fail in substring(-1).
            return null;
        }
        return BASE_URL + "/" + beforeMarker.substring(hrefIndex);
    }

    /**
     * Downloads the page at the given URL with an HTTP GET and returns its body
     * as a string, decoded with the platform default charset.
     *
     * <p>A non-200 status is reported on stderr but the body is still returned.
     * Any exception while executing the request terminates the JVM with exit
     * status 1.
     *
     * @param URL absolute URL to fetch
     * @return the response body, or an empty string if the response has no body
     */
    public static String getPageStr(String URL) {
        String pageStr = "";
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(URL);
        // Use the library's default retry policy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }
            byte[] responseBody = getMethod.getResponseBody();
            // getResponseBody() may return null when there is no body; treat
            // that as an empty page instead of failing in the String constructor.
            if (responseBody != null) {
                // NOTE(review): decodes with the platform default charset, as
                // the original did; confirm it matches the site's encoding.
                pageStr = new String(responseBody);
            }
        } catch (SocketException e) {
            System.out.println("百度连接关闭!!!set集合将被销毁,重试本省!");
            // Non-zero status: the run did not complete successfully.
            System.exit(1);
        } catch (HttpException e) {
            // Fatal protocol problem: wrong protocol or malformed response.
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
            System.exit(1);
        } catch (Exception e) {
            // Network errors (IOException) and anything unexpected.
            e.printStackTrace();
            System.exit(1);
        } finally {
            // Always hand the connection back.
            getMethod.releaseConnection();
        }
        return pageStr;
    }

    /**
     * Writes every URL currently held in {@link #set} to the database through
     * {@code TableDao.insertURL}.
     */
    public static void dealSet() {
        TableDao td = new TableDao();
        for (Iterator it = set.iterator(); it.hasNext();) {
            td.insertURL((String) it.next());
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -