// pagedownload.java
// Origin: "Multi-threaded crawler in Java with configurable thread count" (shared snippet, 248 lines)
package crawler;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import crawler.UrlDatabase;
/**
 * One crawler worker: repeatedly takes a URL from the shared {@link UrlDatabase},
 * downloads the page, caches it to disk under {@code data\}, extracts its links
 * and feeds them back to the database. Intended to be run on its own thread
 * ({@code new Thread(new PageDownload(db)).start()}).
 */
public class PageDownload implements Runnable {

    /**
     * Matches {@code <a href="...">} links, terminating the capture at the closing
     * quote or {@code >}. Compiled once — the original recompiled it per page and
     * its character class {@code [\"|>]} contained a literal '|' by mistake.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+href\\s*=\\s*\"?(.*?)[\">]", Pattern.CASE_INSENSITIVE);

    String startUrl;                 // URL currently being processed (www-stripped)
    String searchString;             // search term (unused here; kept for callers)
    boolean caseSensitive = false;   // whether the search is case sensitive
    boolean limitHost = true;        // restrict crawling to the limited host(s)
    UrlDatabase database;            // shared URL frontier / de-duplication store
    boolean limitField = true;       // apply the limitFields substring filter
    int interval = 2000;             // politeness delay between page fetches (ms)
    ArrayList<String> limitFields;   // substrings a link must contain to be followed

    /**
     * Creates a worker bound to a shared URL database.
     *
     * @param database the frontier this worker pulls URLs from and pushes links to
     */
    public PageDownload(UrlDatabase database) {
        this.database = database;
        limitFields = new ArrayList<String>();
    }

    /** Creates an unbound worker; call {@link #setDatabase} before starting it. */
    public PageDownload() {
        limitFields = new ArrayList<String>();
    }

    /** Binds this worker to the shared URL database. */
    public void setDatabase(UrlDatabase database) {
        this.database = database;
    }

    /** Sets the politeness delay in milliseconds between successive fetches. */
    public void setInterval(int interval) {
        this.interval = interval;
    }

    /** Adds substrings a link must contain to be crawled (when limitField is on). */
    public void setFields(ArrayList<String> fields) {
        limitFields.addAll(fields);
    }

    /** Thread entry point: drains the URL database until it is empty. */
    public void run() {
        download(limitHost, caseSensitive);
    }

    /**
     * Validates a URL string; only {@code http://} URLs are accepted.
     * <p>
     * FIX: the original condition also tested {@code startUrl.equals("")}, which
     * could throw NPE before the first download and accidentally let non-HTTP
     * schemes through when {@code startUrl} was empty.
     *
     * @param url candidate URL string (may be null)
     * @return the parsed URL, or {@code null} if it is not a usable HTTP URL
     */
    public URL verifyUrl(String url) {
        if (url == null || !url.toLowerCase().startsWith("http://")) {
            return null;
        }
        try {
            return new URL(url);
        } catch (Exception e) {
            return null;  // malformed URL — treat as unusable
        }
    }

    /**
     * Downloads a page and returns its contents, or {@code null} on failure.
     * <p>
     * FIX: try-with-resources closes the stream (the original leaked the reader),
     * UTF-8 is decoded explicitly instead of the platform default, lines are
     * joined with '\n' (the original fused adjacent lines into one token), and
     * failures are logged instead of silently swallowed by an empty catch.
     */
    private String downloadPage(URL pageUrl) {
        if (pageUrl == null) {
            return null;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(pageUrl.openStream(), StandardCharsets.UTF_8))) {
            StringBuilder pageBuffer = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line).append('\n');
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            // Dead links and unreachable hosts are expected while crawling.
            System.err.println("download failed: " + pageUrl + " (" + e + ")");
            return null;
        }
    }

    /**
     * Strips a leading "www." from the host part,
     * e.g. {@code http://www.x.com/a -> http://x.com/a}.
     */
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            // keep "://" (index + 3), skip the 4 chars of "www."
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }

    /**
     * Parses a page and extracts crawlable absolute links.
     * Same-page anchors, {@code mailto:} and {@code javascript:} links are
     * skipped; relative links are resolved against {@code pageUrl}; fragments
     * are stripped; the limitFields substring filter is applied when enabled.
     * (The original's {@code #fr=qrl} check was dead code — it ran after every
     * fragment had already been stripped — and has been removed.)
     *
     * @param pageUrl      URL the page was fetched from (base for relative links)
     * @param pageContents raw HTML of the page
     * @param limitHost    unused here; kept for interface compatibility
     * @return absolute, verified links found on the page
     */
    public ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            boolean limitHost) {
        Matcher m = LINK_PATTERN.matcher(pageContents);
        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.length() < 1) {
                continue;
            }
            if (link.charAt(0) == '#') {
                continue;  // same-page anchor
            }
            if (link.indexOf("mailto:") != -1) {
                continue;  // mail link
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;  // script pseudo-link
            }
            if (link.indexOf("://") == -1) {
                link = resolveRelativeLink(pageUrl, link);
            }
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);  // drop the fragment
            }
            link = removeWwwFromUrl(link);
            if (verifyUrl(link) == null) {
                continue;  // not a valid HTTP URL after normalization
            }
            if (limitField && !limitFields.isEmpty() && !matchesAnyField(link)) {
                continue;  // outside the allowed URL substrings
            }
            linkList.add(link);
        }
        return linkList;
    }

    /** True when the link contains at least one configured field substring. */
    private boolean matchesAnyField(String link) {
        for (String field : limitFields) {
            if (link.indexOf(field) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Resolves a host-absolute ({@code /a/b}) or document-relative ({@code a/b})
     * link against the page it appeared on, preserving an explicit port.
     */
    private String resolveRelativeLink(URL pageUrl, String link) {
        int port = pageUrl.getPort();
        String hostPart = "http://" + pageUrl.getHost()
                + (port > 0 ? ":" + port : "");
        if (link.charAt(0) == '/') {
            return hostPart + link;  // host-absolute path
        }
        String file = pageUrl.getFile();
        int slash = file.lastIndexOf('/');
        // No directory component in the page URL -> resolve against the root.
        String path = (slash == -1) ? "/" : file.substring(0, slash + 1);
        return hostPart + path + link;
    }

    /**
     * Maps a URL to a local cache file name under {@code data\}. Characters
     * illegal in Windows file names are substituted ('?'→'#', '/'→'%', ':'→'$').
     * NOTE(review): the hard-coded backslash makes this Windows-specific —
     * kept byte-identical so existing caches remain addressable.
     */
    public String urlToFileName(String url) {
        String fileName = "data\\" + url + ".html";
        fileName = fileName.replace('?', '#');
        fileName = fileName.replace('/', '%');
        fileName = fileName.replace(':', '$');
        return fileName;
    }

    /**
     * Main crawl loop. Terminates when the database is drained; backs off and
     * retries when the database is momentarily empty but other workers are
     * still active (per {@code database.getNum()}).
     *
     * @param limithost     unused here; kept for interface compatibility
     * @param caseSensitive unused here; kept for interface compatibility
     * @return the links extracted from the last successfully processed page
     */
    public ArrayList<String> download(boolean limithost, boolean caseSensitive) {
        ArrayList<String> links = new ArrayList<String>();
        while (true) {
            String url = database.getUrl();
            if (url == null) {
                if (database.getNum() == 0) {
                    break;  // frontier drained — we are done
                }
                // Another worker may still add URLs; back off briefly and retry.
                try {
                    Thread.sleep((int) (Math.random() * 1000));
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();  // FIX: preserve status
                    return links;
                }
                continue;
            }
            startUrl = removeWwwFromUrl(url);
            URL verifiedUrl = verifyUrl(startUrl);
            if (verifiedUrl == null) {
                continue;  // FIX: original passed a possibly-null URL downstream
            }
            String pageContents = downloadPage(verifiedUrl);
            System.out.println(url);
            if (pageContents != null && pageContents.length() > 0) {
                savePage(url, pageContents);
                // FIX: original kept `links` across iterations, so an empty
                // download re-added the PREVIOUS page's links to the database,
                // and it wrote a null page to disk (NPE killed the thread).
                links = retrieveLinks(verifiedUrl, pageContents, limitHost);
                database.addUrls(links);
            }
            try {
                Thread.sleep(interval);  // politeness delay between fetches
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return links;
            }
        }
        return links;
    }

    /**
     * Writes the page contents to its cache file as UTF-8.
     * FIX: try-with-resources closes the stream on all paths (the original
     * leaked it on write failure) and the error is logged, not swallowed.
     */
    private void savePage(String url, String pageContents) {
        File myfile = new File(urlToFileName(url));
        try (FileOutputStream out = new FileOutputStream(myfile)) {
            out.write(pageContents.getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            // Failing to cache one page should not abort the crawl.
            System.err.println("save failed: " + myfile + " (" + e + ")");
        }
    }
}