📄 crawlerui.java
字号:
package com.crawler;
import java.awt.BorderLayout;
import java.awt.Color;
import java.awt.FlowLayout;
import java.awt.Font;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Vector;
import java.util.logging.FileHandler;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTabbedPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.text.html.HTMLEditorKit;
import com.manner.MannerGather;
import com.parser.HTTP;
import com.parser.HtmlParser;
import com.parser.Parser;
import com.relative.Relative;
// Main window (user interface) of the topic crawler
public class CrawlerUI extends JFrame {
// Swing widgets; every one of them is created and laid out in initComponent().
private JTabbedPane centerPane;
private JButton startButton;
private JButton stopButton;
private JButton exitButton;
private JTextField depthField;
private JTextField threadField;
private JScrollPane allUrlPane;
private JScrollPane relativeUrlPane;
private JScrollPane sitePane;
private JPanel parameterTab;
private JPanel resultTab;
private JTextArea allUrlArea;
private JTextArea relativeUrlArea;
private JTextArea siteArea;
private JLabel depthLabel;
private JLabel threadLabel;
private JLabel siteLabel;
private JLabel statusLabel;
private JLabel allUrlLabel;
private JLabel relativeUrlLabel;
private JPanel toolBar;
// Counters shown in the status label: links seen, links judged relevant, links that timed out.
private int allUrl;
private int relativeUrl;
private int timeoutUrl;
// URLs waiting to be crawled.
private ArrayList<String> waitingQueue;
// URLs that have already been taken off the waiting queue.
private ArrayList<String> processedQueue;
// Crawl depth recorded for every URL put on the waiting queue (seed URLs are stored with depth 0).
private HashMap<String, Integer> indexQueue;
// Stop flag. It is set by the Stop button on the event thread and polled by the
// crawler threads, so it must be volatile for the workers to be guaranteed to see it.
private volatile boolean stopSearch;
private Logger logger;
// Sequence numbers for the files written to ./log/relativePage/ and ./log/relativeUrl/.
private int index1;
private int index2;
/**
 * Creates the crawler window: builds all components, applies the frame
 * settings (title, 650x600 size, exit-on-close) and makes the frame visible.
 */
public CrawlerUI() {
    this.initComponent();
    this.setTitle("topic crawler");
    this.setSize(650, 600);
    this.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    // Show the frame only after everything else has been configured.
    this.setVisible(true);
}
/**
 * Worker thread that repeatedly takes a URL from the shared waiting queue,
 * downloads and parses it, and enqueues the links that are judged relevant.
 *
 * <p>Several workers run concurrently and share the queues, the depth index,
 * the counters and the file sequence numbers of the enclosing window. Every
 * worker is a separate {@code Crawler} object, so locking on {@code this}
 * would not exclude the other workers; all shared state is therefore guarded
 * by the monitor of the shared waiting queue ({@link #crawlLock}). Swing
 * components are only touched on the event dispatch thread.
 */
class Crawler extends Thread {
    /** Longest time, in milliseconds, an idle worker waits for a URL to arrive. */
    private static final long IDLE_WAIT_MILLIS = 30000L;
    /** How often, in milliseconds, an idle worker re-checks the stop flag. */
    private static final long STOP_POLL_MILLIS = 500L;
    /** Pause, in milliseconds, after each link that was examined. */
    private static final long LINK_DELAY_MILLIS = 1000L;

    /** Monitor shared by all workers of one window: the waiting queue itself. */
    private final Object crawlLock = waitingQueue;
    private Relative r = new Relative();
    private MannerGather mg = new MannerGather();
    private long startTime;
    private long endTime;
    // Depth at which URLs are no longer crawled.
    private int depthLimit;
    private FileHandler logFile;
    String url;

    /**
     * @param name       thread name, also used in the progress messages
     * @param depthLimit URLs whose recorded depth equals this value are skipped
     */
    public Crawler(String name, int depthLimit) {
        super(name);
        this.depthLimit = depthLimit;
    }

    /**
     * Main loop of the worker: take a URL, skip it if it sits at the depth
     * limit, otherwise crawl it; stop when the queue stays empty, when only
     * depth-limit URLs are left, or when a stop was requested. The final
     * statistics are always published, even if the loop fails unexpectedly.
     */
    @Override
    public void run() {
        startTime = System.currentTimeMillis();
        synchronized (crawlLock) {
            // Each worker resets these counters when it starts, as before.
            allUrl = 0;
            relativeUrl = 0;
        }
        updateStatus();
        try {
            while (true) {
                url = deWaitingQueue();
                // An empty string means the queue stayed empty: nothing left to do.
                if (url.equals("")) {
                    break;
                }
                enProcessedQueue(url);
                appendLater(relativeUrlArea, getName() + "运行" + "\n");
                int level = depthOf(url);
                // A URL at the depth limit is not crawled. Keep pulling URLs because,
                // with several workers, shallower URLs may still be queued behind it.
                boolean exhausted = false;
                while (level == depthLimit) {
                    // Check-and-remove in one step: another worker may empty the
                    // queue at any moment, and blocking here is not wanted.
                    String next = pollWaitingQueue();
                    if (next == null) {
                        appendLater(relativeUrlArea, "超过限定深度" + getName() + "停止" + "\n");
                        exhausted = true;
                        break;
                    }
                    url = next;
                    enProcessedQueue(url);
                    level = depthOf(url);
                    if (stopSearch) {
                        break;
                    }
                }
                if (exhausted) {
                    break;
                }
                // Check for a stop request before crawling. The depth test is a
                // safety net so that a depth-limit URL is never crawled.
                if (stopSearch || level == depthLimit) {
                    appendLater(relativeUrlArea, getName() + "停止" + "\n");
                    break;
                }
                crawler(url, level);
                // Check again once the page has been handled.
                if (stopSearch) {
                    appendLater(relativeUrlArea, getName() + "停止" + "\n");
                    break;
                }
            }
        } finally {
            endTime = System.currentTimeMillis();
            lastUpdateStatus();
        }
    }

    /**
     * Downloads and parses one page and examines every link found on it.
     * Links already processed, disallowed by robots rules or reported as
     * timed out are skipped; a link rated 1 by {@code Relative.urlRelative}
     * that is not already waiting is saved to disk and enqueued one level
     * deeper. A failure on one link is logged and does not abort the
     * remaining links of the page.
     *
     * @param url   page to crawl
     * @param level depth recorded for {@code url}
     */
    public void crawler(String url, int level) {
        HTTP http;
        Vector v;
        try {
            HTMLEditorKit.Parser parser = new HtmlParser().getParser();
            Parser p = new Parser(url);
            http = new HTTP();
            parser.parse(new StringReader(http.getBody(url)), p, true);
            v = p.getLinks();
            v = r.delRepeat(v);
        } catch (Exception e) {
            logFailure("Could not fetch or parse " + url, e);
            return;
        }
        for (int i = 0; i < v.size(); i++) {
            String checkUrl = (String) v.get(i);
            try {
                getAllUrl(checkUrl);
                if (urlHasBeenVisited(checkUrl)) {
                    continue;
                }
                if (!mg.isRobotAllowed(new URL(checkUrl))) {
                    continue;
                }
                // -1 is counted as a timeout; 1 marks the link for crawling.
                int returnValue = r.urlRelative(checkUrl);
                if (returnValue == -1) {
                    getTimeoutUrl();
                    continue;
                }
                if (returnValue == 1) {
                    int pageNumber = claimPageNumber(checkUrl, level + 1);
                    if (pageNumber >= 0) {
                        writeText("./log/relativePage/" + pageNumber + ".txt",
                                http.getBody(checkUrl));
                        getRelativeUrl(checkUrl);
                    }
                }
                Thread.sleep(LINK_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and give up on this page.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                logFailure("Could not process link " + checkUrl, e);
            }
            if (stopSearch) {
                return;
            }
        }
    }

    /**
     * Writes {@code text} to the file {@code path}, encoded as GBK. The
     * stream is closed even when encoding or writing fails.
     *
     * @throws Exception if the file cannot be opened or written
     */
    public void writeText(String path, String text) throws Exception {
        FileOutputStream out = new FileOutputStream(path);
        try {
            out.write(text.getBytes("gbk"));
        } finally {
            out.close();
        }
    }

    /** Counts one timed-out link and refreshes the status line. */
    public void getTimeoutUrl() {
        synchronized (crawlLock) {
            timeoutUrl++;
        }
        updateStatus();
    }

    /**
     * Enqueues a relevant URL, shows it in the relative-URL area, counts it
     * and writes it to its own file under {@code ./log/relativeUrl/}.
     *
     * <p>The file handler is detached and closed as soon as the URL has been
     * written; otherwise every handler would stay attached to the logger,
     * keeping its file open and receiving all later URLs as well.
     *
     * @throws IOException if the log file cannot be created
     */
    public void getRelativeUrl(String url) throws IOException {
        enWaitingQueue(url);
        appendLater(relativeUrlArea, url + "\n");
        int logNumber;
        synchronized (crawlLock) {
            relativeUrl++;
            logNumber = index2++;
        }
        updateStatus();
        // Serialise on the shared logger so that workers do not write into each
        // other's files while their handler is attached.
        synchronized (logger) {
            logFile = new FileHandler("./log/relativeUrl/" + logNumber + ".txt");
            try {
                logFile.setFormatter(new SimpleFormatter());
                logger.addHandler(logFile);
                logger.setUseParentHandlers(false);
                logger.info(url);
            } finally {
                logger.removeHandler(logFile);
                logFile.close();
            }
        }
    }

    /**
     * Shows a discovered URL in the all-URL area, counts it and refreshes the
     * status line.
     *
     * @throws IOException never thrown here; kept for existing callers
     */
    public void getAllUrl(String url) throws IOException {
        appendLater(allUrlArea, url + "\n");
        synchronized (crawlLock) {
            allUrl++;
        }
        updateStatus();
    }

    /** Shows the current counters in the status label. */
    public void updateStatus() {
        // Posting while holding the lock keeps the displayed values in order.
        synchronized (crawlLock) {
            setStatusLater("allUrl: " + allUrl + " relativeUrl: "
                    + relativeUrl + " timeoutUrl: " + timeoutUrl);
        }
    }

    /**
     * Shows the final counters together with the elapsed time and the
     * average number of links per second. A run that took less than one
     * millisecond reports a speed of 0 instead of dividing by zero.
     */
    public void lastUpdateStatus() {
        synchronized (crawlLock) {
            long elapsedMillis = endTime - startTime;
            double seconds = ((double) elapsedMillis) / 1000;
            // Floating-point multiplication avoids int overflow of 1000 * allUrl.
            double speed = elapsedMillis > 0 ? (1000.0 * allUrl) / elapsedMillis : 0.0;
            setStatusLater("allUrl: " + allUrl + " relativeUrl: "
                    + relativeUrl + " timeoutUrl: " + timeoutUrl
                    + " time:" + seconds
                    + "s speed: " + speed + "个/s");
        }
    }

    /** Appends a URL to the waiting queue and wakes idle workers. */
    public void enWaitingQueue(String url) {
        synchronized (crawlLock) {
            waitingQueue.add(url);
            crawlLock.notifyAll();
        }
    }

    /**
     * Removes and returns the head of the waiting queue. When the queue is
     * empty the worker waits up to 30 seconds for another worker to add a
     * URL (useful when there are more threads than seed URLs). The lock is
     * released while waiting, and the wait ends early when a URL arrives or
     * a stop is requested.
     *
     * @return the next URL, or an empty string if none became available
     */
    public String deWaitingQueue() {
        synchronized (crawlLock) {
            long deadline = System.currentTimeMillis() + IDLE_WAIT_MILLIS;
            while (waitingQueue.isEmpty() && !stopSearch) {
                long remaining = deadline - System.currentTimeMillis();
                if (remaining <= 0) {
                    break;
                }
                try {
                    // Wait in short slices so a stop request is noticed promptly.
                    crawlLock.wait(Math.min(remaining, STOP_POLL_MILLIS));
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            if (waitingQueue.isEmpty()) {
                return "";
            }
            return waitingQueue.remove(0);
        }
    }

    /** Records a URL as processed. */
    public void enProcessedQueue(String url) {
        synchronized (crawlLock) {
            processedQueue.add(url);
        }
    }

    /** @return {@code true} if the URL has already been recorded as processed */
    public boolean urlHasBeenVisited(String url) {
        synchronized (crawlLock) {
            return processedQueue.contains(url);
        }
    }

    /** Removes the head of the waiting queue without waiting; {@code null} if it is empty. */
    private String pollWaitingQueue() {
        synchronized (crawlLock) {
            return waitingQueue.isEmpty() ? null : waitingQueue.remove(0);
        }
    }

    /**
     * Looks up the recorded depth of a URL. A URL without a recorded depth
     * is reported at the depth limit so that it is skipped rather than
     * crashing the worker.
     */
    private int depthOf(String target) {
        synchronized (crawlLock) {
            Integer depth = indexQueue.get(target);
            return depth == null ? depthLimit : depth.intValue();
        }
    }

    /**
     * Records the depth of a link that is not already waiting and reserves
     * the next page-file number for it.
     *
     * @return the reserved file number, or -1 if the link is already queued
     */
    private int claimPageNumber(String link, int depth) {
        synchronized (crawlLock) {
            if (waitingQueue.contains(link)) {
                return -1;
            }
            indexQueue.put(link, Integer.valueOf(depth));
            return index1++;
        }
    }

    /** Appends text to a text area on the event dispatch thread. */
    private void appendLater(final JTextArea area, final String text) {
        javax.swing.SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                area.append(text);
            }
        });
    }

    /** Sets the status label text on the event dispatch thread. */
    private void setStatusLater(final String text) {
        javax.swing.SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                statusLabel.setText(text);
            }
        });
    }

    /**
     * Reports a crawl failure. A separate logger is used because the "url"
     * logger is reserved for the per-URL log files.
     */
    private void logFailure(String message, Exception cause) {
        Logger.getLogger(CrawlerUI.class.getName()).log(
                java.util.logging.Level.WARNING, message, cause);
    }
}
/**
 * Requests that crawling stop by raising the {@code stopSearch} flag,
 * which the crawler threads check between steps.
 */
public void stopSearch() {
    this.stopSearch = true;
}
/**
 * Initialises the crawl state and builds the window: a tool bar at the top,
 * a tabbed pane with the parameter and result tabs in the centre, and a
 * status label at the bottom.
 */
public void initComponent() {
    initCrawlState();
    setBackground(new Color(153, 153, 255));
    addWindowListener(new WindowAdapter() {
        public void windowClosing(WindowEvent evt) {
            exitForm(evt);
        }
    });
    getContentPane().add(buildToolBar(), BorderLayout.NORTH);

    centerPane = new JTabbedPane();
    centerPane.setBorder(new javax.swing.border.EtchedBorder());
    centerPane.addTab("Search Parameters", buildParameterTab());
    centerPane.addTab("Search result", buildResultTab());
    getContentPane().add(centerPane, BorderLayout.CENTER);

    statusLabel = new JLabel();
    statusLabel.setText("Inactive");
    getContentPane().add(statusLabel, BorderLayout.SOUTH);
}

/** Resets the counters, file sequence numbers, logger, stop flag and queues. */
private void initCrawlState() {
    timeoutUrl = 0;
    index1 = 1;
    index2 = 1;
    logger = Logger.getLogger("url");
    stopSearch = false;
    waitingQueue = new ArrayList<String>();
    processedQueue = new ArrayList<String>();
    indexQueue = new HashMap<String, Integer>();
}

/** Builds the tool bar holding the Start, Stop and Exit buttons. */
private JPanel buildToolBar() {
    toolBar = new JPanel();
    toolBar.setLayout(new FlowLayout(FlowLayout.LEFT));
    toolBar.setBackground(new Color(204, 204, 204));
    // Font objects are immutable, so the three buttons can share one.
    Font buttonFont = new Font("Arial", Font.BOLD, 11);

    startButton = new JButton();
    startButton.setFont(buttonFont);
    startButton.setText("Start");
    startButton.setToolTipText("Start the search");
    startButton.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent evt) {
            startButtonActionPerformed(evt);
        }
    });
    toolBar.add(startButton);

    stopButton = new JButton();
    stopButton.setFont(buttonFont);
    stopButton.setText("Stop");
    stopButton.setToolTipText("Stop the search ");
    stopButton.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent evt) {
            stopButtonActionPerformed(evt);
        }
    });
    toolBar.add(stopButton);

    exitButton = new JButton();
    exitButton.setFont(buttonFont);
    exitButton.setText("Exit");
    exitButton.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent evt) {
            exitButtonActionPerformed(evt);
        }
    });
    toolBar.add(exitButton);
    return toolBar;
}

/** Builds the tab with the thread count, depth limit and seed-URL inputs. */
private JPanel buildParameterTab() {
    parameterTab = new JPanel();
    // Absolute positioning: every child gets explicit bounds.
    parameterTab.setLayout(null);
    parameterTab.setBackground(new Color(204, 204, 204));

    threadLabel = new JLabel();
    threadLabel.setText("threads : ");
    place(parameterTab, threadLabel, 20, 30, 250, 15);
    threadField = new JTextField();
    threadField.setColumns(8);
    threadField.setText("1");
    place(parameterTab, threadField, 260, 30, 70, 21);

    depthLabel = new JLabel();
    depthLabel.setText("depth : ");
    place(parameterTab, depthLabel, 20, 80, 230, 15);
    depthField = new JTextField();
    depthField.setColumns(8);
    depthField.setText("5");
    place(parameterTab, depthField, 260, 80, 70, 21);

    siteLabel = new JLabel();
    siteLabel.setText("starting site: ");
    place(parameterTab, siteLabel, 20, 140, 120, 15);
    siteArea = new JTextArea();
    sitePane = new JScrollPane();
    sitePane.setViewportView(siteArea);
    place(parameterTab, sitePane, 260, 140, 200, 150);
    return parameterTab;
}

/** Builds the tab with the two scrollable URL lists. */
private JPanel buildResultTab() {
    resultTab = new JPanel();
    resultTab.setLayout(null);

    allUrlLabel = new JLabel();
    allUrlLabel.setText("all url: ");
    place(resultTab, allUrlLabel, 10, 20, 120, 15);
    allUrlArea = new JTextArea();
    allUrlPane = new JScrollPane();
    allUrlPane.setBackground(new Color(204, 204, 204));
    allUrlPane.setViewportView(allUrlArea);
    place(resultTab, allUrlPane, 10, 50, 250, 300);

    relativeUrlLabel = new JLabel();
    relativeUrlLabel.setText("relative url: ");
    place(resultTab, relativeUrlLabel, 370, 20, 120, 15);
    relativeUrlArea = new JTextArea();
    relativeUrlPane = new JScrollPane();
    relativeUrlPane.setBackground(new Color(204, 204, 204));
    relativeUrlPane.setViewportView(relativeUrlArea);
    place(resultTab, relativeUrlPane, 370, 50, 250, 300);
    return resultTab;
}

/** Adds a child to a panel that uses absolute positioning and sets its bounds. */
private void place(JPanel panel, java.awt.Component child, int x, int y, int width, int height) {
    panel.add(child);
    child.setBounds(x, y, width, height);
}
/**
 * Handles a click on the Stop button by requesting that the search stop.
 *
 * @param evt the button event (not used)
 */
public void stopButtonActionPerformed(ActionEvent evt) {
    this.stopSearch();
}
/**
 * Handles a click on the Start button: reads the depth limit and thread
 * count, queues every non-blank line of the seed area as a depth-0 URL,
 * switches to the result tab and starts the crawler threads.
 *
 * <p>Invalid numbers are reported in the status label instead of throwing
 * on the event thread. Blank lines are skipped because an empty URL is what
 * a crawler thread interprets as "queue exhausted". The stop flag is
 * cleared so that a search can be started again after Stop was pressed.
 *
 * @param evt the button event (not used)
 */
public void startButtonActionPerformed(ActionEvent evt) {
    int depthLimit;
    int threadLimit;
    try {
        depthLimit = Integer.parseInt(depthField.getText().trim());
        threadLimit = Integer.parseInt(threadField.getText().trim());
    } catch (NumberFormatException e) {
        statusLabel.setText("threads and depth must be whole numbers");
        return;
    }
    if (threadLimit < 1) {
        statusLabel.setText("threads must be at least 1");
        return;
    }
    stopSearch = false;
    // Split on any run of CR/LF characters; trimming removes stray whitespace.
    String[] lines = siteArea.getText().split("[\\r\\n]+");
    // Crawler threads from an earlier start may still be using the queue.
    synchronized (waitingQueue) {
        for (int i = 0; i < lines.length; i++) {
            String seed = lines[i].trim();
            if (seed.length() == 0) {
                continue;
            }
            waitingQueue.add(seed);
            indexQueue.put(seed, Integer.valueOf(0));
        }
    }
    centerPane.setSelectedIndex(1);
    for (int i = 0; i < threadLimit; i++) {
        new Crawler("crawler" + i, depthLimit).start();
    }
}
/**
 * Handles a click on the Exit button by terminating the JVM with status 0.
 *
 * @param evt the button event (not used)
 */
public void exitButtonActionPerformed(ActionEvent evt) {
    final int normalExit = 0;
    System.exit(normalExit);
}
/**
 * Terminates the JVM with status 0; invoked when the window is being closed.
 *
 * @param evt the window event (not used)
 */
public void exitForm(WindowEvent evt) {
    final int normalExit = 0;
    System.exit(normalExit);
}
/**
 * Program entry point. The window is created on the event dispatch thread,
 * as Swing requires for code that constructs or shows components.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    javax.swing.SwingUtilities.invokeLater(new Runnable() {
        public void run() {
            new CrawlerUI();
        }
    });
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -