// File: mycrawlerframe.java
// (Residue from the web code viewer this file was copied from: "font size" control.)
package src;
import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyAdapter;
import java.awt.event.KeyEvent;
import java.awt.event.MouseAdapter;
import java.awt.event.MouseEvent;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JPanel;
import javax.swing.JOptionPane;
import java.util.*;
import javax.swing.*;
import java.io.*;
import java.awt.*;
import javax.swing.table.*;
import java.net.*;
import java.util.regex.*;
import java.sql.*;
import java.util.Date;
public class MyCrawlerFrame extends JFrame {
// Preset choices offered in the "Max URLS to Crawl" combo box.
// (The original note called this a crawl "depth", where 1 means only the links
// of the current page, and so on; the visible code uses the value as a maximum URL count.)
private static final String[] MAXURLS =
{"50","100","200","500","1000"};
// The chosen number must not be too large, so that memory does not run out
// while storing all the collected information.
// To follow the robots protocol, keep a map of disallowed entries.
// NOTE(review): how this cache is filled and read is not visible in this part of the file.
private HashMap disallowListCache = new HashMap();
// A simple user interface
private JTextField startField;// URL at which the crawl starts
private JComboBox maxUrlsComboBox;// selector for the crawl limit (original note: "depth" selector)
//private JComboBox fileTypeComboBox;// file type to look for; planned originally, dropped for lack of time
private JTextField logFileField;// location of the log file
private JButton crawlButton;// Crawl / Stop button
// Crawl status information (value labels shown next to their captions)
private JLabel crawlingLabel2;
private JLabel crawledLabel2;
private JLabel toCrawlLabel2;
private JLabel resultUrlLabel2;
private JLabel notHostLinkLabel2;
private JLabel gt30kbLinkLabel2;
// Table listing the pages found so far
private JTable resultTable;
// Whether a crawl is currently running
private boolean crawling;
// Record of links that are not on the start host (disabled)
// private HashSet notHostLink = new HashSet();
// Record of pages larger than 30 KB (disabled)
//private HashSet gt30kbList = new HashSet();
// Writer for the log file
private PrintWriter logFileWriter;
/**
 * Launches the application.
 * <p>
 * The frame is created and shown on the event dispatch thread, because Swing
 * components must be constructed and accessed from that thread only.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String args[]) {
    SwingUtilities.invokeLater(new Runnable() {
        public void run() {
            try {
                MyCrawlerFrame frame = new MyCrawlerFrame();
                frame.setVisible(true);
            } catch (Exception e) {
                // Same best-effort reporting as before: print and let the JVM go on.
                e.printStackTrace();
            }
        }
    });
}
/**
 * Creates the crawler window.
 * <p>
 * The frame consists of a "File" menu (About / Exit), a control panel at the
 * top holding the crawl settings and the live status labels, and a read-only
 * result table that fills the remaining space.
 */
public MyCrawlerFrame() {
    super();
    setSize(new Dimension(600, 600));
    setTitle("Christ's Web Crawler");
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    // Centre the window on the screen. (The previous code passed a menu that was
    // not yet attached to any window, which has the same effect as null.)
    setLocationRelativeTo(null);

    // ----- Menu bar -----
    JMenuBar menuBar = new JMenuBar();
    setJMenuBar(menuBar);
    JMenu fileMenu = new JMenu();
    fileMenu.setText("File");
    menuBar.add(fileMenu);

    JMenuItem aboutItem = new JMenuItem();
    aboutItem.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent event) {
            JOptionPane.showMessageDialog(null,"Christ's web Crawler v 1.0");
        }
    });
    aboutItem.setText("About");
    fileMenu.add(aboutItem);

    JMenuItem exitItem = new JMenuItem();
    exitItem.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent event) {
            System.exit(0);
        }
    });
    exitItem.setText("Exit");
    fileMenu.add(exitItem);

    // ----- Control panel: settings -----
    // Components are added in row order; every value component ends its row
    // (gridwidth = REMAINDER), so the add order defines the layout.
    JPanel controlPanel = new JPanel(new GridBagLayout());
    GridBagConstraints constraints;

    controlPanel.add(new JLabel("Start URL:"), newCaptionConstraints());
    startField = new JTextField();
    controlPanel.add(startField, newValueConstraints());

    controlPanel.add(new JLabel("Max URLS to Crawl:"), newCaptionConstraints());
    maxUrlsComboBox = new JComboBox(MAXURLS);
    maxUrlsComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 0);
    controlPanel.add(maxUrlsComboBox, constraints);

    // (A "File Type to Crawl" selector was planned here but never implemented.)

    controlPanel.add(new JLabel("Log File:"), newCaptionConstraints());
    // Default log file location: MyCrawler.log in the current working directory.
    String file = System.getProperty("user.dir") +
            System.getProperty("file.separator") +
            "MyCrawler.log";
    logFileField = new JTextField(file);
    controlPanel.add(logFileField, newValueConstraints());

    crawlButton = new JButton("Crawl");
    // One ActionListener handles both mouse clicks and keyboard activation.
    // The earlier KeyListener reacted to ANY key press while the button had
    // focus (Tab, Shift, auto-repeat, ...), toggling the crawl unintentionally.
    crawlButton.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent event) {
            actionCrawl();
        }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    controlPanel.add(crawlButton, constraints);

    // Separator between the controls and the status section
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    controlPanel.add(new JSeparator(), constraints);

    // ----- Control panel: status rows -----
    crawlingLabel2 = addStatusRow(controlPanel, "Crawling:");
    crawledLabel2 = addStatusRow(controlPanel, "Crawled URLS:");
    toCrawlLabel2 = addStatusRow(controlPanel, "URL to Crawl:");
    // Number of links found that point outside the start host
    notHostLinkLabel2 = addStatusRow(controlPanel, "URLs not this host:");
    // Number of pages larger than 30 KB
    gt30kbLinkLabel2 = addStatusRow(controlPanel, "Pages size>30 KB:");

    // ----- Result table (read-only) -----
    resultTable = new JTable(new DefaultTableModel(new Object[][]{},
            new String[]{"TimeStamp","Action","URL"}) {
        public boolean isCellEditable(int row, int column) {
            return false;
        }
    });
    JPanel resultPanel = new JPanel();
    resultPanel.setBorder(BorderFactory.createTitledBorder("Result"));
    resultPanel.setLayout(new BorderLayout());
    resultPanel.add(new JScrollPane(resultTable), BorderLayout.CENTER);

    // ----- Assemble the frame -----
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(controlPanel, BorderLayout.NORTH);
    getContentPane().add(resultPanel, BorderLayout.CENTER);
}

/** Builds the constraints for a caption label: right-aligned, starts a row. */
private static GridBagConstraints newCaptionConstraints() {
    GridBagConstraints c = new GridBagConstraints();
    c.anchor = GridBagConstraints.EAST;
    c.insets = new Insets(5, 5, 0, 0);
    return c;
}

/** Builds the constraints for a value component that stretches and ends its row. */
private static GridBagConstraints newValueConstraints() {
    GridBagConstraints c = new GridBagConstraints();
    c.fill = GridBagConstraints.HORIZONTAL;
    c.gridwidth = GridBagConstraints.REMAINDER;
    c.insets = new Insets(5, 5, 0, 5);
    return c;
}

/**
 * Adds one status row (caption plus an initially empty, plain-font value label)
 * to the given panel.
 *
 * @param panel   panel using a GridBagLayout that receives the two labels
 * @param caption text of the caption label
 * @return the value label, so the caller can update its text later
 */
private static JLabel addStatusRow(JPanel panel, String caption) {
    panel.add(new JLabel(caption), newCaptionConstraints());
    JLabel valueLabel = new JLabel();
    valueLabel.setFont(valueLabel.getFont().deriveFont(Font.PLAIN));
    panel.add(valueLabel, newValueConstraints());
    return valueLabel;
}
/**
 * Handles activation of the Crawl/Stop button.
 * <p>
 * If a crawl is running, this only switches the {@code crawling} flag off and
 * returns. Otherwise it validates the start URL, the maximum number of URLs
 * and the log file name. All validation problems are collected and shown in a
 * single error dialog; when there are none, the crawl is started.
 */
private void actionCrawl()
{
    // The button acts as "Stop" while crawling: just turn the flag off.
    if (crawling) {
        crawling = false;
        return;
    }

    ArrayList errorList = new ArrayList();

    // The start URL must be present and accepted by verifyUrl (null = rejected).
    String startUrl = startField.getText().trim();
    if (startUrl.length() < 1) {
        errorList.add("Missing Start URL,Please Enter it");
    } else if (verifyUrl(startUrl) == null) {
        errorList.add("Invalid Start URL,Please check it again");
    }

    // Maximum number of URLs to crawl. The combo box is editable, so the
    // value may be arbitrary text (or, defensively, null).
    int maxUrls = 50;
    Object selected = maxUrlsComboBox.getSelectedItem();
    String max = (selected == null) ? "" : selected.toString().trim();
    if (max.length() < 1) {
        errorList.add("Invalid number");
    } else {
        try {
            maxUrls = Integer.parseInt(max);
            if (maxUrls < 1) {
                errorList.add("Invalid max urls");
            }
        } catch (NumberFormatException e) {
            // Previously ignored, which silently crawled with the default of 50
            // for input such as "abc"; report it to the user instead.
            errorList.add("Invalid max urls");
        }
    }

    // A log file name must have been entered.
    String logFile = logFileField.getText().trim();
    if (logFile.length() < 1) {
        errorList.add("Missing Log file");
    }

    // Show all collected errors at once, one per line, and abort.
    if (errorList.size() > 0) {
        StringBuffer message = new StringBuffer();
        for (int i = 0; i < errorList.size(); i++) {
            if (i > 0) {
                message.append("\n");
            }
            message.append(errorList.get(i));
        }
        showError(message.toString());
        return;
    }

    // Everything is valid: start crawling.
    crawl(logFile, startUrl, maxUrls);
}
// crawl: the core of the whole program
private void crawl(final String logFile,final String startUrl,
final int maxUrls)
{
//用新的线程开始crawl
Thread thread = new Thread(new Runnable()
{
public void run()
{
//设置鼠标形状为等待模式;
setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
//Disable control
startField.setEnabled(false);
maxUrlsComboBox.setEnabled(false);
logFileField.setEnabled(false);
//Switch crawl button to "stop"
crawlButton.setText("Stop");
//Reset stats
resultTable.setModel(new DefaultTableModel(new Object[][]{},
new String[]{"TimeStamp","Action","URL"})
{
public boolean isCellEditable(int row,int column)
{
return false;
}
}
);
updateStats(startUrl,0,0,maxUrls,0,0);
//打开结果日志文件
try
{
logFileWriter = new PrintWriter(new FileWriter(logFile));
}
catch(Exception e)
{
showError("Unable to open result log file");
}
//Turn crawling flag on
crawling = true;
//正式进行crawl
crawlAction(startUrl,maxUrls);
//Turn crawling flag off
crawling = false;
//Close result log file
try
{
logFileWriter.close();
}
catch(Exception e)
{
showError("Unable to close result log file");
}
//Mark crawl as done
crawlingLabel2.setText("Done");
//Enable controls
startField.setEnabled(true);
maxUrlsComboBox.setEnabled(true);
logFileField.setEnabled(true);
//Switch crawl button back to crawl
crawlButton.setText("Crawl");
//Return to default cursor
setCursor(Cursor.getDefaultCursor());
//show message if there is no any links
if(resultTable.getRowCount()==0)
{
JOptionPane.showMessageDialog(MyCrawlerFrame.this, "There is no links in your start url." +
"please try another","No any links",JOptionPane.WARNING_MESSAGE);
}
}
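// ---- Residue from the web code viewer this file was copied from (not part of the program) ----
// Keyboard shortcuts:
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -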