⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mycrawlerframe.java

📁 java 开发的网页爬虫
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package src;

import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyAdapter;
import java.awt.event.KeyEvent;
import java.awt.event.MouseAdapter;
import java.awt.event.MouseEvent;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JPanel;
import javax.swing.JOptionPane;
import java.util.*;
import javax.swing.*;
import java.io.*;
import java.awt.*;
import javax.swing.table.*;
import java.net.*;
import java.util.regex.*;
import java.sql.*;
import java.util.Date;

public class MyCrawlerFrame extends JFrame {

	
	
// Preset choices offered in the "Max URLS to Crawl" combo box.
	private static final String[] MAXURLS =
	{"50","100","200","500","1000"};
	// The choices are kept small so that memory does not run out while
	// all crawl information is being stored.

	// Map of disallowed entries, kept so the crawler can follow the robot protocol.
	private HashMap disallowListCache = new HashMap();

	// A simple user interface
	private JTextField startField;      // URL at which the crawl starts
	private JComboBox maxUrlsComboBox;  // maximum number of URLs to crawl
	// File-type filter: originally planned, dropped for lack of time.
	//private JComboBox fileTypeComboBox;
	private JTextField logFileField;    // location of the log file
	private JButton crawlButton;        // starts a crawl, or stops the running one

	// Labels showing the crawl status
	private JLabel crawlingLabel2;
	private JLabel crawledLabel2;
	private JLabel toCrawlLabel2;
	private JLabel resultUrlLabel2;
	private JLabel notHostLinkLabel2;
	private JLabel gt30kbLinkLabel2;

	// Table listing the pages found so far
	private JTable resultTable;

	// Whether a crawl is in progress. Declared volatile because the flag is
	// cleared from the UI handler (actionCrawl) while the crawl itself runs on
	// a separate thread started by crawl(); without volatile the other thread
	// is not guaranteed to see the update.
	// NOTE(review): the code that polls this flag is outside this view - confirm.
	private volatile boolean crawling;

	// Records links that point outside the start host (disabled)
	// private HashSet notHostLink = new HashSet();

	// Records pages larger than 30 KB (disabled)
	//private HashSet gt30kbList = new HashSet();

	// Writer for the log file
	private PrintWriter logFileWriter;
	/**
	 * Launches the application.
	 *
	 * The frame is created and shown on the event dispatch thread, because
	 * Swing components must be constructed and made visible there rather than
	 * on the main thread.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String args[]) {
		SwingUtilities.invokeLater(new Runnable() {
			public void run() {
				try {
					MyCrawlerFrame frame = new MyCrawlerFrame();
					frame.setVisible(true);
				} catch (Exception e) {
					// Startup failure: print the trace, as there is no UI yet to report it in.
					e.printStackTrace();
				}
			}
		});
	}

	/**
	 * Creates the frame.
	 *
	 * The window gets a "File" menu (About, Exit), a control panel at the top
	 * that holds the crawl settings and the status labels, and a result table
	 * that fills the remaining space.
	 */
	public MyCrawlerFrame() {
		super();
		setSize(new Dimension(600, 600));
		setTitle("Christ's Web Crawler");
		setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		// A null reference component centers the window on the screen; the
		// size is already set above, so the centering uses the final size.
		setLocationRelativeTo(null);

		setJMenuBar(createMenuBar());

		getContentPane().setLayout(new BorderLayout());
		getContentPane().add(createControlPanel(), BorderLayout.NORTH);
		getContentPane().add(createResultPanel(), BorderLayout.CENTER);
	}

	/** Builds the menu bar with its single "File" menu (About and Exit items). */
	private JMenuBar createMenuBar() {
		JMenuItem aboutItem = new JMenuItem("About");
		aboutItem.addActionListener(new ActionListener() {
			public void actionPerformed(ActionEvent event) {
				// Parent the dialog to this frame so it opens over the window.
				JOptionPane.showMessageDialog(MyCrawlerFrame.this,
						"Christ's web Crawler v 1.0");
			}
		});

		JMenuItem exitItem = new JMenuItem("Exit");
		exitItem.addActionListener(new ActionListener() {
			public void actionPerformed(ActionEvent event) {
				System.exit(0);
			}
		});

		JMenu fileMenu = new JMenu("File");
		fileMenu.add(aboutItem);
		fileMenu.add(exitItem);

		JMenuBar menuBar = new JMenuBar();
		menuBar.add(fileMenu);
		return menuBar;
	}

	/**
	 * Builds the control panel: start URL, max URL count, log file, the
	 * Crawl button, a separator, and the status rows. Also assigns the
	 * input and status fields of this frame.
	 *
	 * The rows rely on GridBagLayout's default relative placement, so the
	 * order in which components are added below defines the layout.
	 */
	private JPanel createControlPanel() {
		JPanel controlPanel = new JPanel(new GridBagLayout());

		addCaption(controlPanel, "Start URL:");
		startField = new JTextField();
		addRowEnd(controlPanel, startField,
				GridBagConstraints.HORIZONTAL, new Insets(5, 5, 0, 5));

		addCaption(controlPanel, "Max URLS to Crawl:");
		maxUrlsComboBox = new JComboBox(MAXURLS);
		maxUrlsComboBox.setEditable(true);
		addRowEnd(controlPanel, maxUrlsComboBox,
				GridBagConstraints.NONE, new Insets(5, 5, 0, 0));

		// A "File Type to Crawl" row was planned here but never implemented.

		addCaption(controlPanel, "Log File:");
		// Default log location: MyCrawler.log in the current working directory.
		String defaultLogFile = System.getProperty("user.dir") +
					System.getProperty("file.separator") +
					"MyCrawler.log";
		logFileField = new JTextField(defaultLogFile);
		addRowEnd(controlPanel, logFileField,
				GridBagConstraints.HORIZONTAL, new Insets(5, 5, 0, 5));

		crawlButton = new JButton("Crawl");
		// Use an ActionListener rather than raw key/mouse listeners: a key
		// listener reacts to every key press (including modifier keys) and a
		// mouse listener ignores the button's enabled state, whereas the
		// action event fires exactly once per real activation of the button.
		crawlButton.addActionListener(new ActionListener() {
			public void actionPerformed(ActionEvent event) {
				actionCrawl();
			}
		});
		addRowEnd(controlPanel, crawlButton,
				GridBagConstraints.NONE, new Insets(5, 5, 5, 5));

		// Dividing line between the controls and the status rows
		addRowEnd(controlPanel, new JSeparator(),
				GridBagConstraints.HORIZONTAL, new Insets(5, 5, 5, 5));

		crawlingLabel2 = addStatRow(controlPanel, "Crawling:");
		crawledLabel2 = addStatRow(controlPanel, "Crawled URLS:");
		toCrawlLabel2 = addStatRow(controlPanel, "URL to Crawl:");
		// Number of links found that point outside the start host
		notHostLinkLabel2 = addStatRow(controlPanel, "URLs not this host:");
		// Number of pages larger than 30 KB
		gt30kbLinkLabel2 = addStatRow(controlPanel, "Pages size>30 KB:");

		return controlPanel;
	}

	/**
	 * Builds the titled "Result" panel and the read-only result table
	 * (columns TimeStamp, Action, URL) that it scrolls.
	 */
	private JPanel createResultPanel() {
		resultTable = new JTable(new DefaultTableModel(new Object[][]{},
								new String[]{"TimeStamp","Action","URL"})
			{
				// Results are display-only.
				public boolean isCellEditable(int row,int column)
				{
					return false;
				}
			}
		);

		JPanel resultPanel = new JPanel();
		resultPanel.setBorder(BorderFactory.createTitledBorder("Result"));
		resultPanel.setLayout(new BorderLayout());
		resultPanel.add(new JScrollPane(resultTable),BorderLayout.CENTER);
		return resultPanel;
	}

	/** Adds a right-aligned caption label that starts a new row of the panel. */
	private static void addCaption(JPanel panel, String text) {
		GridBagConstraints constraints = new GridBagConstraints();
		constraints.anchor = GridBagConstraints.EAST;
		constraints.insets = new Insets(5, 5, 0, 0);
		panel.add(new JLabel(text), constraints);
	}

	/** Adds a component that takes the remainder of the current row, ending it. */
	private static void addRowEnd(JPanel panel, Component component,
			int fill, Insets insets) {
		GridBagConstraints constraints = new GridBagConstraints();
		constraints.fill = fill;
		constraints.gridwidth = GridBagConstraints.REMAINDER;
		constraints.insets = insets;
		panel.add(component, constraints);
	}

	/**
	 * Adds a status row (caption plus an initially empty plain-font value
	 * label) and returns the value label so the caller can update it later.
	 */
	private static JLabel addStatRow(JPanel panel, String caption) {
		addCaption(panel, caption);
		JLabel valueLabel = new JLabel();
		valueLabel.setFont(valueLabel.getFont().deriveFont(Font.PLAIN));
		addRowEnd(panel, valueLabel,
				GridBagConstraints.HORIZONTAL, new Insets(5, 5, 0, 5));
		return valueLabel;
	}
	
	/**
	 * Handles an activation of the Crawl/Stop button.
	 *
	 * If a crawl is in progress the crawling flag is cleared and nothing else
	 * happens. Otherwise the start URL, the maximum URL count and the log file
	 * path are validated; all problems found are shown together in a single
	 * error dialog, and only when there are none is the crawl started.
	 */
	private void actionCrawl()
	{
		// While crawling, the button acts as "Stop": turn the flag off and leave.
		if(crawling)
		{
			crawling = false;
			return;
		}

		ArrayList errorList = new ArrayList();

		// The start URL must be present and must be accepted by verifyUrl
		// (a null result is treated as "invalid").
		String startUrl = startField.getText().trim();
		if(startUrl.length()<1)
		{
			errorList.add("Missing Start URL,Please Enter it");
		}
		else if(verifyUrl(startUrl)==null)
		{
			errorList.add("Invalid Start URL,Please check it again");
		}

		// The maximum number of URLs must be a positive integer. The combo box
		// is editable, so the selected item may be absent or arbitrary text.
		int maxUrls = 50;
		Object selectedMax = maxUrlsComboBox.getSelectedItem();
		String max = (selectedMax == null) ? "" : selectedMax.toString().trim();
		if(max.length()<1)
		{
			errorList.add("Invalid number");
		}
		else
		{
			try
			{
				maxUrls = Integer.parseInt(max);
				if(maxUrls < 1)
				{
					errorList.add("Invalid max urls");
				}
			}
			catch(NumberFormatException e)
			{
				// Non-numeric input is reported instead of silently falling
				// back to the default of 50.
				errorList.add("Invalid max urls");
			}
		}

		// A log file path must have been entered.
		String logFile = logFileField.getText().trim();
		if(logFile.length()<1)
		{
			errorList.add("Missing Log file");
		}

		// Report every problem at once, one per line, and do not start the crawl.
		if(errorList.size()>0)
		{
			StringBuffer message = new StringBuffer();
			for(int i = 0;i<errorList.size();i++)
			{
				if(i > 0)
				{
					message.append("\n");
				}
				message.append(errorList.get(i));
			}
			showError(message.toString());
			return;
		}

		// All input is valid: start the crawl.
		crawl(logFile,startUrl,maxUrls);
	}
	
	//crawl,整个程序的核心
	private void crawl(final String logFile,final String startUrl,
			final int maxUrls)
	{
		//用新的线程开始crawl
		Thread thread = new Thread(new Runnable()
			{
				public void run()
				{
					//设置鼠标形状为等待模式;
					setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
					
					//Disable control
					startField.setEnabled(false);
					maxUrlsComboBox.setEnabled(false);
					logFileField.setEnabled(false);
					
					//Switch crawl button to "stop"
					crawlButton.setText("Stop");
					
					//Reset stats
					resultTable.setModel(new DefaultTableModel(new Object[][]{},
							new String[]{"TimeStamp","Action","URL"})
						{
							public boolean isCellEditable(int row,int column)
							{
								return false;
							}
						}
					);
					updateStats(startUrl,0,0,maxUrls,0,0); 
					
					//打开结果日志文件
					try
					{
						logFileWriter = new PrintWriter(new FileWriter(logFile));
					}
					catch(Exception e)
					{
						showError("Unable to open result log file");
					}
					
					//Turn crawling flag on
					crawling = true;
					
					//正式进行crawl
					crawlAction(startUrl,maxUrls);
					
					//Turn crawling flag off
					crawling = false;
					
					//Close result log file
					try
					{
						logFileWriter.close();
					}
					catch(Exception e)
					{
						showError("Unable to close result log file");
					}
					
					//Mark crawl as done
					crawlingLabel2.setText("Done");
					
					//Enable controls
					startField.setEnabled(true);
					maxUrlsComboBox.setEnabled(true);
					logFileField.setEnabled(true);
					
					//Switch crawl button back to crawl
					crawlButton.setText("Crawl");
					
					//Return to default cursor
					setCursor(Cursor.getDefaultCursor());
					
					//show message if there is no any links
					if(resultTable.getRowCount()==0)
					{
						JOptionPane.showMessageDialog(MyCrawlerFrame.this, "There is no links in your start url." +
								"please try another","No any links",JOptionPane.WARNING_MESSAGE);
					}
				}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -