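/*
 * Page residue from the Chongchong Download Site (虫虫下载站) code viewer; not part of the program.
 * Site navigation: Resource downloads | Resource albums | About us
 *
 * File: crawlerui.java
 * Project: a topic-focused web crawler that crawls web pages related to a given topic.
 * Language: Java
 * Viewer control: font size
 */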
package com.crawler;

import java.awt.BorderLayout;
import java.awt.Color;
import java.awt.FlowLayout;
import java.awt.Font;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Vector;
import java.util.logging.FileHandler;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;

import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTabbedPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.text.html.HTMLEditorKit;

import com.manner.MannerGather;
import com.parser.HTTP;
import com.parser.HtmlParser;
import com.parser.Parser;
import com.relative.Relative;

/**
 * Main window of the topic crawler.
 *
 * <p>The window collects the crawl parameters (thread count, depth limit, seed
 * URLs), starts one {@link Crawler} thread per requested worker and shows every
 * discovered URL, every topic-relevant URL and running counters.
 *
 * <p>Threading: all crawl state shared between workers (the two queues, the
 * depth index, the counters and the file indices) is guarded by
 * {@code crawlLock}. Swing components are only touched on the event dispatch
 * thread; workers hand their updates over with {@code invokeLater}.
 */
public class CrawlerUI extends JFrame {
	/** How long an idle worker waits for new URLs before it gives up, in milliseconds. */
	private static final long IDLE_WAIT_MILLIS = 30000L;

	/** Pause after each examined link, in milliseconds. */
	private static final long LINK_DELAY_MILLIS = 1000L;

	/**
	 * Monitor guarding every piece of state shared by the crawler threads.
	 * The original code used {@code synchronized} methods on each Crawler,
	 * which locks a different object per thread and therefore excludes nobody.
	 */
	private final Object crawlLock = new Object();

	private JTabbedPane centerPane;

	private JButton startButton;

	private JButton stopButton;

	private JButton exitButton;

	private JTextField depthField;

	private JTextField threadField;

	private JScrollPane allUrlPane;

	private JScrollPane relativeUrlPane;

	private JScrollPane sitePane;

	private JPanel parameterTab;

	private JPanel resultTab;

	private JTextArea allUrlArea;

	private JTextArea relativeUrlArea;

	private JTextArea siteArea;

	private JLabel depthLabel;

	private JLabel threadLabel;

	private JLabel siteLabel;

	private JLabel statusLabel;

	private JLabel allUrlLabel;

	private JLabel relativeUrlLabel;

	private JPanel toolBar;

	// Counters shown in the status bar; guarded by crawlLock.
	private int allUrl;

	private int relativeUrl;

	private int timeoutUrl;

	// URLs waiting to be crawled; guarded by crawlLock.
	private ArrayList<String> waitingQueue;

	// URLs that have already been taken out of the waiting queue; guarded by crawlLock.
	private ArrayList<String> processedQueue;

	// Depth of every URL that was put into the waiting queue; guarded by crawlLock.
	private HashMap<String, Integer> indexQueue;

	// Stop request flag; written by the UI thread and polled by the workers,
	// hence volatile.
	private volatile boolean stopSearch;

	private Logger logger;

	// Next file index under ./log/relativePage/; guarded by crawlLock.
	private int index1;

	// Next file index under ./log/relativeUrl/; guarded by crawlLock.
	private int index2;

	/** Builds the window, sizes it and shows it. */
	public CrawlerUI() {
		initComponent();
		setSize(650, 600);
		setTitle("topic crawler");
		setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		setVisible(true);
	}

	/**
	 * One crawler worker. Each worker repeatedly takes a URL from the shared
	 * waiting queue, extracts its links and enqueues the links that the
	 * relevance check accepts, until the queue stays empty, the depth limit is
	 * reached or a stop is requested.
	 */
	class Crawler extends Thread {
		private Relative r = new Relative();

		private MannerGather mg = new MannerGather();

		private long startTime;

		private long endTime;

		// Depth at which URLs are no longer crawled.
		private int depthLimit;

		// URL this worker is currently handling.
		String url;

		/**
		 * @param name thread name, also used as prefix of the progress messages
		 * @param depthLimit depth at which URLs are taken from the queue but not crawled
		 */
		public Crawler(String name, int depthLimit) {
			super(name);
			this.depthLimit = depthLimit;
		}

		/**
		 * Runs the crawl loop and always publishes the final statistics,
		 * even when the loop ends with an unexpected runtime exception.
		 * The counters are reset by the Start handler, not here, so that a
		 * late-starting worker cannot wipe the counts of the others.
		 */
		public void run() {
			startTime = System.currentTimeMillis();
			updateStatus();
			try {
				crawlUntilDone();
			} finally {
				endTime = System.currentTimeMillis();
				lastUpdateStatus();
			}
		}

		// Takes URLs from the waiting queue and crawls them until there is nothing left to do.
		private void crawlUntilDone() {
			while (true) {
				url = deWaitingQueue();
				// deWaitingQueue returns the empty string when the queue stayed empty.
				if (url.equals("")) {
					return;
				}
				enProcessedQueue(url);
				appendLine(relativeUrlArea, getName() + "运行");
				int level = levelOf(url);
				// A URL at the depth limit is not crawled. Because other workers may
				// still have queued shallower URLs, keep taking URLs until one below
				// the limit shows up or the queue is drained.
				while (level == depthLimit) {
					// Atomic "check and take": another worker can no longer empty
					// the queue between the emptiness test and the removal.
					String next = pollWaitingQueue();
					if (next == null) {
						appendLine(relativeUrlArea, "超过限定深度" + getName() + "停止");
						return;
					}
					url = next;
					enProcessedQueue(url);
					level = levelOf(url);
					if (stopSearch) {
						break;
					}
				}
				// Only reachable with level == depthLimit after a stop request.
				if (level == depthLimit && isWaitingQueueEmpty()) {
					return;
				}
				// Check for a stop request before crawling ...
				if (stopSearch) {
					appendLine(relativeUrlArea, getName() + "停止");
					return;
				}
				crawler(url, level);
				// ... and again after the crawl returned.
				if (stopSearch) {
					appendLine(relativeUrlArea, getName() + "停止");
					return;
				}
			}
		}

		/**
		 * Crawls one page: downloads and parses it, then examines each of its
		 * links. A failure to fetch or parse the page is reported and ends the
		 * call; a failure on a single link is reported and only skips that link.
		 *
		 * @param url page to crawl
		 * @param level depth of {@code url}; accepted links get {@code level + 1}
		 */
		public void crawler(String url, int level) {
			HTTP http;
			Vector links;
			try {
				HTMLEditorKit.Parser parser = new HtmlParser().getParser();
				Parser p = new Parser(url);
				http = new HTTP();

				parser.parse(new StringReader(http.getBody(url)), p, true);

				Vector v = p.getLinks();
				links = r.delRepeat(v);
			} catch (Exception e) {
				reportFailure(url, e);
				return;
			}

			for (int i = 0; i < links.size(); i++) {
				String checkUrl = (String) links.get(i);
				try {
					if (!examineLink(http, checkUrl, level)) {
						continue;
					}
					Thread.sleep(LINK_DELAY_MILLIS);
				} catch (InterruptedException e) {
					// Keep the interrupt visible to the caller and stop this page.
					Thread.currentThread().interrupt();
					return;
				} catch (Exception e) {
					reportFailure(checkUrl, e);
				}
				if (stopSearch) {
					return;
				}
			}
		}

		/**
		 * Examines one extracted link: counts it, skips it when it was already
		 * processed or is disallowed by the robots check, and, when the
		 * relevance check returns 1, saves the page and enqueues the link.
		 *
		 * @return {@code false} when the link was skipped (already visited, not
		 *         allowed, or the relevance check returned -1), {@code true} otherwise
		 * @throws Exception whatever the project helpers or the file writes throw;
		 *         their exact exception types are not visible from this class
		 */
		private boolean examineLink(HTTP http, String checkUrl, int level)
				throws Exception {
			getAllUrl(checkUrl);

			if (urlHasBeenVisited(checkUrl)) {
				return false;
			}
			if (!mg.isRobotAllowed(new URL(checkUrl))) {
				return false;
			}
			// Result of the relevance check: -1 is counted as a timeout,
			// 1 marks the link as relevant.
			int returnValue = r.urlRelative(checkUrl);
			if (returnValue == -1) {
				getTimeoutUrl();
				return false;
			}
			if (returnValue == 1) {
				int pageIndex = claimRelativeUrl(checkUrl, level + 1);
				if (pageIndex >= 0) {
					writeText("./log/relativePage/" + pageIndex + ".txt", http
							.getBody(checkUrl));
					getRelativeUrl(checkUrl);
				}
			}
			return true;
		}

		/**
		 * Records the depth of a relevant URL and reserves the next page file
		 * index, unless the URL is already waiting.
		 *
		 * @return the reserved file index, or -1 when the URL is already in the waiting queue
		 */
		private int claimRelativeUrl(String checkUrl, int depth) {
			synchronized (crawlLock) {
				if (waitingQueue.contains(checkUrl)) {
					return -1;
				}
				indexQueue.put(checkUrl, Integer.valueOf(depth));
				return index1++;
			}
		}

		// Reports a crawl failure instead of silently dropping it.
		private void reportFailure(String failedUrl, Exception e) {
			System.err.println(getName() + ": failed to process " + failedUrl
					+ ": " + e);
		}

		// Returns the recorded depth of a URL taken from the waiting queue.
		// Every URL is indexed before it is enqueued, so the lookup cannot miss.
		private int levelOf(String queuedUrl) {
			synchronized (crawlLock) {
				return indexQueue.get(queuedUrl).intValue();
			}
		}

		/**
		 * Writes {@code text}, encoded as GBK, to the file at {@code path}.
		 * The stream is closed even when the write fails.
		 *
		 * @throws Exception if the file cannot be opened or written
		 */
		public void writeText(String path, String text) throws Exception {
			FileOutputStream out = new FileOutputStream(path);
			try {
				out.write(text.getBytes("gbk"));
			} finally {
				out.close();
			}
		}

		/** Counts one timed-out URL and refreshes the status bar. */
		public void getTimeoutUrl() {
			synchronized (crawlLock) {
				timeoutUrl++;
			}
			updateStatus();
		}

		/**
		 * Enqueues a relevant URL, shows it in the relevant-URL area, counts it
		 * and writes it to its own log file under {@code ./log/relativeUrl/}.
		 * The file handler is detached and closed again right after the record
		 * is written, so log files do not stay open and do not collect the
		 * records of later URLs.
		 *
		 * @throws IOException if the log file cannot be created
		 */
		public void getRelativeUrl(String url) throws IOException {
			// The logger is shared, so attaching, logging and detaching must
			// not interleave between workers.
			synchronized (crawlLock) {
				enWaitingQueue(url);

				appendLine(relativeUrlArea, url);
				relativeUrl++;

				FileHandler handler = new FileHandler("./log/relativeUrl/"
						+ (index2++) + ".txt");
				try {
					handler.setFormatter(new SimpleFormatter());
					logger.addHandler(handler);
					logger.setUseParentHandlers(false);
					logger.info(url);
				} finally {
					logger.removeHandler(handler);
					handler.close();
				}
			}
			updateStatus();
		}

		/**
		 * Shows a discovered URL in the all-URL area and counts it.
		 *
		 * @throws IOException never thrown by this implementation; kept for callers
		 */
		public void getAllUrl(String url) throws IOException {
			appendLine(allUrlArea, url);
			synchronized (crawlLock) {
				allUrl++;
			}
			updateStatus();
		}

		/** Shows the current counters in the status bar. */
		public void updateStatus() {
			// Posting while holding the lock keeps the displayed snapshots in
			// the order in which they were taken.
			synchronized (crawlLock) {
				showStatus(countsText());
			}
		}

		/** Shows the final counters plus this worker's elapsed time and speed. */
		public void lastUpdateStatus() {
			synchronized (crawlLock) {
				long elapsedMillis = endTime - startTime;
				showStatus(countsText()
						+ "    time:" + ((double) elapsedMillis) / 1000
						+ "s   speed: " + (double) (1000 * allUrl)
						/ elapsedMillis + "个/s");
			}
		}

		// Formats the three counters; call with crawlLock held.
		private String countsText() {
			return "allUrl: " + allUrl + "   relativeUrl: "
					+ relativeUrl + "    timeoutUrl: " + timeoutUrl;
		}

		/** Appends a relevant URL to the waiting queue. */
		public void enWaitingQueue(String url) {
			synchronized (crawlLock) {
				waitingQueue.add(url);
			}
		}

		/**
		 * Takes the next URL from the waiting queue. When the queue is empty
		 * the worker waits 30 seconds, without holding the lock, so that
		 * surplus workers (more threads than seeds) can pick up URLs that the
		 * busy workers discover in the meantime.
		 *
		 * @return the next URL, or the empty string when the queue is still
		 *         empty after the wait
		 */
		public String deWaitingQueue() {
			String next = pollWaitingQueue();
			if (next != null) {
				return next;
			}
			try {
				Thread.sleep(IDLE_WAIT_MILLIS);
			} catch (InterruptedException e) {
				// Stop waiting early but keep the interrupt visible.
				Thread.currentThread().interrupt();
			}
			next = pollWaitingQueue();
			return next == null ? "" : next;
		}

		// Removes and returns the head of the waiting queue, or null when it is empty.
		private String pollWaitingQueue() {
			synchronized (crawlLock) {
				return waitingQueue.isEmpty() ? null : waitingQueue.remove(0);
			}
		}

		// Tells whether the waiting queue is empty right now.
		private boolean isWaitingQueueEmpty() {
			synchronized (crawlLock) {
				return waitingQueue.isEmpty();
			}
		}

		/** Records a URL as processed. */
		public void enProcessedQueue(String url) {
			synchronized (crawlLock) {
				processedQueue.add(url);
			}
		}

		/** @return {@code true} when the URL has already been recorded as processed */
		public boolean urlHasBeenVisited(String url) {
			synchronized (crawlLock) {
				return processedQueue.contains(url);
			}
		}
	}

	// Appends one line to a text area on the event dispatch thread.
	private void appendLine(final JTextArea area, final String line) {
		javax.swing.SwingUtilities.invokeLater(new Runnable() {
			public void run() {
				area.append(line + "\n");
			}
		});
	}

	// Replaces the status bar text on the event dispatch thread.
	private void showStatus(final String text) {
		javax.swing.SwingUtilities.invokeLater(new Runnable() {
			public void run() {
				statusLabel.setText(text);
			}
		});
	}

	/** Requests all crawler threads to stop at their next check point. */
	public void stopSearch() {
		stopSearch = true;
	}

	/** Resets the crawl state, creates every UI component and lays the window out. */
	public void initComponent() {
		synchronized (crawlLock) {
			timeoutUrl = 0;
			index1 = 1;
			index2 = 1;
			waitingQueue = new ArrayList<String>();
			processedQueue = new ArrayList<String>();
			indexQueue = new HashMap<String, Integer>();
		}
		logger = Logger.getLogger("url");
		stopSearch = false;

		centerPane = new JTabbedPane();
		startButton = new JButton();
		stopButton = new JButton();
		exitButton = new JButton();
		depthField = new JTextField();
		threadField = new JTextField();

		allUrlPane = new JScrollPane();
		relativeUrlPane = new JScrollPane();
		sitePane = new JScrollPane();

		toolBar = new JPanel();
		resultTab = new JPanel();
		parameterTab = new JPanel();
		allUrlArea = new JTextArea();
		relativeUrlArea = new JTextArea();
		siteArea = new JTextArea();

		depthLabel = new JLabel();
		threadLabel = new JLabel();
		siteLabel = new JLabel();
		statusLabel = new JLabel();
		allUrlLabel = new JLabel();
		relativeUrlLabel = new JLabel();

		Color panelGray = new Color(204, 204, 204);
		Font buttonFont = new Font("Arial", Font.BOLD, 11);

		setBackground(new Color(153, 153, 255));
		addWindowListener(new WindowAdapter() {
			public void windowClosing(WindowEvent evt) {
				exitForm(evt);
			}
		});

		// Tool bar with the three command buttons.
		toolBar.setLayout(new FlowLayout(FlowLayout.LEFT));
		toolBar.setBackground(panelGray);

		startButton.setFont(buttonFont);
		startButton.setText("Start");
		startButton.setToolTipText("Start the search");
		startButton.addActionListener(new ActionListener() {
			public void actionPerformed(ActionEvent evt) {
				startButtonActionPerformed(evt);
			}
		});
		toolBar.add(startButton);

		stopButton.setFont(buttonFont);
		stopButton.setText("Stop");
		stopButton.setToolTipText("Stop the search ");
		stopButton.addActionListener(new ActionListener() {
			public void actionPerformed(ActionEvent evt) {
				stopButtonActionPerformed(evt);
			}
		});
		toolBar.add(stopButton);

		exitButton.setFont(buttonFont);
		exitButton.setText("Exit");
		exitButton.addActionListener(new ActionListener() {
			public void actionPerformed(ActionEvent evt) {
				exitButtonActionPerformed(evt);
			}
		});
		toolBar.add(exitButton);

		getContentPane().add(toolBar, BorderLayout.NORTH);

		// "Search Parameters" tab: absolute layout, one row per parameter.
		centerPane.setBorder(new javax.swing.border.EtchedBorder());
		parameterTab.setLayout(null);
		parameterTab.setBackground(panelGray);

		threadLabel.setText("threads : ");
		parameterTab.add(threadLabel);
		threadLabel.setBounds(20, 30, 250, 15);

		threadField.setColumns(8);
		threadField.setText("1");
		parameterTab.add(threadField);
		threadField.setBounds(260, 30, 70, 21);

		depthLabel.setText("depth : ");
		parameterTab.add(depthLabel);
		depthLabel.setBounds(20, 80, 230, 15);

		depthField.setColumns(8);
		depthField.setText("5");
		parameterTab.add(depthField);
		depthField.setBounds(260, 80, 70, 21);

		siteLabel.setText("starting site: ");
		parameterTab.add(siteLabel);
		siteLabel.setBounds(20, 140, 120, 15);

		sitePane.setViewportView(siteArea);
		parameterTab.add(sitePane);
		sitePane.setBounds(260, 140, 200, 150);

		centerPane.addTab("Search Parameters", parameterTab);

		// "Search result" tab: all URLs on the left, relevant URLs on the right.
		resultTab.setLayout(null);

		allUrlLabel.setText("all url: ");
		resultTab.add(allUrlLabel);
		allUrlLabel.setBounds(10, 20, 120, 15);

		allUrlPane.setBackground(panelGray);
		allUrlPane.setViewportView(allUrlArea);
		resultTab.add(allUrlPane);
		allUrlPane.setBounds(10, 50, 250, 300);

		relativeUrlLabel.setText("relative url: ");
		resultTab.add(relativeUrlLabel);
		relativeUrlLabel.setBounds(370, 20, 120, 15);

		relativeUrlPane.setBackground(panelGray);
		relativeUrlPane.setViewportView(relativeUrlArea);
		resultTab.add(relativeUrlPane);
		relativeUrlPane.setBounds(370, 50, 250, 300);

		centerPane.addTab("Search result", resultTab);

		getContentPane().add(centerPane, BorderLayout.CENTER);

		statusLabel.setText("Inactive");
		getContentPane().add(statusLabel, BorderLayout.SOUTH);
	}

	/** Handles the Stop button: requests the crawl to stop. */
	public void stopButtonActionPerformed(ActionEvent evt) {
		stopSearch();
	}

	/**
	 * Handles the Start button: reads the parameters, queues every non-blank
	 * seed line at depth 0, clears the counters and any earlier stop request,
	 * switches to the result tab and starts the requested number of workers.
	 * Non-numeric depth or thread input is reported in the status bar and
	 * nothing is started.
	 */
	public void startButtonActionPerformed(ActionEvent evt) {
		int depthLimit;
		int threadLimit;
		try {
			depthLimit = Integer.parseInt(depthField.getText().trim());
			threadLimit = Integer.parseInt(threadField.getText().trim());
		} catch (NumberFormatException e) {
			statusLabel.setText("Invalid depth or thread count: "
					+ e.getMessage());
			return;
		}

		// Clear a stop request from an earlier run; otherwise the new workers
		// would stop immediately.
		stopSearch = false;
		BufferedReader in = new BufferedReader(new StringReader(siteArea
				.getText()));
		synchronized (crawlLock) {
			allUrl = 0;
			relativeUrl = 0;
			timeoutUrl = 0;
			try {
				String s;
				while ((s = in.readLine()) != null) {
					String seed = s.trim();
					// An empty entry would be mistaken for "queue drained".
					if (seed.length() == 0) {
						continue;
					}
					waitingQueue.add(seed);
					indexQueue.put(seed, Integer.valueOf(0));
				}
			} catch (IOException ignored) {
				// Reading from an in-memory StringReader that is still open
				// does not fail.
			}
		}

		centerPane.setSelectedIndex(1);
		for (int i = 0; i < threadLimit; i++) {
			new Crawler("crawler" + i, depthLimit).start();
		}
	}

	/** Handles the Exit button: terminates the application. */
	public void exitButtonActionPerformed(ActionEvent evt) {
		System.exit(0);
	}

	/** Handles the window close event: terminates the application. */
	public void exitForm(WindowEvent evt) {
		System.exit(0);
	}

	/** Creates and shows the crawler window on the event dispatch thread. */
	public static void main(String[] args) {
		javax.swing.SwingUtilities.invokeLater(new Runnable() {
			public void run() {
				new CrawlerUI();
			}
		});
	}
}

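/*
 * Keyboard shortcuts of the code viewer page (page residue; not part of the program):
 *   Copy code          Ctrl + C
 *   Search code        Ctrl + F
 *   Full-screen mode   F11
 *   Switch theme       Ctrl + Shift + D
 *   Show shortcuts     ?
 *   Increase font size Ctrl + =
 *   Decrease font size Ctrl + -
 */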