⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webcrawler.java

📁 这是一个用JAVA写的网络蜘蛛
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
import java.applet.Applet;
import java.awt.*;
import java.awt.*;
import java.awt.event.*;
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.*;
import java.lang.String.*;
import javax.swing.text.html.*;
import javax.swing.text.*;
import java.util.Formatter;
import java.util.Locale;

public class WebCrawler extends Applet implements ActionListener, Runnable {
	// Captions of the two control buttons created in init().
	public static final String SEARCH = "Search";
        public static final String STOP = "Stop";
        // robots.txt directive prefix that robotSafe() scans for.
        public static final String DISALLOW = "Disallow:";
        // NOTE(review): presumably an upper bound on the size of a crawl; it is not
        // referenced in the visible part of the file -- confirm against the rest.
        public static final int SEARCH_LIMIT = 1000;
        // NOTE(review): no use of this counter is visible in this part of the file.
        int i=0;
        // Root panel that holds the whole UI; built in init() and painted in paint().
        Panel panelMain;
        // Result areas created in init(): textinMatches starts with the caption "站内"
        // and textoutMatches with "站外" (presumably in-site vs. off-site URLs --
        // confirm against the code that fills them).
	TextArea textinMatches;
        TextArea textoutMatches;
        // Status line at the bottom of the result panel; created empty in init().
	Label labelStatus;
         
	// URLs still waiting to be searched; run() always takes the first element
	Vector vectorToSearch;
        
        // URLs already searched
	Vector vectorSearched;

	// URLs which match
	Vector vectorMatches;
       
	// Thread that is allowed to keep crawling: run() and robotSafe() compare it with
	// Thread.currentThread(), and stop() sets it to null.
	Thread searchThread;
        // Text field holding the start URL; read at the beginning of run()
	TextField textURL;
        // Text field holding the search depth; run() parses its text as an int
        TextField textdepth;
	/**
	 * Builds the applet's user interface and allocates the crawl bookkeeping lists.
	 *
	 * <p>Layout of {@code panelMain} (a BorderLayout):
	 * <ul>
	 *   <li>North: the URL entry row ({@code textURL});</li>
	 *   <li>Center: the search-depth entry row ({@code textdepth});</li>
	 *   <li>South: the two result text areas, the status label and the
	 *       Search/Stop buttons, both of which report to this applet's
	 *       ActionListener implementation.</li>
	 * </ul>
	 *
	 * <p>Children are attached with {@code add(Component, Object)} and the
	 * BorderLayout constants; the {@code add(String, Component)} overload used
	 * previously is documented as obsolete. A panel that was created but never
	 * added to the UI has been dropped.
	 */
	public void init() {
		// set up the main UI panel
		panelMain = new Panel();
		panelMain.setLayout(new BorderLayout(5, 5));

		// URL entry row
		Panel panelEntry = new Panel();
		panelEntry.setLayout(new BorderLayout(5, 5));

		Panel panelURL = new Panel();
		panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
		Label labelURL = new Label("        输入URL: ", Label.RIGHT);
		panelURL.add(labelURL);
		textURL = new TextField("", 40);
		panelURL.add(textURL);
		panelEntry.add(panelURL, BorderLayout.NORTH);
		panelMain.add(panelEntry, BorderLayout.NORTH);

		// search-depth entry row
		Panel panelEntrydepth = new Panel();
		panelEntrydepth.setLayout(new BorderLayout(5, 5));
		Panel paneldepth = new Panel();
		paneldepth.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
		Label labeldepth = new Label("输入搜索深度: ", Label.RIGHT);
		paneldepth.add(labeldepth);
		textdepth = new TextField("", 40);
		paneldepth.add(textdepth);
		panelEntrydepth.add(paneldepth, BorderLayout.NORTH);
		panelMain.add(panelEntrydepth, BorderLayout.CENTER);

		// result area: heading, the two text areas and the status line
		Panel panelListButtons = new Panel();
		panelListButtons.setLayout(new BorderLayout(5, 5));
		Panel panelList = new Panel();
		panelList.setLayout(new BorderLayout(5, 5));
		Label labelResults = new Label("搜索结果:");
		panelList.add(labelResults, BorderLayout.NORTH);
		Panel panelListCurrent = new Panel();
		panelListCurrent.setLayout(new BorderLayout(5, 5));
		textinMatches = new TextArea("站内" + "\n", 10, 10);
		panelListCurrent.add(textinMatches, BorderLayout.NORTH);
		textoutMatches = new TextArea("站外" + "\n", 10, 10);
		panelListCurrent.add(textoutMatches, BorderLayout.CENTER);
		labelStatus = new Label("");
		panelListCurrent.add(labelStatus, BorderLayout.SOUTH);
		panelList.add(panelListCurrent, BorderLayout.SOUTH);

		panelListButtons.add(panelList, BorderLayout.NORTH);

		// control buttons; their action events are delivered to this applet
		Panel panelButtons = new Panel();
		Button buttonSearch = new Button(SEARCH);
		buttonSearch.addActionListener(this);
		panelButtons.add(buttonSearch);
		Button buttonStop = new Button(STOP);
		buttonStop.addActionListener(this);
		panelButtons.add(buttonStop);

		panelListButtons.add(panelButtons, BorderLayout.SOUTH);
		panelMain.add(panelListButtons, BorderLayout.SOUTH);
		add(panelMain);
		setVisible(true);
		repaint();

		// initialize search data structures
		vectorToSearch = new Vector();
		vectorSearched = new Vector();
		vectorMatches = new Vector();

		// URL connections opened later must not prompt the user
		URLConnection.setDefaultAllowUserInteraction(false);
	}

	/**
	 * Applet start hook. The body is empty: nothing is launched from here.
	 * NOTE(review): the search thread is presumably started from the button
	 * handler, which is not visible in this part of the file -- confirm.
	 */
	public void start() {
        }

	/**
	 * Asks a running search to finish.
	 *
	 * <p>When a search thread is registered, a "stopping..." status is shown and
	 * the reference is cleared. The loops in run() and robotSafe() compare
	 * Thread.currentThread() with searchThread, so clearing it makes them exit
	 * at their next check. Without a registered thread this is a no-op.
	 */
	public void stop() {
		if (searchThread == null) {
			// no search is registered, nothing to stop
			return;
		}
		setStatus("stopping...");
		searchThread = null;
	}

	/**
	 * Applet destroy hook. The body is empty: this method releases nothing.
	 */
	public void destroy() {
	}
        
        //构造robotSafe函数分析网站的robot,遵守礼貌原则
	/**
	 * Consults the host's robots.txt to decide whether {@code url} may be fetched.
	 *
	 * <p>The file is requested over HTTP from the same host and port as
	 * {@code url}. Every {@code Disallow:} line is assumed to apply to this
	 * crawler (User-agent sections are not interpreted, and the directive name
	 * is matched case-sensitively). The URL is rejected when its file part
	 * starts with one of the disallowed prefixes.
	 *
	 * <p>Reading stops early, and the decision is made on what was read so far,
	 * once the calling thread is no longer {@code searchThread}.
	 *
	 * @param url the page about to be fetched
	 * @return {@code false} if the robots.txt URL cannot be formed or a
	 *         Disallow prefix matches the URL; {@code true} otherwise,
	 *         including when robots.txt is missing, unreadable or empty
	 */
	boolean robotSafe(URL url) {
		String strHost = url.getHost();

		// form URL of the robots.txt file; keep the port so that a site served
		// on a non-default port is asked for its own robots.txt (-1 = default)
		URL urlRobot;
		try {
			urlRobot = new URL("http", strHost, url.getPort(), "/robots.txt");
		} catch (MalformedURLException e) {
			// something weird is happening, so don't trust it
			return false;
		}

		String strCommands;
		try {
			InputStream urlRobotStream = urlRobot.openStream();
			try {
				// Collect raw bytes and decode once at the end: this copes with
				// an empty file (first read returns -1) and with multi-byte
				// characters that straddle two reads.
				ByteArrayOutputStream collected = new ByteArrayOutputStream();
				byte[] b = new byte[4096];
				int numRead;
				while ((numRead = urlRobotStream.read(b)) != -1) {
					collected.write(b, 0, numRead);
					// stop reading as soon as the search has been cancelled
					if (Thread.currentThread() != searchThread) {
						break;
					}
				}
				strCommands = collected.toString("UTF-8");
			} finally {
				// always release the connection, even when a read fails
				urlRobotStream.close();
			}
		} catch (IOException e) {
			// if there is no robots.txt file, it is OK to search
			return true;
		}

		// an empty file part denotes the site root
		String strURL = url.getFile();
		if (strURL.length() == 0) {
			strURL = "/";
		}

		// assume that this robots.txt refers to us and look at every
		// "Disallow:" command
		int index = 0;
		while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
			index += DISALLOW.length();

			// the value of a directive ends with its line
			int lineEnd = index;
			while (lineEnd < strCommands.length()
					&& strCommands.charAt(lineEnd) != '\n'
					&& strCommands.charAt(lineEnd) != '\r') {
				lineEnd++;
			}
			StringTokenizer st = new StringTokenizer(strCommands.substring(index, lineEnd));

			// an empty (or comment-only) Disallow forbids nothing
			if (!st.hasMoreTokens()) {
				continue;
			}
			String strBadPath = st.nextToken();
			if (strBadPath.startsWith("#")) {
				continue;
			}

			// if the URL starts with a disallowed path, it is not safe
			if (strURL.startsWith(strBadPath)) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Paints the applet: a one-pixel frame along the edge of its display area,
	 * followed by the main panel and the panel's child components.
	 *
	 * @param g graphics context to draw into
	 */
	public void paint(Graphics g) {
		// frame the display area; the -1 keeps the right and bottom edges inside it
		Dimension area = getSize();
		int frameWidth = area.width - 1;
		int frameHeight = area.height - 1;
		g.drawRect(0, 0, frameWidth, frameHeight);

		// render the UI panel itself, then everything it contains
		panelMain.paint(g);
		panelMain.paintComponents(g);
	}

	public void run() {
                
		String strURL = textURL.getText();
                int intdepth=Integer.parseInt(textdepth.getText());
                
		int numberSearched = 0;        
		int numberFound = 0;            // to statistic the the number of pages
                int pageFound=0;                // to statistic the total number of pages
                int inpageFound=1;              // to statistic  the number of pages which inside
                int outpageFound=1;             // to statistic the the number of pages which outstation
		if (strURL.length() == 0) {
			setStatus("ERROR: must enter a starting URL");
			return;
		}

		// initialize search data structures
		vectorToSearch.removeAllElements();
		vectorSearched.removeAllElements();
		vectorMatches.removeAllElements();
		

		vectorToSearch.addElement(strURL);

		while ((vectorToSearch.size() > 0)
				&& (Thread.currentThread() == searchThread)) {
			// get the first element from the to be searched list
			strURL = (String) vectorToSearch.elementAt(0);

			setStatus("searching " + strURL);

			URL url;
			try {
				url = new URL(strURL);
			} catch (MalformedURLException e) {
				setStatus("ERROR: invalid URL " + strURL);
				break;
			}

			// mark the URL as searched (we want this one way or the other)
			vectorToSearch.removeElementAt(0);
			vectorSearched.addElement(strURL);

			// can only search http: protocol URLs
			if (url.getProtocol().compareTo("http") != 0)
				break;

			// test to make sure it is before searching
			if (!robotSafe(url)){
                             setStatus("ERROR: NO! it forgive to view the webpage:" + strURL);
			    break;
                        }
			try {
				// try opening the URL
				URLConnection urlConnection = url.openConnection();
                                //使用User-agent向服务器表明自己的身份

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -