📄 webcrawler.java
字号:
import java.applet.Applet;
import java.awt.*;
import java.awt.*;
import java.awt.event.*;
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.*;
import java.lang.String.*;
import javax.swing.text.html.*;
import javax.swing.text.*;
import java.util.Formatter;
import java.util.Locale;
public class WebCrawler extends Applet implements ActionListener, Runnable {
// Label of the button that starts a crawl (used when building the UI in init()).
public static final String SEARCH = "Search";
// Label of the button that stops a crawl (used when building the UI in init()).
public static final String STOP = "Stop";
// robots.txt directive that robotSafe() scans for.
public static final String DISALLOW = "Disallow:";
// NOTE(review): not referenced in the visible part of the file; presumably an
// upper bound on the number of URLs to process -- confirm against the rest of run().
public static final int SEARCH_LIMIT = 1000;
// NOTE(review): purpose not visible in this part of the file.
int i=0;
// Root panel that holds every other UI component; built in init().
Panel panelMain;
// Result area that init() seeds with the heading "站内" (in-site URLs).
TextArea textinMatches;
// Result area that init() seeds with the heading "站外" (off-site URLs).
TextArea textoutMatches;
// Status line placed underneath the two result areas.
Label labelStatus;
// URLs still waiting to be searched; run() always takes the element at index 0.
Vector vectorToSearch;
// URLs that have already been taken off the to-search list.
Vector vectorSearched;
// URLs which match
Vector vectorMatches;
// Thread currently allowed to crawl; stop() sets it to null and the loops in
// run() and robotSafe() exit once it no longer equals the current thread.
Thread searchThread;
// Input field for the starting URL.
TextField textURL;
// Input field for the search depth; run() parses its text as an int.
TextField textdepth;
/**
 * Builds the applet's user interface and initializes the crawl state.
 *
 * <p>Layout, top to bottom inside {@code panelMain}: the URL entry row, the
 * search-depth entry row, then the results section (heading, in-site result
 * area, off-site result area, status label) followed by the Search/Stop
 * buttons. Both buttons report their clicks to this applet.
 *
 * <p>Components are attached with {@code add(Component, Object)} and the
 * {@code BorderLayout} constants instead of the obsolete
 * {@code add(String, Component)} overload.
 */
public void init() {
    // set up the main UI panel
    panelMain = new Panel();
    panelMain.setLayout(new BorderLayout(5, 5));

    // starting-URL entry row
    Panel panelEntry = new Panel();
    panelEntry.setLayout(new BorderLayout(5, 5));
    Panel panelURL = new Panel();
    panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
    Label labelURL = new Label(" 输入URL: ", Label.RIGHT);
    panelURL.add(labelURL);
    textURL = new TextField("", 40);
    panelURL.add(textURL);
    panelEntry.add(panelURL, BorderLayout.NORTH);
    panelMain.add(panelEntry, BorderLayout.NORTH);

    // search-depth entry row
    Panel panelEntrydepth = new Panel();
    panelEntrydepth.setLayout(new BorderLayout(5, 5));
    Panel paneldepth = new Panel();
    paneldepth.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
    Label labeldepth = new Label("输入搜索深度: ", Label.RIGHT);
    paneldepth.add(labeldepth);
    textdepth = new TextField("", 40);
    paneldepth.add(textdepth);
    panelEntrydepth.add(paneldepth, BorderLayout.NORTH);
    panelMain.add(panelEntrydepth, BorderLayout.CENTER);

    // result areas: heading on top, in-site list, off-site list, status line
    Panel panelListButtons = new Panel();
    panelListButtons.setLayout(new BorderLayout(5, 5));
    Panel panelList = new Panel();
    panelList.setLayout(new BorderLayout(5, 5));
    Label labelResults = new Label("搜索结果:");
    panelList.add(labelResults, BorderLayout.NORTH);
    Panel panelListCurrent = new Panel();
    panelListCurrent.setLayout(new BorderLayout(5, 5));
    textinMatches = new TextArea("站内" + "\n", 10, 10);
    panelListCurrent.add(textinMatches, BorderLayout.NORTH);
    textoutMatches = new TextArea("站外" + "\n", 10, 10);
    panelListCurrent.add(textoutMatches, BorderLayout.CENTER);
    labelStatus = new Label("");
    panelListCurrent.add(labelStatus, BorderLayout.SOUTH);
    panelList.add(panelListCurrent, BorderLayout.SOUTH);
    panelListButtons.add(panelList, BorderLayout.NORTH);

    // control buttons
    Panel panelButtons = new Panel();
    Button buttonSearch = new Button(SEARCH);
    buttonSearch.addActionListener(this);
    panelButtons.add(buttonSearch);
    Button buttonStop = new Button(STOP);
    buttonStop.addActionListener(this);
    panelButtons.add(buttonStop);
    panelListButtons.add(panelButtons, BorderLayout.SOUTH);
    panelMain.add(panelListButtons, BorderLayout.SOUTH);

    add(panelMain);
    setVisible(true);
    repaint();

    // initialize search data structures
    vectorToSearch = new Vector();
    vectorSearched = new Vector();
    vectorMatches = new Vector();

    // set default for URL access
    URLConnection.setDefaultAllowUserInteraction(false);
}
/**
 * Applet lifecycle hook. Intentionally empty: this method starts nothing.
 */
public void start() {
    // nothing to do
}
/**
 * Requests that a running search stop.
 *
 * <p>Clears {@code searchThread}; the loops in {@code run()} and
 * {@code robotSafe()} compare the current thread against that field and
 * exit once it no longer matches. Does nothing when no search thread is set.
 */
public void stop() {
    if (searchThread == null) {
        // no search in progress
        return;
    }
    setStatus("stopping...");
    searchThread = null;
}
/**
 * Applet lifecycle hook. Intentionally empty: this method releases nothing.
 */
public void destroy() {
    // nothing to do
}
/**
 * Checks the site's robots.txt to decide whether the crawler may fetch
 * {@code url} (crawler politeness rule).
 *
 * <p>The robots.txt file is fetched over HTTP from the root of the same host
 * and port as {@code url}. Every {@code Disallow:} entry in it is assumed to
 * apply to this crawler, regardless of the user-agent section it appears in.
 *
 * @param url the page the crawler wants to visit
 * @return {@code false} if the robots.txt URL cannot be formed or if the file
 *         part of {@code url} starts with a disallowed path; {@code true}
 *         otherwise, including when robots.txt is missing, empty or unreadable
 */
boolean robotSafe(URL url) {
    // form URL of the robots.txt file; keep the port so that a site served on
    // a non-default port is checked against its own robots.txt
    URL urlRobot;
    try {
        urlRobot = new URL("http", url.getHost(), url.getPort(), "/robots.txt");
    } catch (MalformedURLException e) {
        // something weird is happening, so don't trust it
        return false;
    }

    String strCommands;
    try {
        InputStream urlRobotStream = urlRobot.openStream();
        try {
            // Collect the raw bytes first and decode once at the end, so a
            // multi-byte character split across two reads is not corrupted.
            ByteArrayOutputStream contents = new ByteArrayOutputStream();
            byte[] b = new byte[4096];
            int numRead = urlRobotStream.read(b);
            // An empty file yields -1 on the very first read; the loop is then
            // skipped and the commands string stays empty.
            while (numRead != -1) {
                contents.write(b, 0, numRead);
                // stop reading as soon as the search has been cancelled
                if (Thread.currentThread() != searchThread) {
                    break;
                }
                numRead = urlRobotStream.read(b);
            }
            strCommands = contents.toString("UTF-8");
        } finally {
            // release the connection even when a read fails
            urlRobotStream.close();
        }
    } catch (IOException e) {
        // if there is no robots.txt file, it is OK to search
        return true;
    }

    // assume that this robots.txt refers to us and
    // search for "Disallow:" commands
    String strURL = url.getFile();
    int index = 0;
    while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
        index += DISALLOW.length();

        // The path belongs to the same line as the directive; an empty
        // "Disallow:" must not pick up a token from the following line.
        int lineEnd = index;
        while (lineEnd < strCommands.length()) {
            char c = strCommands.charAt(lineEnd);
            if (c == '\n' || c == '\r') {
                break;
            }
            lineEnd++;
        }

        StringTokenizer st = new StringTokenizer(strCommands.substring(index, lineEnd));
        if (!st.hasMoreTokens()) {
            // empty Disallow value: nothing is disallowed by this entry
            continue;
        }
        String strBadPath = st.nextToken();
        // if the URL starts with a disallowed path, it is not safe
        if (strURL.startsWith(strBadPath)) {
            return false;
        }
    }
    return true;
}
/**
 * Paints the applet: a one-pixel rectangle along the edge of the display
 * area, then the main panel and its child components.
 *
 * @param g graphics context to draw into
 */
public void paint(Graphics g) {
    // outline the applet's display area
    Dimension area = getSize();
    g.drawRect(0, 0, area.width - 1, area.height - 1);

    // let the main panel draw itself and then its children
    panelMain.paint(g);
    panelMain.paintComponents(g);
}
public void run() {
String strURL = textURL.getText();
int intdepth=Integer.parseInt(textdepth.getText());
int numberSearched = 0;
int numberFound = 0; // to statistic the the number of pages
int pageFound=0; // to statistic the total number of pages
int inpageFound=1; // to statistic the number of pages which inside
int outpageFound=1; // to statistic the the number of pages which outstation
if (strURL.length() == 0) {
setStatus("ERROR: must enter a starting URL");
return;
}
// initialize search data structures
vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();
vectorToSearch.addElement(strURL);
while ((vectorToSearch.size() > 0)
&& (Thread.currentThread() == searchThread)) {
// get the first element from the to be searched list
strURL = (String) vectorToSearch.elementAt(0);
setStatus("searching " + strURL);
URL url;
try {
url = new URL(strURL);
} catch (MalformedURLException e) {
setStatus("ERROR: invalid URL " + strURL);
break;
}
// mark the URL as searched (we want this one way or the other)
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);
// can only search http: protocol URLs
if (url.getProtocol().compareTo("http") != 0)
break;
// test to make sure it is before searching
if (!robotSafe(url)){
setStatus("ERROR: NO! it forgive to view the webpage:" + strURL);
break;
}
try {
// try opening the URL
URLConnection urlConnection = url.openConnection();
//使用User-agent向服务器表明自己的身份
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -