📄 webcrawler.java
字号:
urlConnection.setRequestProperty("User-Agent", "jpspider");
urlConnection.setAllowUserInteraction(false);
InputStream urlStream = url.openStream();
String type = URLConnection.guessContentTypeFromStream(urlStream);
// search the input stream for links
// first, read in the entire URL
byte b[] = new byte[4096];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();
if (Thread.currentThread() != searchThread)
break;
String lowerCaseContent = content.toLowerCase();
int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;
if (Thread.currentThread() != searchThread)
break;
index++;
String remaining = content.substring(index);
StringTokenizer st = new StringTokenizer(remaining,
"\t\n\r\">#");
String strLink = st.nextToken();
URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
}
if(strLink.startsWith("mailto:")){
setStatus("ERROR: can't open the: " + strLink);
break;
}
//不抓去那些大小大于 50KB的网页
if(urlLink.openConnection().getContentLength()>409600){
setStatus("ERROR:the webpage is bigger than 50 KB " + strLink);
break;
}
if (Thread.currentThread() != searchThread)
break;
//to make sure the URL belong to instationg or out station
// we only get the homepage if the URL belong to outstation
if(!(urlLink.getHost().toString().equalsIgnoreCase(textURL.getText().substring(7)))){
i=0;
Date date=new Date();
String t=date.toString();
pageFound=numberFound+1;
if ((!vectorSearched.contains(strLink))&& (!vectorToSearch.contains(strLink))&&robotSafe(urlLink)
&&(!vectorMatches.contains(strLink))){
textoutMatches.append("已经搜索到的页面数:"+outpageFound+" "+strLink+"\n");
//输出文本日志 ,格式:时间戳,URL
try{
PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
out.println("搜索时间"+t+""+strLink+" "+"已经搜索到的页面数:"+ pageFound+"(站外第"+outpageFound+"个)"+"\n");
out.close();
}catch(IOException e){
PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
out.println("output have some problem");
out.close();
}
outpageFound++;
numberFound++;
}
vectorMatches.addElement(strLink);
continue;
}
try {
// try opening the URL
URLConnection urlLinkConnection = urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType = URLConnection.guessContentTypeFromStream(linkStream);
linkStream.close();
// check to see if this URL has already been
// searched or is going to be searched
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) {
//test to make sure it is robot-safe!
if (robotSafe(urlLink))
vectorToSearch.addElement(strLink);
}
// if the proper type, add it to the results list
// unless we have already seen it
if (vectorMatches.contains(strLink) == false) {
if(urlLink.getHost().toString().equalsIgnoreCase(textURL.getText().substring(7))&&(i<=intdepth)){
i++;
Date date=new Date();
String t=date.toString();
pageFound=numberFound+1;
textinMatches.append("已经搜索到的页面数:"+inpageFound+" "+strLink+"\n");
//输出文本日志,,格式:时间戳,URL
try{
PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
out.println("搜索时间"+t+""+strLink+" "+"已经搜索到的页面数:"+ pageFound+"(站内第"+inpageFound+"个)"+"\n");
out.close();
}catch(IOException e){
PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
out.println("output have some problem");
out.close();
}
vectorMatches.addElement(strLink);
inpageFound++;
numberFound++;
}
if (numberFound >= SEARCH_LIMIT)
break;
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
}
numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
//抓完一个网页后停两秒
try{ // Date date=new Date();
// String tt=date.toString();
Thread.sleep(2000);
// textoutMatches.append("the spider must sleep 2s"+" "+tt+"\n");
}catch (InterruptedException e){
setStatus("have some problem");
}
}
if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
Date date=new Date();
String t=date.toString();
textinMatches.append("搜索结束时间:"+t+"\n");
textoutMatches.append("搜索结束时间:"+t+"\n");
//输出文本日志
try{
PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
out.println("本次搜索结束!"+"\n");
out.close();
}catch(IOException e){}
searchThread = null;
searchThread.stop();
}
/**
 * Displays a status message by setting it as the text of {@code labelStatus}.
 *
 * @param status the message to display
 */
void setStatus(String status) {
    labelStatus.setText(status);
}
/**
 * Reacts to the search and stop commands.
 *
 * <p>On {@code SEARCH}: sets the status to "searching...", launches the search
 * thread (unless one is still alive), appends the start time to both result
 * text areas and appends the search depth and start time to the log file.
 *
 * <p>On {@code STOP}: appends the end time to both result text areas, appends an
 * end-of-search line to the log file and then calls {@code stop()}.
 *
 * <p>Any other (or {@code null}) action command is ignored.
 *
 * @param event the action event whose action command selects the behaviour
 */
public void actionPerformed(ActionEvent event) {
    String command = event.getActionCommand();
    // Constant-first equals() tolerates a null action command, unlike compareTo().
    if (SEARCH.equals(command)) {
        setStatus("searching...");
        // Thread.start() throws IllegalThreadStateException when invoked on a thread
        // that has already been started, so a repeated click while a search is
        // still alive is ignored instead of re-starting the same thread.
        Thread worker = searchThread;
        if (worker != null && worker.isAlive()) {
            return;
        }
        // A thread that has already terminated cannot be restarted either; replace it.
        if (worker == null || worker.getState() != Thread.State.NEW) {
            worker = new Thread(this);
            searchThread = worker;
        }
        worker.start();

        String t = new Date().toString();
        textinMatches.append("开始搜索时间:" + t + "\n");
        textoutMatches.append("开始搜索时间:" + t + "\n");

        // Resolve the depth text BEFORE the log file is opened so a non-numeric
        // value can neither leak the writer nor escape from this listener.
        String depthText = textdepth.getText();
        String depth;
        try {
            depth = String.valueOf(Integer.parseInt(depthText));
        } catch (NumberFormatException e) {
            // Not a valid integer: log the raw field content instead of failing.
            depth = depthText;
        }
        appendLinesToSpiderLog(new String[] {
            "本次搜索深度为:" + depth + "\n",
            "本次开始搜索时间:" + t + "\n"
        });
    } else if (STOP.equals(command)) {
        String t = new Date().toString();
        textinMatches.append("搜索结束时间:" + t + "\n");
        textoutMatches.append("搜索结束时间:" + t + "\n");
        // Write the text log entry.
        appendLinesToSpiderLog(new String[] { "本次搜索结束!" + "\n" });
        stop();
    }
}

/**
 * Appends the given lines to the log file {@code spider日志记录.txt}, one
 * {@code println} per element, opening the file in append mode.
 *
 * <p>Logging is best-effort: a failure to open or write the file is reported
 * through {@link #setStatus(String)} instead of being thrown.
 *
 * @param lines the lines to append, in order
 */
private void appendLinesToSpiderLog(String[] lines) {
    PrintWriter out = null;
    try {
        out = new PrintWriter(new FileWriter("spider日志记录.txt", true));
        for (int k = 0; k < lines.length; k++) {
            out.println(lines[k]);
        }
        // PrintWriter never throws on write; checkError() flushes and reports failures.
        if (out.checkError()) {
            setStatus("ERROR: couldn't write log file spider日志记录.txt");
        }
    } catch (IOException e) {
        setStatus("ERROR: couldn't write log file spider日志记录.txt");
    } finally {
        if (out != null) {
            out.close();
        }
    }
}
/**
 * Runs the crawler as a standalone application: creates a frame titled
 * "web spider", places a {@code WebCrawler} in its centre, initialises and
 * starts it, then packs and shows the frame.
 *
 * @param argv command-line arguments (not used)
 */
public static void main(String argv[]) {
    // Build the GUI.
    JFrame f = new JFrame("web spider");
    // Configure the close behaviour before the frame becomes visible so that
    // closing the window always ends the program.
    f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

    WebCrawler applet = new WebCrawler();
    // add(Component, Object) replaces the obsolete add(String, Component) overload.
    f.add(applet, java.awt.BorderLayout.CENTER);

    // init() must run before start(): initialise the component first, then start it.
    applet.init();
    applet.start();

    // Original note: "create two threads to fetch the URLs found in pages".
    // NOTE(review): this second instance is never initialised nor added to the
    // frame; kept for backward compatibility - confirm it is actually needed.
    new WebCrawler().start();

    f.pack();
    f.setVisible(true);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -