📄 webcrawler.java

📁 这是一个用JAVA写的网络蜘蛛
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                                urlConnection.setRequestProperty("User-Agent", "jpspider");
                                urlConnection.setAllowUserInteraction(false);
                                InputStream urlStream = url.openStream();
				String type = URLConnection.guessContentTypeFromStream(urlStream);
				
			        // search the input stream for links
				// first, read in the entire URL
				byte b[] = new byte[4096];
				int numRead = urlStream.read(b);
				String content = new String(b, 0, numRead);
				while (numRead != -1) {
					if (Thread.currentThread() != searchThread)
						break;
					numRead = urlStream.read(b);
					if (numRead != -1) {
						String newContent = new String(b, 0, numRead);
						content += newContent;
					}
				}
				urlStream.close();

				if (Thread.currentThread() != searchThread)
					break;

				String lowerCaseContent = content.toLowerCase();

				int index = 0;
				while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
					if ((index = lowerCaseContent.indexOf("href", index)) == -1)
						break;
					if ((index = lowerCaseContent.indexOf("=", index)) == -1)
						break;
                                    
					if (Thread.currentThread() != searchThread)
						break;

					index++;
					String remaining = content.substring(index);

					StringTokenizer st = new StringTokenizer(remaining,
							"\t\n\r\">#");
					String strLink = st.nextToken();

					URL urlLink;
					try {
						urlLink = new URL(url, strLink);
						strLink = urlLink.toString();
					} catch (MalformedURLException e) {
						setStatus("ERROR: bad URL " + strLink);
						continue;
					}
                                       if(strLink.startsWith("mailto:")){
                                            setStatus("ERROR: can't open the: " + strLink);
                                            break;
                                       }
                                    //不抓去那些大小大于 50KB的网页
                                    if(urlLink.openConnection().getContentLength()>409600){
                                            setStatus("ERROR:the webpage is bigger than 50 KB  " + strLink);
                                       break;
                                    }
				     if (Thread.currentThread() != searchThread)
				        break;
                                        
                          //to make sure the URL belong to  instationg or out station 
                          // we  only get the homepage if the URL belong to outstation                
                          if(!(urlLink.getHost().toString().equalsIgnoreCase(textURL.getText().substring(7)))){
                             i=0;              
                             Date  date=new Date();
                             String t=date.toString();
                             pageFound=numberFound+1;
                             if ((!vectorSearched.contains(strLink))&& (!vectorToSearch.contains(strLink))&&robotSafe(urlLink)
                                                                      &&(!vectorMatches.contains(strLink))){
                            
                            textoutMatches.append("已经搜索到的页面数:"+outpageFound+" "+strLink+"\n");
                             
                             //输出文本日志 ,格式：时间戳，URL                                     
                       try{
                            PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
                            out.println("搜索时间"+t+""+strLink+" "+"已经搜索到的页面数:"+ pageFound+"(站外第"+outpageFound+"个)"+"\n");
                            out.close();
                           }catch(IOException e){
                                PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
                                out.println("output have some problem");
                                out.close();	
                              }  
                             outpageFound++;
                             numberFound++;
                               }                              
				vectorMatches.addElement(strLink);
                           
                              continue;                                   
                             }                 
                                        
                                        
                              try {
				       // try opening the URL
				        URLConnection urlLinkConnection = urlLink.openConnection();
				        urlLinkConnection.setAllowUserInteraction(false);
				        InputStream linkStream = urlLink.openStream();
					String strType = URLConnection.guessContentTypeFromStream(linkStream);
					linkStream.close();
						
					// check to see if this URL has already been
					// searched or is going to be searched
					if ((!vectorSearched.contains(strLink))
								&& (!vectorToSearch.contains(strLink))) {

					//test to make sure it is robot-safe!
					   if (robotSafe(urlLink))
						vectorToSearch.addElement(strLink);
							}
                                        
					// if the proper type, add it to the results list
					// unless we have already seen it
					if (vectorMatches.contains(strLink) == false) {
                                            if(urlLink.getHost().toString().equalsIgnoreCase(textURL.getText().substring(7))&&(i<=intdepth)){
                                                 i++;
                                                 Date  date=new Date();
                                                 String t=date.toString();
                                                 pageFound=numberFound+1;
						 textinMatches.append("已经搜索到的页面数:"+inpageFound+" "+strLink+"\n");
                                      //输出文本日志,,格式：时间戳，URL 
                                    try{
                                        PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
                                        out.println("搜索时间"+t+""+strLink+" "+"已经搜索到的页面数:"+ pageFound+"(站内第"+inpageFound+"个)"+"\n");
                                        out.close();
                                         }catch(IOException e){
                                         PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
                                         out.println("output have some problem");
                                         out.close();	
                                               }
                                           vectorMatches.addElement(strLink);
                                           inpageFound++;
					   numberFound++;
                 }
                                                            
         
                                       
				if (numberFound >= SEARCH_LIMIT)
				break;
                                                              	
                               }
						
					} catch (IOException e) {
						setStatus("ERROR: couldn't open URL " + strLink);
						continue;
					}
                                       
                                           
				}
			} catch (IOException e) {
				setStatus("ERROR: couldn't open URL " + strURL);
				break;
			}

			numberSearched++;
                       
			if (numberSearched >= SEARCH_LIMIT)
				break;
                        
                        //抓完一个网页后停两秒
                        try{          //     Date  date=new Date();
                                      //      String tt=date.toString();
                                           Thread.sleep(2000);
                                      //      textoutMatches.append("the spider must sleep 2s"+" "+tt+"\n");
                                         }catch (InterruptedException e){
                                             setStatus("have some problem");
                                                   }
		    }
           
               if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
			setStatus("reached search limit of " + SEARCH_LIMIT);
		else
			setStatus("done");
                
                      Date  date=new Date();
                      String t=date.toString();
                      textinMatches.append("搜索结束时间:"+t+"\n");
                      textoutMatches.append("搜索结束时间:"+t+"\n");
                      
                      //输出文本日志
                      try{
                        PrintWriter out=new PrintWriter(new FileWriter("spider日志记录.txt",true));
                        out.println("本次搜索结束!"+"\n");
                        out.close();
                       }catch(IOException e){}
               
		       searchThread = null;
		       searchThread.stop();
	}
        
        /**
         * Shows a message in the status label.
         *
         * @param status the text passed to {@code labelStatus.setText}
         */
        void setStatus(final String status) {
                labelStatus.setText(status);
        }
       
	/**
	 * Reacts to the SEARCH and STOP commands.
	 *
	 * <p>SEARCH sets the status to "searching...", writes the start time to
	 * both result areas and to the log file, and then starts the search
	 * thread (a {@code Thread} wrapping this object). If a search thread is
	 * already running the command is ignored, because calling
	 * {@code Thread.start()} twice on the same thread throws
	 * {@code IllegalThreadStateException}.
	 *
	 * <p>STOP writes the end time to both result areas and to the log file
	 * and then calls {@code stop()}.
	 *
	 * @param event the event whose action command selects the branch
	 */
	public void actionPerformed(ActionEvent event) {
		String command = event.getActionCommand();

		// Constant-first equals() also copes with a null action command.
		if (SEARCH.equals(command)) {
			setStatus("searching...");

			// Work on a local copy: the field may be reassigned by another thread.
			Thread worker = searchThread;
			if (worker == null || worker.getState() == Thread.State.TERMINATED) {
				// No usable thread yet (or the previous one has finished): make a new one.
				worker = new Thread(this);
				searchThread = worker;
			}
			if (worker.getState() != Thread.State.NEW) {
				// A search is already in progress; starting it again would throw.
				return;
			}

			String t = new Date().toString();
			textinMatches.append("开始搜索时间:" + t + "\n");
			textoutMatches.append("开始搜索时间:" + t + "\n");

			// Log the depth as a number when it parses, otherwise log the raw
			// field text instead of letting NumberFormatException escape.
			String depthText = textdepth.getText();
			String depth;
			try {
				depth = String.valueOf(Integer.parseInt(depthText));
			} catch (NumberFormatException notANumber) {
				depth = depthText;
			}
			appendLogLines("本次搜索深度为:" + depth + "\n",
					"本次开始搜索时间:" + t + "\n");

			// Start last, so the header above is written before the worker
			// produces any output of its own.
			worker.start();

		} else if (STOP.equals(command)) {
			String t = new Date().toString();
			textinMatches.append("搜索结束时间:" + t + "\n");
			textoutMatches.append("搜索结束时间:" + t + "\n");

			appendLogLines("本次搜索结束!" + "\n");
			stop();
		}
	}

	/**
	 * Appends the given lines to the log file "spider日志记录.txt".
	 *
	 * <p>Logging is best effort: a failure is shown through
	 * {@code setStatus} and never propagates to the caller.
	 *
	 * @param lines the lines to append, each written with {@code println}
	 */
	private void appendLogLines(String... lines) {
		try {
			PrintWriter out = new PrintWriter(new FileWriter("spider日志记录.txt", true));
			try {
				for (String line : lines) {
					out.println(line);
				}
				// PrintWriter hides write errors; checkError() flushes and reports them.
				if (out.checkError()) {
					throw new IOException("write failed");
				}
			} finally {
				out.close();
			}
		} catch (IOException e) {
			setStatus("ERROR: couldn't write log file: " + e.getMessage());
		}
	}

	/**
	 * Runs the crawler as a stand-alone application inside a {@code JFrame}.
	 *
	 * @param argv command-line arguments (not used)
	 */
	public static void main(String argv[]) {
		// Build the GUI.
		JFrame f = new JFrame("web spider");
		// Set the close behaviour before the frame is shown.
		f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		WebCrawler applet = new WebCrawler();
		f.add("Center", applet);

		// Applet lifecycle order: init() first, then start().
		applet.init();
		applet.start();
		// NOTE(review): this second instance is never init()-ed or added to the
		// frame. The original comment says two threads are created to fetch
		// URLs; start() is not visible here, so confirm whether this call is
		// needed at all.
		new WebCrawler().start();

		f.pack();
		f.setVisible(true);
	}

}
上一页 12
💿 文件大小 5 K
👤 上传用户 ljw128
📂 所属分类 Jsp/Servlet
📄 代码行数 504 行
💻 语言类型 Java
🏷️ 相关标签

#JAVA #网络
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -