// SpiderThread.java
package cs;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.*;
import java.util.*;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import vo.UrlQueueNode;
/**
 * Worker thread for the multi-threaded web spider.
 *
 * <p>Each SpiderThread pulls URLs from the shared wait queue (coordinated through
 * {@code MulThreadSpiderMainclass} counters), fetches and parses the page with the
 * Swing HTML parser, scans the text for keywords, and appends matching pages to the
 * shared result queue. Both queues are shared across threads and are guarded by
 * synchronizing on the queue object itself.
 */
public class SpiderThread implements Runnable {

    /** Shared coordinator holding the global counters (URLs fetched/parsed, sites found). */
    MulThreadSpiderMainclass ms;

    /** URLs waiting to be crawled; shared across workers, guarded by synchronizing on it. */
    private List<UrlQueueNode> WaiteUrlQueue;

    /** Pages that matched a keyword; shared across workers, guarded by synchronizing on it. */
    private List<UrlQueueNode> ReasultUrlQueue;

    /** Keywords to look for in page text. NOTE(review): handleText upper-cases the page
     *  text but not these entries — assumes the caller supplies them upper-cased; verify. */
    private String keywordList[];

    /* FIX: the three fields below were declared `static` but assigned per instance in the
     * constructor, so every new worker clobbered the shared values. They are only read
     * through instance methods, so they are now plain instance fields. */

    /** Allowed top-level domains (e.g. ".com"); the entry "&lt;any&gt;" accepts everything. */
    private String ipDomainList[];

    /** Maximum crawl depth per URL. */
    private int depthLimit;

    /** Upper bound on the number of sites to crawl. */
    private int siteLimit;

    /** Worker number, used only in log output. */
    private int number;

    /**
     * Creates a spider worker.
     *
     * @param ms             shared coordinator with the global crawl counters
     * @param ReasultUrlQueue shared queue receiving pages that matched a keyword
     * @param WaiteUrlQueue   shared queue of URLs waiting to be crawled
     * @param keywordList     keywords to search for in page text
     * @param ipDomainList    allowed domains ("&lt;any&gt;" matches all)
     * @param depthLimit      maximum crawl depth
     * @param siteLimit       maximum number of sites to crawl
     * @param number          worker number for log output
     */
    public SpiderThread(MulThreadSpiderMainclass ms, List<UrlQueueNode> ReasultUrlQueue,
            List<UrlQueueNode> WaiteUrlQueue, String keywordList[], String ipDomainList[],
            int depthLimit, int siteLimit, int number) {
        this.ReasultUrlQueue = ReasultUrlQueue;
        this.ms = ms;
        this.WaiteUrlQueue = WaiteUrlQueue;
        this.keywordList = keywordList;
        this.ipDomainList = ipDomainList;
        this.depthLimit = depthLimit;
        this.siteLimit = siteLimit;
        this.number = number;
    }

    /**
     * Main worker loop: wait for an unparsed URL to appear in the wait queue, claim it,
     * crawl it, and record it in the result queue when it matched a keyword. Exits when
     * the parsed-site count reaches {@code siteLimit} or the thread is interrupted.
     */
    public void run() {
        while (ms.getUrlParsed() < siteLimit) {
            UrlQueueNode reslovingNode;
            synchronized (WaiteUrlQueue) {
                // Block until the fetcher has queued more URLs than have been parsed.
                while (ms.getUrlParsed() == ms.getUrlGeted()) {
                    if (ms.getUrlParsed() == siteLimit) {
                        System.out.println("Thread ending");
                        return;
                    }
                    try {
                        WaiteUrlQueue.wait();
                    } catch (InterruptedException ex) {
                        // FIX: previously swallowed, which made an interrupted thread
                        // spin re-entering wait(). Restore the interrupt flag and stop.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
                System.out.println(number + ".......操作......");
                // addUrlParsed() atomically claims the next queue index for this worker.
                reslovingNode = (UrlQueueNode) WaiteUrlQueue.get(ms.addUrlParsed());
            }
            searchWeb(reslovingNode);
            synchronized (ReasultUrlQueue) {
                if (reslovingNode.isMatch()) {
                    ReasultUrlQueue.add(reslovingNode);
                }
            }
            System.out.println(reslovingNode.toString() + " " + reslovingNode.isMatch());
            System.out.println(reslovingNode.getText());
            System.out.println("有" + ms.getSitesFound() + "个站点发现了该关键字" + " " + " 查找了" + ms.getSitesSearched() + "个站点");
        }
    }

    /**
     * Crawls a single URL: skips it when already visited, too deep, over the site limit,
     * not http/file, not an html/htm resource, or outside the allowed domains; otherwise
     * opens the stream and parses it, letting the callback scan the text for keywords.
     *
     * @param reslovingNode the queue node whose URL is to be crawled
     */
    public void searchWeb(UrlQueueNode reslovingNode) {
        if (urlHasBeenVisited(reslovingNode)) { // already processed?
            System.out.println("该页面已经背查找过了!!!");
            return;
        }
        if (depthLimitExceeded(reslovingNode))
            return;
        if (ms.getSitesSearched() >= siteLimit)
            return;
        System.out.println("Searching :" + reslovingNode.toString() + " \n");
        ms.addSitesSearched();
        //
        // Now examine the resource itself.
        //
        try {
            URL url = reslovingNode.getUrl();
            String protocol = url.getProtocol();
            if (!protocol.equalsIgnoreCase("http") && !protocol.equalsIgnoreCase("file")) {
                System.out.println(" Skipping : " + reslovingNode.toString() + " not a http site\n\n");
                return;
            }
            String path = url.getPath();
            int lastdot = path.lastIndexOf("."); // check for a file extension
            if (lastdot > 0) {
                String extension = path.substring(lastdot);
                if (!extension.equalsIgnoreCase(".html") && !extension.equalsIgnoreCase(".htm"))
                    return; // skip everything but html files
            }
            if (!isDomainOk(url)) {
                System.out.println(" Skipping : " + reslovingNode.toString() + " not in domain list\n\n");
                return;
            }
            InputStream in = url.openStream();
            InputStreamReader isr = new InputStreamReader(in);
            try {
                MySpiderParserCallback cb = new MySpiderParserCallback(reslovingNode);
                ParserDelegator pd = new ParserDelegator();
                pd.parse(isr, cb, true); // parse the stream; callback scans for keywords
            } finally {
                // FIX: close in finally so a parse failure cannot leak the connection.
                isr.close();
            }
        } catch (MalformedURLException ex) {
            System.out.println(" (1) Bad URL encountered : " + reslovingNode.toString() + "\n\n");
        } catch (IOException e) {
            System.out.println(" IOException, could not access site : " + e.getMessage() + "\n\n");
        }
        return;
    }

    /**
     * Returns true when the URL's host domain suffix is acceptable: file: URLs and hosts
     * without a dot always pass; otherwise the suffix (e.g. ".com") must appear in
     * {@code ipDomainList}, or the list must contain "&lt;any&gt;" or be empty.
     */
    private boolean isDomainOk(URL url) {
        if (url.getProtocol().equals("file"))
            return true; // file protocol is always acceptable
        String host = url.getHost();
        int lastdot = host.lastIndexOf(".");
        if (lastdot <= 0)
            return true;
        String domain = host.substring(lastdot); // just the ".com"/".edu" part
        if (ipDomainList.length == 0)
            return true; // no restriction configured
        for (int i = 0; i < ipDomainList.length; i++) {
            if (ipDomainList[i].equalsIgnoreCase("<any>"))
                return true;
            if (ipDomainList[i].equalsIgnoreCase(domain))
                return true;
        }
        return false;
    }

    /**
     * Returns true when the node's depth has reached the configured depth limit.
     */
    public boolean depthLimitExceeded(UrlQueueNode managing) {
        return managing.getDepthLevel() >= depthLimit;
    }

    /**
     * Returns true when the node appears among the already-parsed prefix of the queue.
     *
     * <p>NOTE(review): this compares a UrlQueueNode against a String via
     * {@code equals(reslovingNode.toString1())} — only correct if UrlQueueNode.equals
     * accepts a String; verify against UrlQueueNode. The {@code -5} window is
     * unexplained in the original code — confirm its intent.
     */
    public boolean urlHasBeenVisited(UrlQueueNode reslovingNode) {
        for (int i = 0; i < ms.getUrlParsed() - 5; i++) {
            if (((UrlQueueNode) WaiteUrlQueue.get(i)).equals(reslovingNode.toString1())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true when the node is already queued anywhere in the wait queue.
     *
     * <p>NOTE(review): same node-vs-String equals caveat as urlHasBeenVisited.
     */
    public boolean urlHasBeenInsert(UrlQueueNode reslovingNode) {
        for (int i = 0; i < ms.getCurrentQueueNum(); i++) {
            if (((UrlQueueNode) WaiteUrlQueue.get(i)).equals(reslovingNode.toString1())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Normalizes a sloppy href: converts backslashes to forward slashes, and appends a
     * trailing slash when the last path segment has no file extension.
     *
     * @param href the raw href text
     * @return the normalized href
     */
    public static String fixHref(String href) {
        String newhref = href.replace('\\', '/'); // fix sloppy web references
        int lastdot = newhref.lastIndexOf('.');
        int lastslash = newhref.lastIndexOf('/');
        if (lastslash > lastdot) {
            if (newhref.charAt(newhref.length() - 1) != '/')
                newhref = newhref + "/"; // add on the missing /
        }
        return newhref;
    }

    /**
     * Inner class used to handle HTML parser callbacks: records the page title and scans
     * text runs for the configured keywords.
     */
    class MySpiderParserCallback extends HTMLEditorKit.ParserCallback {
        /** URL node whose page is being parsed. */
        private UrlQueueNode node;
        /** Contents of the last text element seen (used to capture the title). */
        private String lastText = "";

        /**
         * Creates a callback bound to the queue node being parsed.
         *
         * @param Queuenode the node receiving title/match/text results
         */
        public MySpiderParserCallback(UrlQueueNode Queuenode) {
            node = Queuenode;
        }

        /**
         * Handles start tags: resets the text buffer when a TITLE tag opens so the
         * following text run is captured as the title.
         *
         * @param t   HTML tag
         * @param a   HTML attributes
         * @param pos position within the file
         */
        public void handleStartTag(HTML.Tag t,
                MutableAttributeSet a,
                int pos) {
            if (t.equals(HTML.Tag.TITLE)) {
                lastText = "";
                return;
            }
        }

        /**
         * Handles end tags: when a TITLE tag closes, stores the buffered text as the
         * node's title.
         *
         * @param t   HTML tag
         * @param pos position within the file
         */
        public void handleEndTag(HTML.Tag t,
                int pos) {
            if (t.equals(HTML.Tag.TITLE) && lastText != null) {
                node.setTitle(lastText.trim());
            }
        }

        /**
         * Handles text between tags: checks it against the keyword list and, on the
         * first match, stores an excerpt and marks the node as a match.
         *
         * <p>NOTE(review): the page text is upper-cased before indexOf but the keywords
         * are not — assumes keywords are supplied upper-cased; verify against caller.
         *
         * @param data text between tags
         * @param pos  position of the text within the web page
         */
        public void handleText(char[] data, int pos) {
            int index;
            lastText = new String(data);
            node.addChars(lastText.length());
            String text = lastText.toUpperCase();
            for (int i = 0; i < keywordList.length; i++) {
                if ((index = text.indexOf(keywordList[i])) >= 0) {
                    if (!node.isMatch()) {
                        if (lastText.length() >= 100) {
                            // Store an excerpt starting at the match, capped at
                            // 99 characters (substring end index 99 is exclusive).
                            String temp;
                            temp = lastText.substring(index);
                            if (temp.length() > 100) {
                                node.setText(temp.substring(0, 99));
                            } else {
                                node.setText(temp);
                            }
                        } else {
                            node.setText(lastText);
                        }
                        ms.addSitesFound();
                    }
                    node.setMatch(keywordList[i]);
                    return; // first matching keyword wins for this text run
                }
            }
        }
    }
}