📄 spider.java

📁 1、锁定某个主题抓取； 2、能够产生日志文本文件
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
  	try{
  	URL urlHost = new URL (url.getProtocol(),url.getHost(),"/");
	
  	if(true){ //check robots.txt under host root only //url.toString().equalsIgnoreCase(urlHost.toString())){
		try{
		URL hostRobots = new URL (url.getProtocol(),url.getHost(),"/robots.txt");
		log("Checking : "+hostRobots.toString());
		//open robots.txt
              URLConnection connection = hostRobots.openConnection();
		InputStream is = connection.getInputStream();
              Reader r = new InputStreamReader(is);
		BufferedReader stdio = new BufferedReader (r);
		String robotsLine;
		String desc;
		String dir;
		URL forbidenUrl;
		
		while((robotsLine = stdio.readLine()) !=null){ //read until the file ends
			if(robotsLine.indexOf(":")!=-1 ){
			desc = (robotsLine.substring(robotsLine.indexOf(":")+1,robotsLine.length())).trim();	
			if( desc.equals("*") || desc.equals("Baiduspider"))
					 	{
                                          if((robotsLine = stdio.readLine()) !=null &&  robotsLine.indexOf(":")<robotsLine.length()-1){
						dir = (robotsLine.substring(robotsLine.indexOf(":")+3,robotsLine.length())).trim();
						forbidenUrl= new URL(urlHost.toString()+dir);
						
						if(!getWorkloadForbiden().contains(forbidenUrl)){
						getWorkloadForbiden().add(forbidenUrl);
							}
						log("Add to workloadForbiden: "+urlHost.toString()+dir);
                                          	}
					 }
				}
			}
			}catch (IOException e){
				;
				}
  		}
  	}catch (MalformedURLException ex) {

  		}
  	}

  /**
   * Decides whether a URL may be crawled, based on the forbidden entries
   * held in {@code getWorkloadForbiden()}.
   *
   * <p>Despite its name, this method returns {@code true} when the URL is
   * ALLOWED and {@code false} when it is forbidden. That polarity is kept
   * because {@code processURL} calls it that way.
   *
   * <p>A URL is treated as forbidden when its string form starts with the
   * string form of any forbidden entry. This follows the prefix semantics of
   * robots.txt {@code Disallow} rules: a disallowed directory also covers its
   * sub-directories, and a disallowed file path is matched as well. The
   * previous implementation only tested the URL's immediate directory for
   * exact membership, so nested paths slipped through. Comparing strings also
   * avoids {@code URL.equals}, which may resolve host names.
   *
   * @param url the URL to test
   * @return {@code true} if the URL may be crawled, {@code false} if a
   *         forbidden entry is a prefix of it
   */
  public boolean checkIfThisUrlForbiden(URL url) //@Author Kelven.JU
  {
    String urlString = url.toString();

    // Kept for log continuity: the directory part of the URL being checked.
    log(urlString.substring(0, urlString.lastIndexOf("/") + 1));

    // Snapshot the forbidden entries so the scan is not affected by
    // additions made while it runs.
    Object[] forbiddenEntries = getWorkloadForbiden().toArray();
    for (int i = 0; i < forbiddenEntries.length; i++) {
      Object entry = forbiddenEntries[i];
      if (entry != null && urlString.startsWith(entry.toString())) {
        log(url.toString()+" IS NOT allowed to be Crawled");
        return false;
      }
    }

    log(url.toString()+" IS allowed to be Crawled");
    return true;
  }
  /**
   * Called internally to process a URL.
   *
   * <p>Steps, in order:
   * <ol>
   *   <li>Derive the file extension; if it is listed in
   *       {@code workloadFileType} the URL is fetched but not parsed.</li>
   *   <li>Run the robots check. A URL that is not allowed is moved from the
   *       waiting workload to the processed workload and is NOT fetched.</li>
   *   <li>Open the connection. Non-parseable URLs are only probed with
   *       {@code getContent()}; parseable URLs must have a {@code text/}
   *       content type (a missing content type is let through).</li>
   *   <li>Parse the page with {@code Parser}, which accumulates
   *       {@code scoreOfPage}, then report and record the score.</li>
   * </ol>
   *
   * <p>Every exit path removes the URL from the waiting workload, which is
   * what lets the loop in {@code begin()} terminate. Any
   * {@link IOException} moves the URL to the error workload.
   *
   * @param url The URL to be processed.
   */
  public void processURL(URL url)
  {
    boolean parseableUrl = true;
    Reader pageReader = null;
    Parser pageCallback = null;

    try {
      log("Processing: " + url );

      //fetch the file type (text after the last '.' of the last path segment)
      String urlString = url.toString();
      String currentFileType = "";
      int lastDot = urlString.lastIndexOf(".");
      if (lastDot > urlString.lastIndexOf("/")) {
        currentFileType = urlString.substring(lastDot + 1);
        if (workloadFileType.contains(currentFileType)) {
          parseableUrl = false;
        }
      }

      // Robots check. checkIfThisUrlForbiden() returns true when the URL is
      // ALLOWED. Previously an allowed URL was marked "Complete" here before
      // being fetched (and again at the end), while a forbidden URL was
      // fetched anyway; now a forbidden URL is skipped outright.
      checkHostRobots(url);
      if (!checkIfThisUrlForbiden(url)) {
        getWorkloadWaiting().remove(url);
        // Recorded as processed so it leaves the waiting workload.
        // NOTE(review): confirm "processed" is the right bucket for skipped URLs.
        getWorkloadProcessed().add(url);
        log("Not processing because robots.txt forbids it: " + url );
        return;
      }

      //open URL
      URLConnection connection = url.openConnection();

      if (parseableUrl) {
        //skip when a content type is reported and it is not "text/..."
        String contentType = connection.getContentType();
        if (contentType != null && !contentType.toLowerCase().startsWith("text/")) {
          getWorkloadWaiting().remove(url);
          getWorkloadProcessed().add(url);
          log("Not processing because content type is: " +
               contentType );
          return;
        }
      } else {
        Object content = connection.getContent();
        if (content instanceof InputStream) {
          // getContent() can hand back the live response stream; the content
          // itself is not used, so release it instead of leaking it.
          try {
            ((InputStream) content).close();
          } catch (IOException ignored) {
            // best effort only
          }
        }
        if (content != null) {
          log("Not parse but complete (file type: "+currentFileType+"):"+connection.getURL().toString());
          getWorkloadWaiting().remove(url);
          getWorkloadProcessed().add(url);
        } else {
          getWorkloadWaiting().remove(url);
          getWorkloadError().add(url);
          log("Error:"+connection.getURL().toString());
          report.spiderURLError(url);
        }
        return;
      }

      //download web page
      pageReader = new InputStreamReader(connection.getInputStream());

      // parse the URL; the callback adds keyword weights to scoreOfPage
      scoreOfPage=0.0;
      HTMLEditorKit.Parser parse = new HTMLParse().getParser();
      pageCallback = new Parser(url);
      parse.parse(pageReader,pageCallback,true);

      // report and record the score of the page
      log("Page Score ["+url.toString()+"]: "+scoreOfPage);
      report.spiderOutputPageScore(url,  scoreOfPage);
      resultOut.newLine();
      resultOut.write(new Date()+"[完成]"+"{得分"+scoreOfPage+"}  ->"+url);
      resultOut.flush();

    } catch ( IOException e ) { //if any error during processing the URL
      getWorkloadWaiting().remove(url);
      getWorkloadError().add(url);
      log("Error: " + url );
      report.spiderURLError(url);
      return;
    } finally {
      // Release the page stream; it was never closed before.
      if (pageReader != null) {
        try {
          pageReader.close();
        } catch (IOException ignored) {
          // nothing useful can be done at this point
        }
      }
      // The Parser opens a temp-file writer in its constructor and never
      // closes it; close it here so each page does not leak a file handle.
      if (pageCallback != null && pageCallback.fileOut != null) {
        try {
          pageCallback.fileOut.close();
        } catch (IOException ignored) {
          // nothing useful can be done at this point
        }
      }
    }

    //URL processing completes, log event
    getWorkloadWaiting().remove(url);
    getWorkloadProcessed().add(url);
    log("Complete: " + url );

  }

  /**
   * Starts the spider.
   *
   * <p>Clears the {@code cancel} flag, logs which of the two politeness
   * options (robots.txt, meta tags) are switched on, then keeps processing
   * snapshots of the waiting workload until it is empty or {@code cancel}
   * becomes true. A snapshot is taken on each pass because
   * {@code processURL} modifies the waiting workload while it runs.
   */
  public void begin()
  {
    cancel = false;

    // Pick the start-up message matching the robots / meta-tag option pair.
    final String startupMessage;
    if (checkRobotsOption) {
      if (checkMetaTagOption) {
        startupMessage = "Begin WITH checking robots.txt and meta tags!";
      } else {
        startupMessage = "Begin WITH checking robots.txt but WITHOUT mata tags!";
      }
    } else {
      if (checkMetaTagOption) {
        startupMessage = "Begin WITH checking Meta Tags but WITHOUT robots.txt!";
      } else {
        startupMessage = "Begin WITHOUT checking robots.txt and meta tags!";
      }
    }
    log(startupMessage);

    while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
      Object[] snapshot = getWorkloadWaiting().toArray();
      for (Object pending : snapshot) {
        // stop as soon as a cancel is requested, even in mid-snapshot
        if (cancel) {
          break;
        }
        processURL((URL) pending);
      }
    }
  }

/*** Connection Timer***/

/** * HTML parser@Author Kelven.JU* **/

  /**
   * HTML parser callback used while crawling a page (author: Kelven.JU).
   *
   * <p>For each page it:
   * <ul>
   *   <li>adds the weight of every keyword occurrence found in text nodes to
   *       the enclosing spider's {@code scoreOfPage};</li>
   *   <li>honours {@code <meta name="robots" ...>} directives by clearing
   *       {@code indexEnable} / {@code followEnable};</li>
   *   <li>reports {@code mailto:} links and hands every other link to
   *       {@link #handleLink(URL, String)} while following is enabled.</li>
   * </ul>
   *
   * <p>NOTE(review): the constructor opens {@code fileOut} on a temp file,
   * but nothing in this class writes to it or closes it; whoever creates a
   * {@code Parser} should close {@code fileOut} once parsing is finished.
   * {@code indexEnable} is set here but not read anywhere in this class.
   */
  protected class Parser
  extends HTMLEditorKit.ParserCallback {
    /** URL of the page being parsed; relative links are resolved against it. */
    protected URL base;

    // crawler-politeness flags driven by the robots meta tag
    protected boolean indexEnable;
    protected boolean followEnable;

    //file to output the textual content
    protected BufferedWriter fileOut ;
    protected File tmpTextFilePath;

    /**
     * Creates a callback for one page and opens its temp file under
     * {@code tmp/}. The file name is the page URL with ':', '/' and '.'
     * removed, plus ".tmp". An I/O failure is only reported on stdout and
     * leaves {@code fileOut} null.
     *
     * @param base URL of the page about to be parsed
     */
    public Parser(URL base)
    {
      this.base = base;
      indexEnable = true;
      followEnable = true;
      try {
        tmpTextFilePath = new File("tmp");
        tmpTextFilePath.mkdir();
        fileOut = new BufferedWriter(new FileWriter(new File(
            "tmp/"+base.toString().replace(":","").replace("/","").replace(".","")+".tmp")));
      } catch (IOException e) {
        System.out.println("Construtor Parser [Spider.java 623] IO error!");
      }
    }

    /**
     * Scores a run of page text: for every keyword in {@code wordList}, each
     * occurrence (overlapping ones included) adds the matching entry of
     * {@code wordWeight} to {@code scoreOfPage}.
     *
     * @param text     the characters of the text node
     * @param position position of the text in the document (unused)
     */
    public void handleText(char[] text, int position){
      String content = new String(text);
      for (int i = 0; i < wordList.size(); i++) {
        String word = (String) wordList.get(i);
        // An empty keyword matches at every index and indexOf never returns
        // -1 for it, which used to spin forever; skip unusable keywords.
        if (word == null || word.length() == 0) {
          continue;
        }
        int hit = content.indexOf(word);
        while (hit != -1) {
          scoreOfPage = scoreOfPage + (Double) wordWeight.get(i);
          hit = content.indexOf(word, hit + 1);
        }
      }
    }

    /**
     * Handles a tag: applies robots meta directives, then extracts a link
     * from {@code href} (or {@code src} for frames) and processes it.
     *
     * @param t   the tag
     * @param a   the tag's attributes
     * @param pos position of the tag in the document (unused)
     */
    public void handleSimpleTag(HTML.Tag t,
                                MutableAttributeSet a,int pos)
    {
      if (t == HTML.Tag.META) { //check META tag
        String metaName = (String)a.getAttribute(HTML.Attribute.NAME );
        String metaContent = (String)a.getAttribute(HTML.Attribute.CONTENT );
        if (metaName != null && metaContent != null
            && metaName.equalsIgnoreCase("robots")) {
          applyRobotsDirectives(metaContent);
        }
      }

      String href = (String)a.getAttribute(HTML.Attribute.HREF);

      //handle frame
      if ( (href==null) && (t==HTML.Tag.FRAME) )
        href = (String)a.getAttribute(HTML.Attribute.SRC);

      if ( href==null )
        return;

      //drop the fragment part ("#...")
      int i = href.indexOf('#');
      if ( i!=-1 )
        href = href.substring(0,i);

      //handle email address
      if ( href.toLowerCase().startsWith("mailto:") ) {
        report.spiderFoundEMail(href);
        return;
      }

      if ( followEnable ) {
        handleLink(base,href);
      } else {
        log("Meta tag shows following is NOT allowed:"+base.toString());
      }
    }

    /**
     * Applies the directives of a robots meta tag. The content is a
     * comma-separated list; every token is trimmed and compared
     * case-insensitively, so combinations such as "noindex, nofollow" take
     * full effect. (Previously the second token kept its leading comma and
     * never matched, and "nofollow" was ignored whenever "noindex" was
     * present.)
     *
     * @param content raw value of the meta tag's content attribute
     */
    private void applyRobotsDirectives(String content)
    {
      String[] directives = content.split(",");
      for (int k = 0; k < directives.length; k++) {
        String directive = directives[k].trim();
        if (directive.equalsIgnoreCase("noindex")) {
          indexEnable = false;
        } else if (directive.equalsIgnoreCase("nofollow")) {
          followEnable = false;
        } else if (directive.equalsIgnoreCase("none")) {
          // "none" is shorthand for "noindex, nofollow"
          indexEnable = false;
          followEnable = false;
        }
      }
    }

    /**
     * Start tags are handled exactly like simple tags.
     *
     * @param t   the tag
     * @param a   the tag's attributes
     * @param pos position of the tag in the document
     */
    public void handleStartTag(HTML.Tag t,
                               MutableAttributeSet a,int pos)
    {
      handleSimpleTag(t,a,pos);
    }

    /**
     * Turns a link string into a URL and queues it if the report accepts it.
     * Strings starting with "http://" are taken as absolute, strings starting
     * with "www." get "http://" prepended, and everything else is resolved
     * against {@code base}. Malformed links are logged and dropped.
     *
     * @param base URL of the page containing the link
     * @param str  the link text as found in the page
     */
    protected void handleLink(URL base,String str)
    {
      try {
        URL url;
        if (str.startsWith("http://")) {
          url = new URL(str);
        } else if (str.startsWith("www.")) {
          url = new URL("http://"+str);
        } else {
          url = new URL(base,str);
        }
        if ( report.spiderFoundURL(base,url) )
          addURL(url);
      } catch ( MalformedURLException e ){
        log("Found malformed URL: " + str );
      }
    }
  }


/*** Connection Timer***/

/** * HTML parser*@Author Kelven.JU **/

  /**
   * HTML parser callback that dumps the text of a page into a training file
   * (author: Kelven.JU).
   *
   * <p>Every text node is written as its own line to
   * {@code train/<url-without-':','/','.'>.train}. Tags and links are
   * ignored.
   *
   * <p>NOTE(review): {@code fileOut2} is flushed after each write but is not
   * closed anywhere in this class.
   */
  protected class Parser2
  extends HTMLEditorKit.ParserCallback {
    /** URL of the page being parsed; used to name the output file. */
    protected URL base;
    private File tmpFilePath;

    //file to output the textual content
    protected BufferedWriter fileOut2 ;

    /**
     * Creates the {@code train} directory if needed and opens the output
     * file for the given page. An I/O failure is only reported on stdout and
     * leaves {@code fileOut2} null.
     *
     * @param base URL of the page about to be parsed
     */
    public Parser2(URL base)
    {
      this.base = base;
      try {
        tmpFilePath = new File("train");
        tmpFilePath.mkdir();
        fileOut2 = new BufferedWriter(new FileWriter(
            "train/"+base.toString().replace(":","").replace("/","").replace(".","")+".train"));
      } catch (IOException e) {
        System.out.println("Construtor Parser2 [Spider.java 715] IO error!");
      }
    }

    /**
     * Appends one text node to the training file, followed by a line break,
     * and flushes immediately.
     *
     * @param text     the characters of the text node
     * @param position position of the text in the document (unused)
     */
    public void handleText(char[] text, int position){
      // The constructor swallows a failed open and leaves the writer null;
      // without this guard every text node would raise an uncaught
      // NullPointerException and abort the parse.
      if (fileOut2 == null) {
        return;
      }
      try {
        fileOut2.write(text);
        fileOut2.newLine();
        fileOut2.flush();
      } catch (IOException e) {
        System.out.println("Function handleText [Spider.java 586] IO error!");
      }
    }

    /** Tags are ignored when collecting training text. */
    public void handleSimpleTag(HTML.Tag t,
                                MutableAttributeSet a,int pos)
    {
    }

    /** Tags are ignored when collecting training text. */
    public void handleStartTag(HTML.Tag t,
                               MutableAttributeSet a,int pos)
    {
    }

    /** Links are ignored when collecting training text. */
    protected void handleLink(URL base,String str)
    {
    }

  }


  
  /**
   * Logs an event: the entry is prefixed with the current date and a colon,
   * printed to stdout, and appended to the log writer {@code fileOut} on a
   * new line, which is flushed straight away. A failure to write the log
   * file is reported on stdout only.
   *
   * @param entry the message to log
   */
  public void log(String entry)
  {
    final String stampedEntry = new Date() + ":" + entry;
    System.out.println(stampedEntry);

    try {
      // the line break goes BEFORE the entry, so the file never ends in one
      fileOut.newLine();
      fileOut.write(stampedEntry, 0, stampedEntry.length());
      fileOut.flush();
    } catch (IOException writeFailure) {
      System.out.println("Function log [Spider.java 507] IO error!");
    }
  }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -