⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 procurl.java

📁 针对音乐论坛的爬虫程序 给出地址匹配特征
💻 JAVA
字号:
package procURL;

import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.Collection;
import java.net.MalformedURLException;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;


/**
 * Collects board ("block") page URLs and topic page URLs from the links found
 * in forum HTML pages.
 *
 * <p>The class works in two phases, selected by an internal flag:
 * <ol>
 *   <li>While the flag is {@code true} (the initial state), every link seen by
 *       {@link #processURL(URL)} is passed to
 *       {@link Parser#filterBlockPageURL(URL)}, which records board page
 *       numbers.</li>
 *   <li>{@link #getBlockPageURL()} turns those numbers into board page URLs and
 *       clears the flag; from then on links are passed to
 *       {@link Parser#filterTopicPageURL(URL)}, which records topic ids and
 *       page numbers. {@link #getTopicPageURL()} then builds the topic page
 *       URLs.</li>
 * </ol>
 *
 * <p>The protected collection fields are intentionally left as raw
 * {@code Collection}s so that existing subclasses keep compiling; the typed
 * getters are the preferred way to access them.
 */
public class ProcURL {

    /** Prefix of every topic page URL generated by {@link #getTopicPageURL()}. */
    private static final String TOPIC_URL_PREFIX = "http://bbs.breezecn.com/read.php?tid=";

    /**
     * Board page URL up to and including the last {@code '='}, i.e. everything
     * except the page number.
     */
    private String urlhead;

    /**
     * {@code true} while board page numbers are still being collected; set to
     * {@code false} by {@link #getBlockPageURL()}, after which links are
     * filtered as topic links.
     */
    private boolean blockListFlag = true;

    /** Board page URLs waiting to be processed. */
    protected Collection blockList = new ArrayList(3);

    /** Topic page URLs waiting to be processed. */
    protected Collection topicList = new ArrayList(3);

    /** Page numbers extracted from board page links. */
    protected Collection blockPageNumList = new ArrayList(3);

    /** Page numbers seen so far for the multi-page topic currently being read. */
    protected Collection topicPageNumList = new ArrayList(3);

    /** Topic ids (tid) of multi-page topics, in the order they were first seen. */
    protected Collection tidList = new ArrayList(3);

    /** Topic ids (tid) of topics linked without a page parameter. */
    protected Collection onePageList = new ArrayList(3);

    /**
     * Last page number recorded for each multi-page topic; entry {@code i}
     * belongs to entry {@code i} of {@link #tidList}.
     */
    protected Collection maxPageList = new ArrayList(100);

    /**
     * Returns the board page URLs waiting to be processed.
     *
     * @return the live board URL collection
     */
    @SuppressWarnings("unchecked")
    public Collection<URL> getBlockList() {
        return blockList;
    }

    /**
     * Returns the topic page URLs waiting to be processed.
     *
     * @return the live topic URL collection
     */
    @SuppressWarnings("unchecked")
    public Collection<URL> getTopicList() {
        return topicList;
    }

    /**
     * Returns the ids of the multi-page topics found so far.
     *
     * @return the live tid collection
     */
    @SuppressWarnings("unchecked")
    public Collection<Integer> getTidList() {
        return tidList;
    }

    /**
     * Returns the ids of the topics that were linked without a page parameter.
     *
     * @return the live single-page tid collection
     */
    public Collection getOnePageList() {
        return onePageList;
    }

    /**
     * Returns the last page number recorded for each multi-page topic.
     *
     * @return the live collection, index-aligned with {@link #getTidList()}
     */
    @SuppressWarnings("unchecked")
    public Collection<Object> getMaxPageList() {
        return maxPageList;
    }

    /**
     * Returns the page numbers extracted from board page links.
     *
     * @return the live board page number collection
     */
    @SuppressWarnings("unchecked")
    public Collection<Integer> getBlockPageNumList() {
        return blockPageNumList;
    }

    /**
     * Returns the page numbers recorded for the multi-page topic currently
     * being read.
     *
     * @return the live topic page number collection
     */
    @SuppressWarnings("unchecked")
    public Collection<Integer> getTopicPageNumList() {
        return topicPageNumList;
    }

    /**
     * Downloads {@code url}, parses it as HTML and feeds every link found in
     * it to the filter selected by the current phase.
     *
     * <p>Content whose reported type does not start with {@code text/} is
     * skipped. I/O failures are reported on standard output and otherwise
     * ignored so that one bad page does not stop a crawl.
     *
     * @param url the page to download and scan for links
     */
    public void processURL(URL url) {
        Reader reader = null;
        try {
            URLConnection connection = url.openConnection();
            System.out.println("Processing: " + url);

            String contentType = connection.getContentType();
            if ((contentType != null) && !contentType.toLowerCase().startsWith("text/")) {
                System.out.println("Not processing because content type is: " + contentType);
                return;
            }

            // NOTE(review): the reader uses the platform default charset, as
            // before; only link attributes are extracted from the page.
            reader = new InputStreamReader(connection.getInputStream());
            HTMLEditorKit.Parser parse = new HTMLParse().getParser();
            parse.parse(reader, new Parser(url), true);
        } catch (IOException e) {
            System.out.println("Error: " + url);
            System.out.println("Cause: " + e);
        } finally {
            // Always release the connection's stream, also when parsing fails.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Parser callback that extracts the link target of every tag and hands it
     * to the filter for the current phase.
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {

        /** URL of the page being parsed; relative links are resolved against it. */
        protected URL base;

        /**
         * @param base URL of the page being parsed
         */
        public Parser(URL base) {
            this.base = base;
        }

        /**
         * Extracts the {@code href} attribute (or {@code src} for a frame),
         * strips any fragment and forwards the result to
         * {@link #handleLink(URL, String)}. Tags without a link are ignored.
         */
        @Override
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = (String) a.getAttribute(HTML.Attribute.HREF);

            if ((href == null) && (t == HTML.Tag.FRAME)) {
                href = (String) a.getAttribute(HTML.Attribute.SRC);
            }
            if (href == null) {
                return;
            }

            int fragment = href.indexOf('#');
            if (fragment != -1) {
                href = href.substring(0, fragment);
            }

            handleLink(base, href);
        }

        /** Start tags are handled exactly like simple tags. */
        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos);
        }

        /**
         * Resolves {@code str} against {@code base} and passes the resulting
         * URL to the board filter or the topic filter, depending on the
         * current phase. Malformed links are reported and skipped.
         *
         * @param base URL of the page containing the link
         * @param str  link target as written in the page
         */
        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                System.out.println("URL" + url);

                if (blockListFlag) {
                    filterBlockPageURL(url);
                } else {
                    filterTopicPageURL(url);
                }
            } catch (MalformedURLException e) {
                System.out.println("Found malformed URL: " + str);
            }
        }

        /**
         * Records the page number of a board page link such as
         * {@code http://bbs.breezecn.com/thread.php?fid=5&search=&page=1}.
         *
         * <p>A link qualifies when it contains both {@code fid} and
         * {@code &search=&page}. The page number is taken to be the text after
         * the last {@code '='}; the text before it is remembered as the URL
         * head. A page value of {@code e} updates the URL head but records no
         * number. Any other non-numeric value is ignored completely instead of
         * aborting the parse.
         *
         * @param url the resolved link to inspect
         */
        public void filterBlockPageURL(URL url) {
            String link = url.toString();
            if (!link.contains("fid") || !link.contains("&search=&page")) {
                return;
            }

            // The marker above contains '=', so lastIndexOf cannot return -1.
            int lastEq = link.lastIndexOf("=");
            String head = link.substring(0, lastEq + 1);
            String pageText = link.substring(lastEq + 1);

            if (pageText.equals("e")) {
                urlhead = head;
                return;
            }

            Integer page = parseNumber(pageText);
            if (page != null) {
                urlhead = head;
                getBlockPageNumList().add(page);
            }
        }

        /**
         * Records the topic id, and for multi-page topics the page number, of
         * a topic link. Three link shapes are recognised:
         * <ul>
         *   <li>{@code read.php?tid=112735&page=3&fpage=1}: multi-page topic</li>
         *   <li>{@code read.php?tid=112735}: single-page topic</li>
         *   <li>{@code read.php?tid=141834&fpage=2}: single-page topic</li>
         * </ul>
         * Links that contain {@code tid} but whose id or page value is not a
         * number are ignored.
         *
         * @param url the resolved link to inspect
         */
        public void filterTopicPageURL(URL url) {
            String link = url.toString();
            if (!link.contains("tid")) {
                return;
            }

            if (link.contains("&page")) {
                recordMultiPageTopic(link);
            } else {
                recordSinglePageTopic(link);
            }
        }

        /**
         * Handles a link that carries a {@code &page} parameter. The tid is
         * read between the first {@code '='} and the first {@code '&'}; the
         * page is read after the next {@code '='} up to the following
         * {@code '&'} or the end of the link.
         */
        private void recordMultiPageTopic(String link) {
            int firstEq = link.indexOf("=");
            int firstAmp = link.indexOf("&");
            if ((firstEq == -1) || (firstAmp <= firstEq)) {
                return;
            }

            Integer tid = parseNumber(link.substring(firstEq + 1, firstAmp));
            int pageEq = link.indexOf("=", firstAmp);
            if ((tid == null) || (pageEq == -1)) {
                return;
            }

            // The page value ends at the next '&' if there is one; a link
            // without a trailing parameter simply ends with the page value.
            int pageEnd = link.indexOf("&", pageEq);
            String pageText = (pageEnd == -1)
                    ? link.substring(pageEq + 1)
                    : link.substring(pageEq + 1, pageEnd);

            // page=e links are skipped; the last numeric page seen for a
            // topic is treated as its maximum page.
            if (pageText.equals("e")) {
                return;
            }
            Integer page = parseNumber(pageText);
            if (page == null) {
                return;
            }

            if (getTidList().contains(tid)) {
                getTopicPageNumList().add(page);
                return;
            }

            // A new topic starts: close the previous one by moving its last
            // page number to maxPageList.
            if (!getTopicPageNumList().isEmpty()) {
                getMaxPageList().add(lastOf(getTopicPageNumList()));
                getTopicPageNumList().clear();
            }
            getTidList().add(tid);
            // Record the first page as well, so that every tid has at least
            // one page number and maxPageList stays aligned with tidList even
            // when a topic has a single page link.
            getTopicPageNumList().add(page);
        }

        /**
         * Handles a link without a {@code &page} parameter. With
         * {@code &fpage} present the tid is read between the first {@code '='}
         * and the first {@code '&'}; otherwise it is everything after the
         * first {@code '='}.
         */
        @SuppressWarnings("unchecked")
        private void recordSinglePageTopic(String link) {
            int firstEq = link.indexOf("=");
            if (firstEq == -1) {
                return;
            }

            String tidText;
            if (link.contains("&fpage")) {
                int firstAmp = link.indexOf("&");
                if (firstAmp <= firstEq) {
                    return;
                }
                tidText = link.substring(firstEq + 1, firstAmp);
            } else {
                tidText = link.substring(firstEq + 1);
            }

            Integer tid = parseNumber(tidText);
            if (tid != null) {
                getOnePageList().add(tid);
            }
        }
    }

    /**
     * Builds the board page URLs {@code urlhead + 1 .. urlhead + maxPage},
     * where {@code maxPage} is the last recorded board page number, adds them
     * to the board list and switches link filtering to the topic phase.
     *
     * <p>If no board page number has been recorded, nothing is built and the
     * phase is left unchanged.
     */
    public void getBlockPageURL() {
        Collection<Integer> pages = getBlockPageNumList();
        if (pages.isEmpty()) {
            System.out.println("No block page numbers collected; block list not built");
            return;
        }

        int maxPage = Integer.parseInt(lastOf(pages).toString());
        for (int i = 1; i <= maxPage; i++) {
            String blockURL = urlhead + i;
            try {
                getBlockList().add(new URL(blockURL));
            } catch (MalformedURLException e) {
                System.out.println("Found malformed URL: " + blockURL);
            }
        }
        blockListFlag = false;
    }

    /**
     * Builds the topic page URLs from the collected ids and adds them to the
     * topic list: one URL ({@code &page=1}) for every single-page topic that
     * is not also a multi-page topic, and one URL per page from 1 to the
     * recorded last page for every multi-page topic.
     */
    public void getTopicPageURL() {
        // Single-page topics.
        for (Object tid : getOnePageList()) {
            if (getTidList().contains(tid)) {
                continue;
            }
            addTopicPage(TOPIC_URL_PREFIX + tid.toString() + "&page=1");
        }

        // The last multi-page topic has not been closed by a following topic
        // yet, so its last page number is still pending.
        if (!getTopicPageNumList().isEmpty()) {
            getMaxPageList().add(lastOf(getTopicPageNumList()));
        }

        // Multi-page topics. The two lists are walked in parallel; should they
        // differ in length, only the pairs that exist are used.
        Object[] tids = getTidList().toArray();
        Object[] maxPages = getMaxPageList().toArray();
        int topics = Math.min(tids.length, maxPages.length);
        for (int i = 0; i < topics; i++) {
            int lastPage = Integer.parseInt(maxPages[i].toString());
            for (int page = 1; page <= lastPage; page++) {
                addTopicPage(TOPIC_URL_PREFIX + tids[i].toString() + "&page=" + page);
            }
        }
    }

    /**
     * Adds {@code url} to the topic list unless it is already present.
     *
     * @param url the topic page URL to add
     */
    public void addTopicURL(URL url) {
        // NOTE(review): URL.equals may resolve host names; all URLs built by
        // this class share one host, so this is left as it was.
        if (getTopicList().contains(url)) {
            return;
        }
        getTopicList().add(url);
    }

    /**
     * Empties the per-page topic bookkeeping: the tid list, the topic page
     * number list, the max page list and the single-page list.
     */
    public void clear() {
        getTidList().clear();
        getTopicPageNumList().clear();
        getMaxPageList().clear();
        getOnePageList().clear();
    }

    /** Converts {@code spec} to a URL and adds it to the topic list, reporting malformed input. */
    private void addTopicPage(String spec) {
        try {
            addTopicURL(new URL(spec));
        } catch (MalformedURLException e) {
            System.out.println("Found malformed URL: " + spec);
        }
    }

    /** Returns the last element in iteration order, or {@code null} if {@code items} is empty. */
    private static Object lastOf(Collection<?> items) {
        Object last = null;
        for (Object item : items) {
            last = item;
        }
        return last;
    }

    /** Parses a decimal integer, returning {@code null} instead of throwing when {@code text} is not a number. */
    private static Integer parseNumber(String text) {
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -