⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mycrawlerframe.java

📁 java 开发的网页爬虫
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
			});
		thread.start();
		try
		{
			Thread.sleep(2000);
		}
		catch(Exception e)
		{
			
		}
	}
	
	//Perform the actual crawl: repeatedly take the URL at the head of the
	//to-crawl list, verify it, honour robots rules, download and store the
	//page, then queue the links found on it.  Runs until the crawling flag
	//is cleared, the frontier empties, or maxUrls pages have been crawled.
	//startUrl - the URL crawling begins from
	//maxUrls  - maximum number of pages to crawl, or -1 for no limit
	private void crawlAction(String startUrl,int maxUrls)
	{
		//Set up crawl lists (raw collection types kept to match the file)
		HashSet crawledList = new HashSet();             //pages already fetched
		LinkedHashSet toCrawlList = new LinkedHashSet(); //FIFO frontier
		HashSet notHostLink = new HashSet();             //links on a foreign host
		HashSet gt30kbList = new HashSet();              //links whose page is > 30 KB
		
		//Add start URL to the to-crawl list
		toCrawlList.add(startUrl);
		addResult(getTimeStamp(),"Add to list",startUrl);
		
		int i = 0;
		//Perform actual crawling by looping through the to-crawl list
		while(crawling && toCrawlList.size() > 0)
		{
			//Stop once the max URL count has been reached (>= rather than
			//the original == guards against any accidental overshoot)
			if(maxUrls != -1 && crawledList.size() >= maxUrls)
			{
				break;
			}
			
			//Take the URL at the head of the list.  BUG FIX: the original
			//called toCrawlList.iterator() twice, creating two independent
			//iterators; one iterator also lets us remove the entry directly.
			Iterator iter = toCrawlList.iterator();
			if(!iter.hasNext())
			{
				break;
			}
			String url = (String)iter.next();
			iter.remove();
			
			//Convert the string url to a URL object; skip it when invalid.
			//BUG FIX: the original passed a null verifiedUrl on to
			//isRobotAllowed/downloadPage/writePage, where pageUrl.toString()
			//threw an uncaught NullPointerException and killed the thread.
			URL verifiedUrl = verifyUrl(url);
			addResult(getTimeStamp(),"verify",url);
			if(verifiedUrl == null)
			{
				continue;
			}
			
			//Skip URL if robots are not allowed to access it
			if(!isRobotAllowed(verifiedUrl))
			{
				continue;
			}
			
			//Update crawling stats
			updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls,notHostLink.size(),gt30kbList.size());
			
			//Add page to the crawled list
			crawledList.add(url);
			
			//Download the page at the given URL
			String pageContents = downloadPage(verifiedUrl);
			addResult(getTimeStamp(),"process",url);
			
			//If the page was downloaded successfully, save it and retrieve
			//all of its links
			if(pageContents != null && pageContents.length()>0)
			{
				//BUG FIX: only write pages that actually downloaded; the
				//original called writePage even when pageContents was null
				writePage(pageContents,verifiedUrl,Integer.toString(i));
				i++;
				
				//Check the robots meta tag: is following links on this
				//page allowed?
				if(!allowFollow(pageContents,verifiedUrl))
				{
					continue;
				}
				
				ArrayList links = retrieveLinks(verifiedUrl,pageContents,crawledList,notHostLink,gt30kbList);
				
				//Add links to the to-crawl list
				toCrawlList.addAll(links);
			}
			
			//Update crawling stats
			updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls,notHostLink.size(),gt30kbList.size());
		}
	}
	
	
	//Check the page's robots meta tag to see whether following links on it
	//is allowed.  Returns true when no robots meta directive forbids it.
	private boolean allowFollow(String pageContents,URL verifiedUrl)
	{
		Pattern p = Pattern.compile("<meta\\s+name\\s*=\\s*\"robots\"\\s*\"?(.*?)[\"|>]",
				Pattern.CASE_INSENSITIVE);
		Matcher m= p.matcher(pageContents);
		
		while(m.find())
		{
			//Lowercase so NOFOLLOW / NoFollow etc. are also recognised
			String content = m.group(1).trim().toLowerCase();
			
			//BUG FIX: the original tested content.indexOf("follow") != -1,
			//which also matched "nofollow" and so ALLOWED exactly the pages
			//that asked not to be followed.  Test the forbidding directives
			//explicitly instead.
			if(content.indexOf("nofollow") != -1 || content.indexOf("none") != -1)
			{
				return false;
			}
			//"all", "follow", "index", etc. all permit following
			return true;
		}
		//No robots meta tag present: following is allowed by default
		return true;
	}
	
	
	
	//Append one crawl event (timestamp, action, url) to the result table
	//and mirror it as a tab-separated line in the log file.
	private void addResult(String actionTime,String action,String url)
	{
		//Row goes into the Swing result table first
		DefaultTableModel tableModel = (DefaultTableModel) resultTable.getModel();
		tableModel.addRow(new Object[]{actionTime,action,url});
		
		//Then the same event is written to the log file
		String logLine = actionTime + "\t" + action + "\t" + url;
		try
		{
			logFileWriter.println(logLine);
		}
		catch(Exception e)
		{
			showError("Unable to log file");
		}
	}
	
	//URL overload of addResult: append one crawl event (timestamp, action,
	//url) to the result table and to the log file.
	private void addResult(String actionTime,String action,URL url)
	{
		//Row goes into the Swing result table first
		((DefaultTableModel) resultTable.getModel())
				.addRow(new Object[]{actionTime,action,url});
		
		//Then the same event is written to the log file, tab-separated
		try
		{
			logFileWriter.println(actionTime + "\t" + action + "\t" + url);
		}
		catch(Exception e)
		{
			showError("Unable to log file");
		}
	}
	
	//Return the current time as a "yyyy-mm-dd hh:mm:ss.fffffffff" string
	//(java.sql.Timestamp's toString format).
	private String getTimeStamp()
	{
		//The intermediate Date of the original added nothing: Timestamp can
		//be built directly from the current millisecond clock.
		return new Timestamp(System.currentTimeMillis()).toString();
	}
	
	
	//Parse the page contents and return the links that should be crawled
	//next.  Links that are empty, anchors, mailto/javascript, invalid, on a
	//different host, oversized (> 30 KB), or already crawled are filtered
	//out; foreign-host and oversized links are recorded in notHostLink /
	//gt30kbList so each is reported only once.
	private ArrayList retrieveLinks(URL pageUrl,String pageContents,HashSet crawledList,HashSet notHostLink,
							HashSet gt30kbList)
	{
		//Compile link matching pattern
		Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",Pattern.CASE_INSENSITIVE);
		Matcher m= p.matcher(pageContents);
		
		//Create list of link matches
		ArrayList linkList = new ArrayList();
		while(m.find())
		{
			String link = m.group(1).trim();
			
			//Skip empty links
			if(link.length()<1)
			{
				continue;
			}
			
			//Skip links that are just page anchors
			if(link.charAt(0)=='#')
			{
				continue;
			}
			
			//Skip mailto links (lowercased for consistency with the
			//javascript check below; the original check was case-sensitive)
			if(link.toLowerCase().indexOf("mailto")!= -1)
			{
				continue;
			}
			
			//Skip javascript links
			if(link.toLowerCase().indexOf("javascript")!=-1)
			{
				continue;
			}
			
			//Resolve relative URLs against the page URL.  BUG FIX: the
			//original hand-rolled "./" / "../" handling was broken — the
			//"./" branch inserted the page's full file path (including the
			//file name) right after the host, producing URLs like
			//http://host//dir/page.html/link.html.  java.net.URL's
			//two-argument constructor implements the standard resolution
			//rules correctly.
			if(link.indexOf("://")==-1)
			{
				try
				{
					link = new URL(pageUrl,link).toString();
				}
				catch(Exception e)
				{
					//Unresolvable relative link: skip it
					continue;
				}
			}
			
			//Remove anchors from link
			int index = link.indexOf('#');
			if(index!=-1)
			{
				link = link.substring(0,index);
			}
			
			//Verify link and skip if invalid
			URL verifiedLink = verifyUrl(link);
			if(verifiedLink == null)
			{
				continue;
			}
			
			//The link must be on the same host as the page.  The first DNS
			//label is stripped from both sides so e.g. www.foo.com and
			//bbs.foo.com compare equal — NOTE(review): this treats hosts as
			//same-domain rather than strictly same-host; kept as designed.
			String linkHost = verifiedLink.getHost().toLowerCase();
			int indexofpoint;
			if((indexofpoint=linkHost.indexOf("."))!= -1)
			{
				linkHost=linkHost.substring(indexofpoint+1);
			}
			String pageHost = pageUrl.getHost().toLowerCase();
			if((indexofpoint=pageHost.indexOf("."))!= -1)
			{
				pageHost = pageHost.substring(indexofpoint+1);
			}
			
			//Foreign-host link: record it once and skip it
			if(!pageHost.equalsIgnoreCase(linkHost))
			{
				if(! notHostLink.contains(verifiedLink))
				{
					notHostLink.add(verifiedLink);
					addResult(getTimeStamp(),"link not this host",verifiedLink);
				}
				continue;
			}
			
			//If the target page is larger than 30 KB, record it once and do
			//not download it.  NOTE(review): this issues a network request
			//per link just to read Content-Length; kept as designed.
			try
			{
				if(verifiedLink.openConnection().getContentLength()>1024*30)
				{
					if(!gt30kbList.contains(verifiedLink))
					{
						gt30kbList.add(verifiedLink);
						addResult(getTimeStamp(),"page size > 30 KB",verifiedLink);
					}
					continue;
				}
			}
			catch(IOException e)
			{
				e.printStackTrace();
			}
			
			//Skip link if it has already been crawled
			if(crawledList.contains(link))
			{
				continue;
			}
			
			//Add link to list
			linkList.add(link);
			addResult(getTimeStamp(),"Add to list",link);
		}
		return (linkList);
	}
	
	//Check whether the host's robots.txt permits crawling the given URL.
	//The per-host disallow list is cached in disallowListCache.  Returns
	//true when crawling is allowed (including when robots.txt is missing).
	private boolean isRobotAllowed(URL urlToCheck)
	{
		//BUG FIX: the original caught NullPointerException as control flow;
		//a null URL has nothing to disallow
		if(urlToCheck == null)
		{
			return true;
		}
		String host = urlToCheck.getHost().toLowerCase();
		
		//Retrieve host's disallow list from cache
		ArrayList disallowList = (ArrayList)disallowListCache.get(host);
		
		//If the list is not in the cache, download and cache it
		if(disallowList == null)
		{
			disallowList = new ArrayList();
			
			BufferedReader reader = null;
			try
			{
				URL robotsFileUrl = new URL("http://"+host+"/robots.txt");
				
				//Open connection to robots file for reading
				reader = new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
				
				//Read robots file, creating the list of disallowed paths
				String line;
				while((line=reader.readLine())!=null)
				{
					if(line.indexOf("Disallow:")== 0)
					{
						String disallowPath = line.substring("Disallow:".length());
						
						//BUG FIX: the original called substring(...) to strip
						//a trailing "#" comment but discarded the result
						int commentIndex = disallowPath.indexOf("#");
						if(commentIndex != -1)
						{
							disallowPath = disallowPath.substring(0,commentIndex);
						}
						
						//Remove leading or trailing spaces from disallow path
						disallowPath = disallowPath.trim();
						
						//BUG FIX: an empty "Disallow:" line means "allow
						//everything"; adding "" to the list would have made
						//startsWith() below block every URL on the host
						if(disallowPath.length() > 0)
						{
							disallowList.add(disallowPath);
						}
					}
				}
				
				//BUG FIX: the original never stored the downloaded list, so
				//robots.txt was re-fetched for every single URL on the host
				disallowListCache.put(host,disallowList);
			}
			catch(Exception e)
			{
				//Assume the robot is allowed, since an exception is thrown
				//when the robots.txt file doesn't exist
				return true;
			}
			finally
			{
				//BUG FIX: the original leaked the reader (and its socket)
				if(reader != null)
				{
					try
					{
						reader.close();
					}
					catch(IOException ignored)
					{
						//best-effort close; nothing useful to do here
					}
				}
			}
		}
		
		//Loop through the disallow list to see if crawling the given URL
		//is allowed
		String file =urlToCheck.getFile();
		for(int i= 0;i<disallowList.size();i++)
		{
			String disallow = (String)disallowList.get(i);
			if(file.startsWith(disallow))
			{
				return false;
			}
		}
		return true;
	}
	
	//Refresh the status labels with the latest crawl statistics.
	//Note: maxUrls is currently unused but kept for interface stability.
	private void updateStats(String crawling,int crawled,int toCrawl,int maxUrls,int notHostLinkSize,int gt30kbListSize)
	{
		crawlingLabel2.setText(crawling);
		crawledLabel2.setText(" " + String.valueOf(crawled));
		toCrawlLabel2.setText(" " + String.valueOf(toCrawl));
		notHostLinkLabel2.setText(" " + String.valueOf(notHostLinkSize));
		gt30kbLinkLabel2.setText(" " + String.valueOf(gt30kbListSize));
	}
	
//	Verify URL format,
	private URL verifyUrl(String url)
	{

		//only allow http urls
		try
		{
			if(!(url.toLowerCase()).startsWith("http://"))
			{
				return null;
			}
			
		}
		catch(NullPointerException e)
		{
			
		}
			
		URL verifiedUrl = null;
		try
		{
			verifiedUrl = new URL(url);
			URLConnection verifiedConnection = verifiedUrl.openConnection();
			verifiedConnection.setRequestProperty("User-Agent", "Test Crawler for Course IR");
		}
		catch(Exception e)
		{
			return null;
		}
		return verifiedUrl;
	}
	
	//Pop up a modal error dialog showing the given message.
	private void showError(String message)
	{
		final String title = "Error";
		JOptionPane.showMessageDialog(this, message, title, JOptionPane.ERROR_MESSAGE);
	}
	

	//Download the page at the given URL and return its contents as a
	//string, or null on any failure.
	private String downloadPage(URL pageUrl)
	{
		BufferedReader reader = null;
		try
		{
			//Open connection to URL for reading
			reader = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
			
			//Read page into buffer
			String line;
			StringBuffer pageBuffer = new StringBuffer();
			while((line=reader.readLine())!= null)
			{
				//BUG FIX: the original dropped the line terminators, fusing
				//adjacent lines together ("foo\nbar" became "foobar") in the
				//saved page and hiding tags split across lines from the
				//link-matching regex
				pageBuffer.append(line).append('\n');
			}
			return pageBuffer.toString();
		}
		catch(Exception e)
		{
			return null;
		}
		finally
		{
			//BUG FIX: the original never closed the reader, leaking the
			//underlying stream/socket for every downloaded page
			if(reader != null)
			{
				try
				{
					reader.close();
				}
				catch(IOException ignored)
				{
					//best-effort close; nothing useful to do here
				}
			}
		}
	}
	//Write a downloaded page to <cwd>/tmp/<fileName>.txt, with the page URL
	//on the first line followed by the page contents.
	private void writePage(String bufferString,URL pageUrl,String fileName)
	{
		String sep = System.getProperty("file.separator");
		String path = System.getProperty("user.dir") + sep + "tmp" + sep;
		
		//BUG FIX: the original failed silently when the tmp directory did
		//not exist; create it (and any missing parents) first
		new File(path).mkdirs();
		
		PrintWriter pw = null;
		try
		{
			pw = new PrintWriter(new FileWriter(path + fileName + ".txt"));
			pw.println(pageUrl.toString());
			pw.print(bufferString);
		}
		catch(IOException e)
		{
			//BUG FIX: the original swallowed the failure without a trace
			showError("Unable to write page file: " + fileName);
		}
		finally
		{
			//Close on every path, not only on success
			if(pw != null)
			{
				pw.close();
			}
		}
	}
	
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -