⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spiderbaixaurl.java

📁 另外一个网络机器人spider源码(java).rar 另外一个网络机器人spider源码(java).rar 另外一个网络机器人spider源码(java).rar
💻 JAVA
字号:
import java.io.*;
import java.net.*;
//import java.util.*;
import java.util.regex.*;
import java.sql.*;

/**
 * Spider worker thread: takes one pending URL from the <code>perVeure</code>
 * table, downloads it, stores its HTML in the <code>vistes</code> table, hands
 * the page to {@link coincidencies}, and queues every "www..." host found in
 * the page as a new pending URL.
 *
 * <p>Tables touched by this class: <code>perVeure</code> (pending URLs),
 * <code>vistes</code> (pages already taken) and <code>links</code> (page to
 * page link records).
 *
 * <p>All SQL issued here uses short-lived {@link PreparedStatement}s created
 * from {@link #conn}; the shared {@link #stmt} is only forwarded to
 * {@link coincidencies}.
 *
 * <p>NOTE(review): the <code>isLocked()</code>/<code>lock()</code> sequence in
 * {@link #sense_zip(URL)} is check-then-act; whether two workers can both pass
 * the check depends on how <code>concurrencia</code> is implemented - confirm.
 */
public class SpiderBaixaURL extends Thread
{
	/**
	 * Host-like tokens beginning with "www". The expression is kept exactly as
	 * it was (including <code>\.*</code> after "www") so the set of harvested
	 * addresses does not change; it is compiled once instead of once per page.
	 */
	private static final Pattern LINK_PATTERN =
		Pattern.compile("(www\\.*)+(([a-z0-9_]|\\-)+\\.)+[a-z]{2,4}");

	/** Looks up the id of a page in <code>vistes</code> by its URL. */
	private static final String SELECT_VISITED_ID = "SELECT id FROM vistes WHERE url=?";

	/** URL this worker is responsible for; stays null when nothing is pending. */
	URL url_a_baixar;
	/** Statement supplied by the creator; forwarded to {@link coincidencies}. */
	Statement stmt = null;
	// The three result-set fields are kept for compatibility with code in the
	// same package; this class now closes its own result sets locally.
	ResultSet rsVistos = null;
	ResultSet rsPerVeure = null;
	ResultSet rs = null;
	/** Shared lock object used while the page is written to the database. */
	concurrencia conc;
	/** True once the page has been stored under the lock. */
	boolean control;
	Connection conn;
	int n_links=0; //number of links the current url has to all pages (own, and other)
	int n_links_other=0; //number of links the page has to other pages
	/** Id in <code>vistes</code> of the page being scanned (0 until looked up). */
	int id_scanning;

	/**
	 * Creates the worker and immediately picks the URL it will download.
	 *
	 * @param conc lock shared between workers
	 * @param conn connection used for every query of this worker
	 * @param stmt statement forwarded to {@link coincidencies}
	 */
	public SpiderBaixaURL(concurrencia conc,Connection conn,Statement stmt)
	{
		this.conc=conc;
		this.conn=conn;
		this.stmt=stmt;

		try
		{
			primerABaixar();
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Resolves the id of the chosen page and processes it. Does nothing when
	 * no URL could be chosen in the constructor.
	 */
	public void run()
	{
		if(url_a_baixar==null)
		{
			System.out.println("No URL to download: nothing was pending.");
			return;
		}

		try
		{
			//we get the id of the page we are scanning
			id_scanning();
			sense_zip(url_a_baixar);
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			// release the result-set fields even when processing failed
			freeResultSets();
		}
	}

	/**
	 * Downloads <code>b</code>, stores the HTML for {@link #url_a_baixar},
	 * runs {@link coincidencies} on it and queues every address found.
	 *
	 * <p>The page is fetched once; both the raw HTML and the
	 * {@link HTML2ASCII} text are produced from the same bytes. If the
	 * download fails with an {@link IOException} the URL is still marked as
	 * visited so it is not retried.
	 *
	 * @param b location to read the page from
	 */
	public void sense_zip(URL b)
	{
		try
		{
			System.out.println("Downloading Page..."+url_a_baixar.toString());

			byte[] raw = downloadBytes(b);
			// platform default charset, as the previous InputStreamReader used
			String a = new String(raw);

			// plain-text version of the same bytes, lower-cased
			HTML2ASCII d = new HTML2ASCII(new ByteArrayInputStream(raw));
			StringBuffer plain = new StringBuffer();
			int i;
			while((i=d.read())!=-1)
			{
				plain.append((char)i);
			}
			String ascii = plain.toString().toLowerCase();

			// concurrency control: poll the shared lock every 5 seconds
			control=false;
			while(!control)
			{
				System.out.print(".");
				if(!conc.isLocked())
				{
					conc.lock();
					try
					{
						afegirHTML(a,url_a_baixar.toString());
						// constructed for its side effects only
						new coincidencies(url_a_baixar,ascii,a,conn,stmt);
					}
					finally
					{
						// never leave the shared lock held after a failure
						conc.unLock();
					}
					control=true;
				}
				else
				{
					System.out.print(".");
					Thread.sleep(5000);
				}
			}

			System.out.println("Pagina en mem\u00f2ria.");

			Matcher m = LINK_PATTERN.matcher(a);
			while(m.find())
			{
				String nova_adreca = m.group();

				// make sure the address starts with http://
				if(!nova_adreca.startsWith("http://"))
				{
					nova_adreca="http://"+nova_adreca;
				}

				// queue each address that was found
				afegirNovaURL(nova_adreca);
			}

			linkCounter();
		}
		catch(IOException e)
		{
			posarAVistos(url_a_baixar.toString());
			System.out.println("La URL "+url_a_baixar.toString()+" no existeix.");
		}
		catch(InterruptedException e)
		{
			// keep the interrupt visible to whoever manages this thread
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			closeQuietly(rs);
			rs = null;
		}
	}

	/**
	 * Removes every match of a regular expression from a text.
	 *
	 * @param pattern regular expression to delete (for example a tag pattern)
	 * @param text    text to clean
	 * @return the text without the matches, or the unchanged text when the
	 *         replacement fails (the error is printed)
	 */
	public String eliminaExpresion(String pattern,String text)
	{
		try
		{
			return text.replaceAll(pattern,"");
		}
		catch(Exception e)
		{
			System.out.println("ERROR: Replacing Pattern!");
			e.printStackTrace();
			return text;
		}
	}

	/**
	 * Records a URL in <code>vistes</code> unless it is already there, then
	 * removes {@link #url_a_baixar} from <code>perVeure</code>.
	 *
	 * @param novaAdreca URL to mark as visited
	 */
	public void posarAVistos(String novaAdreca)
	{
		try
		{
			if(!rowExists(SELECT_VISITED_ID,novaAdreca))
			{
				executeWithUrl("insert into vistes (url) values (?)",novaAdreca);
			}
		}
		catch(SQLException ex)
		{
			logSqlError(ex,false);
		}

		// remove it from the pending table
		eliminaPerVeure();
	}

	/** Deletes {@link #url_a_baixar} from the <code>perVeure</code> table. */
	public void eliminaPerVeure()
	{
		try
		{
			executeWithUrl("DELETE FROM perVeure where url =?",url_a_baixar.toString());
		}
		catch(SQLException ex)
		{
			logSqlError(ex,true);
		}
	}

	/**
	 * Loads into {@link #id_scanning} the id that {@link #url_a_baixar} has in
	 * <code>vistes</code>. The field is left untouched when the URL is absent.
	 */
	public void id_scanning()
	{
		PreparedStatement query = null;
		ResultSet row = null;
		try
		{
			query = conn.prepareStatement(SELECT_VISITED_ID);
			query.setString(1,url_a_baixar.toString());
			row = query.executeQuery();

			if(row.next())
			{
				id_scanning=row.getInt("id");//we have the id
			}
			else
			{
				System.out.println("No id found in vistes for "+url_a_baixar.toString());
			}
		}
		catch(SQLException ex)
		{
			logSqlError(ex,true);
		}
		finally
		{
			closeQuietly(row);
			closeQuietly(query);
		}
	}

	/**
	 * Stores <code>n_links - n_links_other</code> in the
	 * <code>n_links_propis</code> column of the page being scanned.
	 */
	public void linkCounter()
	{
		PreparedStatement update = null;
		try
		{
			int propis=n_links-n_links_other;

			update = conn.prepareStatement("UPDATE vistes SET n_links_propis=? WHERE id=?");
			update.setInt(1,propis);
			update.setInt(2,id_scanning);
			update.executeUpdate();
		}
		catch(SQLException ex)
		{
			logSqlError(ex,true);
		}
		finally
		{
			closeQuietly(update);
		}
	}

	/**
	 * Records in <code>links</code> that the page being scanned links to
	 * <code>newLink</code>. Nothing is stored when the target is not yet in
	 * <code>vistes</code> (its id is unknown) or when it is the page itself.
	 *
	 * <p>Each call inserts a new row whose <code>num</code> is the number of
	 * rows already present for the pair plus one.
	 *
	 * @param newLink URL the current page links to
	 */
	public void linkA(String newLink)
	{
		PreparedStatement idQuery = null;
		PreparedStatement countQuery = null;
		PreparedStatement insert = null;
		ResultSet idRow = null;
		ResultSet countRow = null;

		try
		{
			//have we already visited the page?
			idQuery = conn.prepareStatement(SELECT_VISITED_ID);
			idQuery.setString(1,newLink);
			idRow = idQuery.executeQuery();

			if(!idRow.next())
			{
				// we haven't visited the linked page yet, so there is no id to use
				// TODO in a future!
				System.out.println("Link a: "+newLink+" no ha estat possible");
				return;
			}

			int id_scanned=idRow.getInt("id");
			if(id_scanned==id_scanning)
			{
				return;
			}

			//how many times have we linked to this page
			countQuery = conn.prepareStatement(
				"SELECT COUNT(*) FROM links WHERE pagina_els_rep=? AND pagina_envia=?");
			countQuery.setInt(1,id_scanned);
			countQuery.setInt(2,id_scanning);
			countRow = countQuery.executeQuery();

			int n_times=1;
			if(countRow.next())
			{
				n_times=countRow.getInt(1)+1;
			}

			insert = conn.prepareStatement(
				"insert into links (pagina_els_rep,pagina_envia,num) values (?,?,?)");
			insert.setInt(1,id_scanned);
			insert.setInt(2,id_scanning);
			insert.setInt(3,n_times);
			insert.executeUpdate();
		}
		catch(SQLException ex)
		{
			logSqlError(ex,true);
		}
		finally
		{
			closeQuietly(countRow);
			closeQuietly(idRow);
			closeQuietly(insert);
			closeQuietly(countQuery);
			closeQuietly(idQuery);
		}
	}

	/**
	 * Registers a link found in the current page: records it through
	 * {@link #linkA(String)}, counts it, and queues it in
	 * <code>perVeure</code> when it is in neither <code>perVeure</code> nor
	 * <code>vistes</code>.
	 *
	 * @param novaAdreca address found in the page
	 */
	public void afegirNovaURL(String novaAdreca)
	{
		linkA(novaAdreca);
		n_links++;

		if(hiEs(novaAdreca,"perVeure")||hiEs(novaAdreca,"vistes"))
		{
			return;
		}

		n_links_other++;
		System.out.println("AFEGIDA "+novaAdreca);

		try
		{
			// add the new URL
			executeWithUrl("insert into perVeure (url) values (?)",novaAdreca);
		}
		catch(SQLException ex)
		{
			logSqlError(ex,false);
		}
	}

	/**
	 * Chooses the pending URL with the lowest id, stores it in
	 * {@link #url_a_baixar} and marks it as visited.
	 *
	 * <p>Only one row is fetched. This selects the same row as the previous
	 * "ORDER BY id DESC, take the last row" provided ids are unique
	 * (presumably <code>id</code> is the table key - confirm).
	 */
	public void primerABaixar()
	{
		PreparedStatement query = null;
		ResultSet oldest = null;
		try
		{
			query = conn.prepareStatement("SELECT * FROM perVeure ORDER BY id ASC");
			query.setMaxRows(1);
			oldest = query.executeQuery();

			if(oldest.next())
			{
				// second column of perVeure holds the url, as before
				url_a_baixar=new URL(oldest.getString(2));

				System.out.println("A Baixar!: "+url_a_baixar.toString());
				posarAVistos(url_a_baixar.toString());
			}
			else
			{
				System.out.println("perVeure is empty: nothing to download.");
			}
		}
		catch(SQLException ex)
		{
			logSqlError(ex,false);
		}
		catch(Exception ex )
		{
			ex.printStackTrace();
		}
		finally
		{
			closeQuietly(oldest);
			closeQuietly(query);
		}
	}

	/**
	 * Tells whether a URL is present in a table.
	 *
	 * @param url   URL to look for (bound as a parameter)
	 * @param taula table name; inserted into the SQL text as is because
	 *              identifiers cannot be bound, so it must come from trusted
	 *              code only
	 * @return true when at least one row has that url; false when none has or
	 *         when the query fails (the error is printed)
	 */
	public boolean hiEs(String url,String taula)
	{
		try
		{
			return rowExists("SELECT * FROM "+taula+" WHERE url=?",url);
		}
		catch(SQLException ex)
		{
			logSqlError(ex,false);
			return false;
		}
	}

	/**
	 * Writes the HTML of a page into the third column of its row in
	 * <code>vistes</code>.
	 *
	 * @param a   HTML text to store
	 * @param url URL identifying the row
	 */
	public void afegirHTML(String a,String url)
	{
		PreparedStatement query = null;
		ResultSet row = null;
		try
		{
			// the target column is only known by position, so an updatable
			// result set is still used instead of an UPDATE statement
			query = conn.prepareStatement("SELECT * FROM vistes WHERE url =?",
				ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
			query.setString(1,url);
			row = query.executeQuery();

			if(row.next())
			{
				row.updateString(3,a);
				row.updateRow(); // updates the row in the data source
			}
			else
			{
				System.out.println("No row in vistes for "+url+"; HTML not stored.");
			}
		}
		catch(SQLException ex)
		{
			logSqlError(ex,false);
		}
		finally
		{
			closeQuietly(row);
			closeQuietly(query);
		}
	}

	/** Closes and clears the three result-set fields of this worker. */
	public void freeResultSets()
	{
		closeQuietly(rs);
		rs = null;

		closeQuietly(rsPerVeure);
		rsPerVeure = null;

		closeQuietly(rsVistos);
		rsVistos = null;
	}

	/**
	 * Counts the rows remaining in a result set by advancing it to the end.
	 *
	 * @param rs result set to count; its cursor is left after the last row
	 * @return number of rows traversed before the end or before an error
	 */
	public int countItems(ResultSet rs)
	{
		int i=0;
		try
		{
			while(rs.next())
			{
				i++;
			}
		}
		catch(Exception e)
		{
			System.out.println("ERROR: while counting items");
			e.printStackTrace();
		}
		return i;
	}

	/** Reads a URL completely into memory and closes the stream. */
	private byte[] downloadBytes(URL source) throws IOException
	{
		InputStream in = source.openStream();
		try
		{
			ByteArrayOutputStream out = new ByteArrayOutputStream();
			byte[] chunk = new byte[8192];
			int read;
			while((read=in.read(chunk))!=-1)
			{
				out.write(chunk,0,read);
			}
			return out.toByteArray();
		}
		finally
		{
			try
			{
				in.close();
			}
			catch(IOException ignored)
			{
				// nothing useful can be done if closing the download fails
			}
		}
	}

	/** Runs a query with one string parameter and reports whether it returns a row. */
	private boolean rowExists(String sql,String url) throws SQLException
	{
		PreparedStatement query = null;
		ResultSet rows = null;
		try
		{
			query = conn.prepareStatement(sql);
			query.setMaxRows(1); // existence only needs one row
			query.setString(1,url);
			rows = query.executeQuery();
			return rows.next();
		}
		finally
		{
			closeQuietly(rows);
			closeQuietly(query);
		}
	}

	/** Executes an insert/delete statement that takes a single url parameter. */
	private void executeWithUrl(String sql,String url) throws SQLException
	{
		PreparedStatement update = null;
		try
		{
			update = conn.prepareStatement(sql);
			update.setString(1,url);
			update.executeUpdate();
		}
		finally
		{
			closeQuietly(update);
		}
	}

	/** Prints the message, SQL state and vendor code of an SQL error. */
	private static void logSqlError(SQLException ex,boolean withStackTrace)
	{
		System.out.println("SQLException: " + ex.getMessage());
		System.out.println("SQLState: " + ex.getSQLState());
		System.out.println("VendorError: " + ex.getErrorCode());
		if(withStackTrace)
		{
			ex.printStackTrace();
		}
	}

	/** Closes a result set if present; close failures are deliberately ignored. */
	private static void closeQuietly(ResultSet resultSet)
	{
		if(resultSet!=null)
		{
			try
			{
				resultSet.close();
			}
			catch(SQLException ignored)
			{
				// best-effort cleanup
			}
		}
	}

	/** Closes a statement if present; close failures are deliberately ignored. */
	private static void closeQuietly(Statement statement)
	{
		if(statement!=null)
		{
			try
			{
				statement.close();
			}
			catch(SQLException ignored)
			{
				// best-effort cleanup
			}
		}
	}

}



⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -