📄 spiderbaixaurl.java
字号:
import java.io.*;
import java.net.*;
//import java.util.*;
import java.util.regex.*;
import java.sql.*;
/**
 * Crawler worker thread.
 *
 * On construction the worker picks one URL from the <code>perVeure</code>
 * (pending) table, records it in the <code>vistes</code> (visited) table and
 * removes it from the pending table. When the thread runs, it downloads that
 * page, stores the HTML in the <code>vistes</code> row, hands the page to
 * <code>coincidencies</code>, extracts host names from the HTML, queues the
 * ones not seen before and records link statistics.
 *
 * All SQL values are bound through <code>PreparedStatement</code>s because the
 * URLs come from downloaded (untrusted) pages. Every query uses its own
 * statement and closes it, so queries no longer invalidate each other's
 * result sets the way they did when they all shared one <code>Statement</code>.
 *
 * NOTE(review): the code stays at the language level the rest of the file uses
 * (no try-with-resources, generics or annotations).
 */
public class SpiderBaixaURL extends Thread
{
    /**
     * Host-name matcher applied to the raw HTML. It only recognises lower-case
     * names that start with "www".
     * NOTE(review): "www\\.*" accepts "www" followed by any number of dots
     * (including none); "www\\." was probably intended. Left unchanged so the
     * set of discovered links stays the same.
     */
    private static final Pattern HOST_PATTERN =
        Pattern.compile("(www\\.*)+(([a-z0-9_]|\\-)+\\.)+[a-z]{2,4}");

    /** Pause between two attempts to obtain the shared lock, in milliseconds. */
    private static final long LOCK_RETRY_MILLIS = 5000L;

    /** URL this worker downloads; stays null when no pending URL could be loaded. */
    URL url_a_baixar;
    /** Statement supplied by the creator; passed on to <code>coincidencies</code>. */
    Statement stmt = null;
    // The three result-set fields are kept for compatibility; this class now
    // uses short-lived local result sets instead of assigning them.
    ResultSet rsVistos = null;
    ResultSet rsPerVeure = null;
    ResultSet rs = null;
    /** Lock object shared with the other workers. */
    concurrencia conc;
    /** True once the page has been stored while holding the lock. */
    boolean control;
    Connection conn;
    int n_links=0; // number of links the current url has to all pages (own, and other)
    int n_links_other=0; // number of links that were new, i.e. not yet in perVeure nor vistes
    /** Id of the <code>vistes</code> row of the page being scanned. */
    int id_scanning;

    /**
     * Creates the worker and immediately selects the URL it will download.
     * The selection runs on the constructing thread, not on this thread.
     *
     * @param conc lock shared between workers
     * @param conn connection used for all queries issued by this worker
     * @param stmt statement forwarded to <code>coincidencies</code>; also used
     *             to obtain the connection when <code>conn</code> is null
     */
    public SpiderBaixaURL(concurrencia conc,Connection conn,Statement stmt)
    {
        this.conc=conc;
        this.conn=conn;
        this.stmt=stmt;
        try
        {
            primerABaixar();
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Looks up the id of the current page and processes it. Does nothing
     * (apart from a message) when no URL was selected at construction time.
     */
    public void run()
    {
        if(url_a_baixar==null)
        {
            System.out.println("No URL to download.");
            return;
        }
        try
        {
            // we get the id of the page we are scanning
            id_scanning();
            sense_zip(url_a_baixar);
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            // release result sets even when processing failed
            freeResultSets();
        }
    }

    /**
     * Downloads the page, stores it, extracts links and updates the link count.
     *
     * The page is fetched once; the same bytes feed both the raw-HTML string and
     * the <code>HTML2ASCII</code> conversion. When the download fails with an
     * <code>IOException</code> the URL is still marked as visited so it is not
     * retried.
     *
     * @param b URL to download
     */
    public void sense_zip(URL b)
    {
        try
        {
            System.out.println("Downloading Page..."+url_a_baixar.toString());
            byte[] pageBytes=downloadBytes(b);
            // decoded with the platform default charset, as the original reader did
            String html=new String(pageBytes);

            // typed as InputStream so the same HTML2ASCII constructor is selected
            InputStream pageStream=new ByteArrayInputStream(pageBytes);
            HTML2ASCII d=new HTML2ASCII(pageStream);
            StringBuffer plain=new StringBuffer();
            int ch;
            while((ch=d.read())!=-1)
            {
                plain.append((char)ch);
            }
            String ascii=plain.toString().toLowerCase();

            storePage(html,ascii);
            // NOTE(review): this literal looks mis-encoded (probably "memòria");
            // kept byte-for-byte because it is runtime output.
            System.out.println("Pagina en mem騬ia.");

            Matcher m=HOST_PATTERN.matcher(html);
            while(m.find())
            {
                String nova_adreca=m.group();
                // make sure the address starts with http://
                if(!nova_adreca.startsWith("http://"))
                {
                    nova_adreca="http://"+nova_adreca;
                }
                // register every address found
                afegirNovaURL(nova_adreca);
            }
            linkCounter();
        }
        catch(InterruptedException e)
        {
            // keep the interrupt visible to whoever manages this thread
            Thread.currentThread().interrupt();
        }
        catch(IOException e)
        {
            posarAVistos(url_a_baixar.toString());
            System.out.println("La URL "+url_a_baixar.toString()+" no existeix.");
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            closeQuietly(rs);
            rs=null;
        }
    }

    /**
     * Removes every match of a regular expression from a text
     * (used to strip formatting and script tags such as &lt;font...&gt;).
     *
     * @param pattern regular expression to delete
     * @param text    text to clean
     * @return the cleaned text, or <code>text</code> unchanged when the
     *         replacement fails (the failure is printed)
     */
    public String eliminaExpresion(String pattern,String text)
    {
        try
        {
            return text.replaceAll(pattern,"");
        }
        catch(Exception e)
        {
            System.out.println("ERROR: Replacing Pattern!");
            e.printStackTrace();
            return text;
        }
    }

    /**
     * Makes sure the given URL has a row in <code>vistes</code> and then removes
     * the current URL from <code>perVeure</code>.
     *
     * @param novaAdreca URL to mark as visited
     */
    public void posarAVistos(String novaAdreca)
    {
        try
        {
            // insert only when the URL is not recorded yet
            if(!urlExists("vistes",novaAdreca))
            {
                PreparedStatement insert=prepare("insert into vistes (url) values (?)");
                try
                {
                    insert.setString(1,novaAdreca);
                    insert.executeUpdate();
                }
                finally
                {
                    closeQuietly(insert);
                }
            }
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,false);
        }
        // remove it from the pending table
        eliminaPerVeure();
    }

    /** Deletes the current URL from the <code>perVeure</code> table. */
    public void eliminaPerVeure()
    {
        PreparedStatement delete=null;
        try
        {
            delete=prepare("DELETE FROM perVeure where url =?");
            delete.setString(1,url_a_baixar.toString());
            delete.executeUpdate();
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,true);
        }
        finally
        {
            closeQuietly(delete);
        }
    }

    /**
     * Loads the id of the current URL's <code>vistes</code> row into the
     * <code>id_scanning</code> field. The field is left untouched when no row
     * exists.
     */
    public void id_scanning()
    {
        PreparedStatement query=null;
        ResultSet row=null;
        try
        {
            query=prepare("SELECT id FROM vistes WHERE url=?");
            query.setString(1,url_a_baixar.toString());
            row=query.executeQuery();
            if(row.next())
            {
                id_scanning=row.getInt("id");
            }
            else
            {
                System.out.println("No id found in vistes for "+url_a_baixar.toString());
            }
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,true);
        }
        finally
        {
            closeQuietly(row);
            closeQuietly(query);
        }
    }

    /**
     * Stores <code>n_links - n_links_other</code> in the
     * <code>n_links_propis</code> column of the page being scanned.
     */
    public void linkCounter()
    {
        PreparedStatement update=null;
        try
        {
            update=prepare("UPDATE vistes SET n_links_propis=? WHERE id=?");
            update.setInt(1,n_links-n_links_other);
            update.setInt(2,id_scanning);
            update.executeUpdate();
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,true);
        }
        finally
        {
            closeQuietly(update);
        }
    }

    /**
     * Records a link from the page being scanned to <code>newLink</code>.
     *
     * Nothing is stored when the target has no <code>vistes</code> row yet (a
     * message is printed) or when the target is the page itself. Otherwise a new
     * <code>links</code> row is inserted whose <code>num</code> is the number of
     * rows already stored for that pair plus one.
     *
     * @param newLink URL the current page links to
     */
    public void linkA(String newLink)
    {
        PreparedStatement query=null;
        ResultSet rows=null;
        try
        {
            // have we already visited the page?
            query=prepare("SELECT id FROM vistes WHERE url=?");
            query.setString(1,newLink);
            rows=query.executeQuery();
            if(!rows.next())
            {
                // target not visited yet, so there is no id to link to
                System.out.println("Link a: "+newLink+" no ha estat possible");
                return;
            }
            int id_scanned=rows.getInt("id");
            closeQuietly(rows);
            rows=null;
            closeQuietly(query);
            query=null;

            if(id_scanned==id_scanning)
            {
                return;
            }

            // how many times have we linked to this page
            int n_times=1;
            query=prepare("SELECT COUNT(*) FROM links WHERE pagina_els_rep=? AND pagina_envia=?");
            query.setInt(1,id_scanned);
            query.setInt(2,id_scanning);
            rows=query.executeQuery();
            if(rows.next())
            {
                n_times=rows.getInt(1)+1;
            }
            closeQuietly(rows);
            rows=null;
            closeQuietly(query);
            query=null;

            query=prepare("insert into links (pagina_els_rep,pagina_envia,num) values (?,?,?)");
            query.setInt(1,id_scanned);
            query.setInt(2,id_scanning);
            query.setInt(3,n_times);
            query.executeUpdate();
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,true);
        }
        finally
        {
            closeQuietly(rows);
            closeQuietly(query);
        }
    }

    /**
     * Registers a link found on the current page: records the link, counts it,
     * and queues the URL in <code>perVeure</code> when it is in neither
     * <code>perVeure</code> nor <code>vistes</code>.
     *
     * @param novaAdreca URL found on the page
     */
    public void afegirNovaURL(String novaAdreca)
    {
        linkA(novaAdreca);
        n_links++;
        if(hiEs(novaAdreca,"perVeure")||hiEs(novaAdreca,"vistes"))
        {
            return;
        }
        n_links_other++;
        System.out.println("AFEGIDA "+novaAdreca);
        PreparedStatement insert=null;
        try
        {
            // add the new URL to the pending table
            insert=prepare("insert into perVeure (url) values (?)");
            insert.setString(1,novaAdreca);
            insert.executeUpdate();
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,false);
        }
        finally
        {
            closeQuietly(insert);
        }
    }

    /**
     * Selects the pending URL with the lowest id, stores it in
     * <code>url_a_baixar</code> and marks it as visited. Leaves
     * <code>url_a_baixar</code> null when the pending table is empty.
     */
    public void primerABaixar()
    {
        PreparedStatement query=null;
        ResultSet pending=null;
        try
        {
            // Ascending order limited to one row yields the same row the original
            // reached with "ORDER BY id DESC" + last(), without loading the table.
            query=prepare("SELECT * FROM perVeure ORDER BY id ASC");
            query.setMaxRows(1);
            pending=query.executeQuery();
            if(!pending.next())
            {
                System.out.println("No pending URLs in perVeure.");
                return;
            }
            // column 2 is read by position, as before
            url_a_baixar=new URL(pending.getString(2));
            closeQuietly(pending);
            pending=null;
            closeQuietly(query);
            query=null;
            System.out.println("A Baixar!: "+url_a_baixar.toString());
            posarAVistos(url_a_baixar.toString());
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,false);
        }
        catch(Exception ex)
        {
            ex.printStackTrace();
        }
        finally
        {
            closeQuietly(pending);
            closeQuietly(query);
        }
    }

    /**
     * Tells whether a URL is present in a table.
     *
     * @param url   URL to look for (bound as a parameter)
     * @param taula table name; it is concatenated into the SQL text because
     *              identifiers cannot be bound, so it must come from trusted code
     * @return true when at least one row matches; false when none matches or the
     *         query fails (the failure is printed)
     */
    public boolean hiEs(String url,String taula)
    {
        try
        {
            return urlExists(taula,url);
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,false);
            return false;
        }
    }

    /**
     * Writes the downloaded HTML into the third column of the
     * <code>vistes</code> row of the given URL.
     *
     * @param a   HTML text to store
     * @param url URL whose row is updated
     */
    public void afegirHTML(String a,String url)
    {
        PreparedStatement query=null;
        ResultSet row=null;
        try
        {
            // The target column is only known by position, so an updatable
            // result set is requested explicitly instead of relying on how the
            // shared statement happened to be created.
            query=connection().prepareStatement(
                "SELECT * FROM vistes WHERE url =?",
                ResultSet.TYPE_FORWARD_ONLY,
                ResultSet.CONCUR_UPDATABLE);
            query.setString(1,url);
            row=query.executeQuery();
            if(row.next())
            {
                row.updateString(3,a);
                row.updateRow(); // updates the row in the data source
            }
            else
            {
                System.out.println("No row found in vistes for "+url);
            }
        }
        catch(SQLException ex)
        {
            reportSqlError(ex,false);
        }
        finally
        {
            closeQuietly(row);
            closeQuietly(query);
        }
    }

    /** Closes and clears the three result-set fields. */
    public void freeResultSets()
    {
        closeQuietly(rs);
        rs=null;
        closeQuietly(rsPerVeure);
        rsPerVeure=null;
        closeQuietly(rsVistos);
        rsVistos=null;
    }

    /**
     * Counts the rows remaining in a result set by advancing it to the end.
     *
     * @param rs result set to consume
     * @return number of rows stepped over; rows counted so far when an error
     *         occurs (the error is printed)
     */
    public int countItems(ResultSet rs)
    {
        int total=0;
        try
        {
            while(rs.next())
            {
                total++;
            }
        }
        catch(Exception e)
        {
            System.out.println("ERROR: while counting items");
            e.printStackTrace();
        }
        return total;
    }

    /**
     * Stores the page while holding the shared lock, polling until the lock is free.
     * NOTE(review): isLocked() followed by lock() is a check-then-act sequence;
     * whether two workers can both pass the check depends on how concurrencia
     * implements lock() - confirm.
     */
    private void storePage(String html,String ascii) throws Exception
    {
        control=false;
        while(!control)
        {
            System.out.print(".");
            if(!conc.isLocked())
            {
                conc.lock();
                try
                {
                    afegirHTML(html,url_a_baixar.toString());
                    new coincidencies(url_a_baixar,ascii,html,conn,stmt);
                    control=true;
                }
                finally
                {
                    // always release, otherwise a failure here blocks every worker
                    conc.unLock();
                }
            }
            else
            {
                System.out.print(".");
                Thread.sleep(LOCK_RETRY_MILLIS);
            }
        }
    }

    /** Reads the whole content behind a URL into memory and closes the stream. */
    private static byte[] downloadBytes(URL source) throws IOException
    {
        InputStream in=source.openStream();
        try
        {
            ByteArrayOutputStream out=new ByteArrayOutputStream();
            byte[] chunk=new byte[8192];
            int read;
            while((read=in.read(chunk))!=-1)
            {
                out.write(chunk,0,read);
            }
            return out.toByteArray();
        }
        finally
        {
            try
            {
                in.close();
            }
            catch(IOException ignored)
            {
                // nothing useful can be done if closing the download stream fails
            }
        }
    }

    /** Returns true when the given table holds at least one row with that url. */
    private boolean urlExists(String taula,String url) throws SQLException
    {
        PreparedStatement query=null;
        ResultSet rows=null;
        try
        {
            query=prepare("SELECT * FROM "+taula+" WHERE url=?");
            query.setString(1,url);
            rows=query.executeQuery();
            return rows.next();
        }
        finally
        {
            closeQuietly(rows);
            closeQuietly(query);
        }
    }

    /** Returns the connection to use, falling back to the statement's own connection. */
    private Connection connection() throws SQLException
    {
        return (conn!=null)?conn:stmt.getConnection();
    }

    /** Prepares a statement on this worker's connection. */
    private PreparedStatement prepare(String sql) throws SQLException
    {
        return connection().prepareStatement(sql);
    }

    /** Closes a result set, ignoring null and close failures. */
    private static void closeQuietly(ResultSet resultSet)
    {
        if(resultSet!=null)
        {
            try
            {
                resultSet.close();
            }
            catch(SQLException ignored)
            {
                // best-effort cleanup
            }
        }
    }

    /** Closes a statement, ignoring null and close failures. */
    private static void closeQuietly(Statement statement)
    {
        if(statement!=null)
        {
            try
            {
                statement.close();
            }
            catch(SQLException ignored)
            {
                // best-effort cleanup
            }
        }
    }

    /** Prints the details of an SQL failure, optionally followed by its stack trace. */
    private static void reportSqlError(SQLException ex,boolean withTrace)
    {
        System.out.println("SQLException: " + ex.getMessage());
        System.out.println("SQLState: " + ex.getSQLState());
        System.out.println("VendorError: " + ex.getErrorCode());
        if(withTrace)
        {
            ex.printStackTrace();
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -