📄 coincidencies.java
字号:
import java.util.regex.*;
import java.net.*;
import java.sql.*;
public class coincidencies
{
// Statement handed to the constructor. It is static, so it is shared by every
// instance and overwritten each time a new instance is constructed.
static Statement stmt = null;
// Shared (static) result-set handle; freeResultSets() closes it when it is non-null.
static ResultSet rsPerVeure = null;
// Most recent word extracted from the page text by the constructor.
String paraula="";
// Database connection handed to the constructor.
Connection conn;
/** Sentinel returned by the id lookups when no matching row exists. */
private static final int ID_NOT_FOUND = -1;

/** Words of this length or longer exceed the word column declared in the database and are skipped. */
private static final int MAX_WORD_LENGTH = 50;

/**
 * Indexes one page: makes sure the page has a row in {@code vistes}, then, for every
 * word ({@code \w+} token) of {@code ascii}, makes sure the word has a row in
 * {@code paraules} and creates or increments the matching (word, page) counter in
 * {@code coincidencies}. Finally the raw page text is handed to
 * {@link #comprovaCoincidencies(String, int, String)} for the bold and h1 tag patterns.
 *
 * <p>All queries use {@link PreparedStatement} with bound parameters, so the URL and
 * the words are never spliced into the SQL text. Errors are reported on standard
 * output / standard error and abort the indexing of this page; they are not rethrown.
 *
 * @param url        address of the page being indexed
 * @param ascii      plain text of the page, used for tokenizing
 * @param textSencer full page text, passed on to {@code comprovaCoincidencies}
 * @param conn       open database connection; required, all queries here run on it
 * @param stmt       statement stored in the shared static field {@code stmt}
 *                   (kept for code outside this constructor)
 */
public coincidencies(URL url,String ascii,String textSencer,Connection conn,Statement stmt)
{
    try
    {
        // Strip what is left of HTML non-breaking-space entities so that "nbsp"
        // is not indexed as a word.
        ascii = ascii.replaceAll("nbsp;", " ");
        ascii = ascii.replaceAll("nbsp", " ");

        this.conn = conn;
        coincidencies.stmt = stmt;

        // Look the page up by URL, creating its row when it is not known yet.
        int id_pagina = findOrCreateId(conn,
                "SELECT id FROM vistes WHERE url = ?",
                "INSERT INTO vistes (url) VALUES (?)",
                url.toString());

        // Start parsing: one iteration per word of the page.
        Matcher m = Pattern.compile("\\w+").matcher(ascii);
        while (m.find())
        {
            paraula = m.group().toLowerCase();
            if (paraula.length() >= MAX_WORD_LENGTH)
            {
                // Too long for the database column: never stored, never counted.
                continue;
            }
            int id_paraula = findOrCreateId(conn,
                    "SELECT id FROM paraules WHERE paraula = ?",
                    "INSERT INTO paraules (paraula) VALUES (?)",
                    paraula);
            registerCoincidence(conn, id_paraula, id_pagina);
        }

        comprovaCoincidencies(textSencer,id_pagina,"<b>.+</b>");
        comprovaCoincidencies(textSencer,id_pagina,"<B>.+</B>");
        comprovaCoincidencies(textSencer,id_pagina,"<h1>.+</h1>");
        comprovaCoincidencies(textSencer,id_pagina,"<H1>.+</H1>");
    }
    catch(SQLException ex)
    {
        // Report database errors and give up on this page.
        System.out.println("SQLException: " + ex.getMessage());
        System.out.println("SQLState: " + ex.getSQLState());
        System.out.println("VendorError: " + ex.getErrorCode());
    }
    catch(Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // Release the shared result set on the error paths as well.
        freeResultSets();
    }
}

/** Returns the id selected by {@code selectSql} for {@code key}, inserting the row with {@code insertSql} first when it is missing. */
private static int findOrCreateId(Connection conn, String selectSql, String insertSql, String key) throws SQLException
{
    int id = selectId(conn, selectSql, key);
    if (id != ID_NOT_FOUND)
    {
        return id;
    }
    insertKey(conn, insertSql, key);
    id = selectId(conn, selectSql, key);
    if (id == ID_NOT_FOUND)
    {
        throw new SQLException("Row not found after insert: " + key);
    }
    return id;
}

/** Runs a one-parameter SELECT and returns the first column of the first row, or {@code ID_NOT_FOUND} when there is no row. */
private static int selectId(Connection conn, String sql, String key) throws SQLException
{
    PreparedStatement ps = conn.prepareStatement(sql);
    try
    {
        ps.setString(1, key);
        ResultSet rs = ps.executeQuery();
        try
        {
            return rs.next() ? rs.getInt(1) : ID_NOT_FOUND;
        }
        finally
        {
            rs.close();
        }
    }
    finally
    {
        ps.close();
    }
}

/** Runs a one-parameter INSERT with {@code key} bound as a string. */
private static void insertKey(Connection conn, String sql, String key) throws SQLException
{
    PreparedStatement ps = conn.prepareStatement(sql);
    try
    {
        ps.setString(1, key);
        ps.executeUpdate();
    }
    finally
    {
        ps.close();
    }
}

/** Runs an INSERT/UPDATE that takes exactly two integer parameters. */
private static void runIntUpdate(Connection conn, String sql, int first, int second) throws SQLException
{
    PreparedStatement ps = conn.prepareStatement(sql);
    try
    {
        ps.setInt(1, first);
        ps.setInt(2, second);
        ps.executeUpdate();
    }
    finally
    {
        ps.close();
    }
}

/** Adds one occurrence of the word to the page: increments the existing counter row or creates it with a count of 1. */
private static void registerCoincidence(Connection conn, int idParaula, int idPagina) throws SQLException
{
    boolean found = false;
    int rowId = 0;
    int count = 0;

    PreparedStatement select = conn.prepareStatement(
            "SELECT id, n_coincidencies FROM coincidencies WHERE paraula = ? AND pagina = ?");
    try
    {
        select.setInt(1, idParaula);
        select.setInt(2, idPagina);
        ResultSet rs = select.executeQuery();
        try
        {
            if (rs.next())
            {
                found = true;
                rowId = rs.getInt(1);
                count = rs.getInt(2);
            }
        }
        finally
        {
            rs.close();
        }
    }
    finally
    {
        select.close();
    }

    if (found)
    {
        // Already counted for this page: bump the counter of the first matching row.
        runIntUpdate(conn, "UPDATE coincidencies SET n_coincidencies = ? WHERE id = ?", count + 1, rowId);
    }
    else
    {
        runIntUpdate(conn, "INSERT INTO coincidencies (paraula,pagina,n_coincidencies) VALUES (?,?,1)", idParaula, idPagina);
    }
}
/**
 * Closes the shared result set {@code rsPerVeure} if one is set.
 * A failure while closing is ignored on purpose (best-effort cleanup).
 * The shared statement is left open: the code that used to close it is disabled.
 */
public void freeResultSets()
{
    if (rsPerVeure == null)
    {
        return;
    }
    try
    {
        rsPerVeure.close();
    }
    catch (SQLException ignored)
    {
        // ignore: nothing useful can be done if closing fails
    }
}
/**
 * Counts the rows of a result set by advancing its cursor until {@code next()}
 * returns false; the cursor is therefore left past the last row.
 * Any exception (including a null {@code rs}) is reported on standard output /
 * standard error and the number of rows counted so far is returned.
 *
 * @param rs result set to walk through
 * @return number of rows visited before the end or before an error
 */
public int countItems(ResultSet rs)
{
    int total = 0;
    try
    {
        for (boolean more = rs.next(); more; more = rs.next())
        {
            total++;
        }
    }
    catch (Exception e)
    {
        System.out.println("ERROR: while counting items");
        e.printStackTrace();
    }
    return total;
}
/** Precompiled pattern for one word: a run of letters, digits or underscores. */
private static final Pattern WORD_COUNT_PATTERN = Pattern.compile("\\w+");

/**
 * Counts the words of a web page, a word being a maximal run of {@code \w}
 * characters (letters, digits, underscore).
 *
 * @param ascii text of the page; {@code null} is treated as an empty page
 * @return number of words found, 0 for {@code null} or text without words
 */
public int wordCount(String ascii)
{
    if (ascii == null)
    {
        return 0;
    }
    int words = 0;
    Matcher m = WORD_COUNT_PATTERN.matcher(ascii);
    while (m.find())
    {
        words++;
    }
    return words;
}
//we find the words that are between <b></b> and <h1></h1> and <u></u>
//if we find such words we increase the value of n_coincidencies in the table coincidencies;
//this gives more importance to the words that are between the tags <b></b> and <h1></h1> and <u></u>
//
//words that are between tags and:
//length less than 20 characters (not the whole page)
public void comprovaCoincidencies(String text,int id_pagina,String pattern)
{
Pattern p,p1;
Matcher m,m1;
boolean resultado;
String paraula="",subParaula="";
int id_paraula=0,coincidencies=0;
try
{
text=text.toLowerCase();
p = Pattern.compile(pattern);
// p1 = Pattern.compile("<b>.+[</b>]");
m = p.matcher(text);
resultado=m.find();
while(resultado)
{
paraula= new String(text.subSequence(m.start(),m.end()).toString());
paraula=paraula.replaceAll("\\<.*?\\>","");//delete <b> </b> tags
paraula=paraula.replaceAll("!","");//delete the signs that won't be indexed
paraula=paraula.replaceAll("
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -