coincidencies.java

来自「一款很好的网络爬虫软件」· Java 代码 · 共 252 行
JAVA
252 行
import java.util.regex.*;
import java.net.*;
import java.sql.*;

public class coincidencies
{
	static Statement stmt = null; 
	static ResultSet rsPerVeure = null; 
	String paraula="";
	Connection conn;
	

	/**
	 * Indexes the words of one page into the database.
	 *
	 * Steps, all executed inside the constructor:
	 * <ol>
	 * <li>every spelling of the non-breaking-space entity in <code>ascii</code>
	 *     is replaced by a plain space;</li>
	 * <li>the id of the page is read from table <code>vistes</code> (column
	 *     <code>url</code>), inserting the row first if it is missing;</li>
	 * <li>every <code>\w+</code> word of <code>ascii</code>, lower-cased, is looked
	 *     up in (or added to) table <code>paraules</code> and its counter
	 *     <code>n_coincidencies</code> in table <code>coincidencies</code> is
	 *     created with value 1 or incremented by one;</li>
	 * <li><code>comprovaCoincidencies</code> is run on <code>textSencer</code> for
	 *     the &lt;b&gt; and &lt;h1&gt; patterns.</li>
	 * </ol>
	 *
	 * Errors are reported on standard output / standard error and never
	 * propagate to the caller; the shared result set is released in every case.
	 *
	 * @param url        address of the page being indexed
	 * @param ascii      text of the page whose words are counted
	 * @param textSencer text handed to <code>comprovaCoincidencies</code>
	 * @param conn       connection stored in the <code>conn</code> field
	 * @param stmt       statement used for the queries; it must produce updatable
	 *                   result sets because existing counters are modified through
	 *                   <code>ResultSet.updateRow()</code>
	 */
	public coincidencies(URL url,String ascii,String textSencer,Connection conn,Statement stmt)
	{
		try
		{
			// "&nbsp;", "&nbsp", "nbsp;" and "nbsp" all become a single space
			ascii=ascii.replaceAll("&?nbsp;?"," ");

			// The JDBC driver is not loaded here any more: a working Statement is
			// handed in, so the driver is necessarily registered already, and the
			// hard-coded driver class name aborted the indexing when it was absent.
			this.conn=conn;
			coincidencies.stmt=stmt;

			int id_pagina=cercaIdPagina(url,stmt);

			// start parsing: one iteration per word of the page
			Matcher m=Pattern.compile("\\w+").matcher(ascii);
			while(m.find())
			{
				paraula=m.group().toLowerCase();

				// max length of a word declared on the database: longer words
				// are never stored, so no query is issued for them
				if(paraula.length()>=50)
					continue;

				int id_paraula=cercaIdParaula(stmt,paraula);
				registraCoincidencia(stmt,id_paraula,id_pagina);
			}

			// NOTE(review): comprovaCoincidencies lower-cases its text before
			// matching, so the upper-case patterns look like they can never
			// match - confirm before removing them.
			comprovaCoincidencies(textSencer,id_pagina,"<b>.+</b>");
			comprovaCoincidencies(textSencer,id_pagina,"<B>.+</B>");
			comprovaCoincidencies(textSencer,id_pagina,"<h1>.+</h1>");
			comprovaCoincidencies(textSencer,id_pagina,"<H1>.+</H1>");
		}
		catch(SQLException ex)
		{
			// handle any errors
			System.out.println("SQLException: " + ex.getMessage());
			System.out.println("SQLState: " + ex.getSQLState());
			System.out.println("VendorError: " + ex.getErrorCode());
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			// release the shared result set on the error paths too
			freeResultSets();
		}
	}

	/**
	 * Returns the id of the row of table vistes whose url equals the given URL,
	 * inserting the row first when it does not exist yet.
	 *
	 * The URL comes from crawled pages, so it is bound as a statement parameter
	 * instead of being concatenated into the SQL text (a quote inside the URL
	 * used to break the statement). The prepared statements are created on the
	 * connection that owns <code>stmt</code>.
	 *
	 * @throws SQLException on any database error, or when the row cannot be
	 *                      read back after the insert
	 */
	private int cercaIdPagina(URL url,Statement stmt) throws SQLException
	{
		String adreca=url.toString();
		Connection connexio=stmt.getConnection();

		PreparedStatement consulta=connexio.prepareStatement("SELECT id FROM vistes WHERE url=?");
		try
		{
			consulta.setString(1,adreca);
			ResultSet files=consulta.executeQuery();
			if(!files.next())
			{
				// if the program runs well this part never has to run:
				// the page is not known yet, so it is created now
				PreparedStatement alta=connexio.prepareStatement("INSERT INTO vistes (url) VALUES (?)");
				try
				{
					alta.setString(1,adreca);
					alta.executeUpdate();
				}
				finally
				{
					alta.close();
				}

				// re-executing the query closes the previous result set
				files=consulta.executeQuery();
				if(!files.next())
					throw new SQLException("No row found in vistes after inserting url "+adreca);
			}
			return files.getInt(1);
		}
		finally
		{
			// closing the statement also closes its result set
			consulta.close();
		}
	}

	/**
	 * Returns the id of the given word in table paraules, inserting the word
	 * first when it is not there.
	 *
	 * The word is always a lower-cased <code>\w+</code> match, so it cannot
	 * contain a quote and is safe inside the SQL literal.
	 *
	 * @throws SQLException on any database error, or when the row cannot be
	 *                      read back after the insert
	 */
	private int cercaIdParaula(Statement stmt,String paraula) throws SQLException
	{
		String consulta="SELECT id FROM paraules WHERE paraula='"+paraula+"'";

		rsPerVeure=stmt.executeQuery(consulta);
		if(!rsPerVeure.next())
		{
			// unknown word: create it and read its new id back
			stmt.execute("INSERT INTO paraules (paraula) VALUES ('"+paraula+"')");
			rsPerVeure=stmt.executeQuery(consulta);
			if(!rsPerVeure.next())
				throw new SQLException("No row found in paraules after inserting word "+paraula);
		}
		return rsPerVeure.getInt(1);
	}

	/**
	 * Adds one occurrence of a word on a page: the matching row of table
	 * coincidencies gets n_coincidencies incremented by one, or a new row with
	 * n_coincidencies = 1 is inserted when there is none.
	 *
	 * @throws SQLException on any database error
	 */
	private void registraCoincidencia(Statement stmt,int id_paraula,int id_pagina) throws SQLException
	{
		rsPerVeure=stmt.executeQuery("SELECT * FROM coincidencies WHERE paraula='"+id_paraula+"' AND pagina='"+id_pagina+"'");

		if(rsPerVeure.next())
		{
			// already there: increase the counter in place through the result set
			int n_coincidencies=rsPerVeure.getInt("n_coincidencies")+1;
			rsPerVeure.updateString("n_coincidencies",String.valueOf(n_coincidencies));
			rsPerVeure.updateRow();
		}
		else
			stmt.execute("INSERT INTO coincidencies (paraula,pagina,n_coincidencies) VALUES ('"+id_paraula+"','"+id_pagina+"','1')");
	}
	
	/**
	 * Closes the shared result set <code>rsPerVeure</code> when there is one.
	 *
	 * A failure while closing is ignored on purpose. The statement
	 * <code>stmt</code> is left open: the code that used to close it and reset
	 * it to null is disabled.
	 */
	public void freeResultSets()
	{
		if (rsPerVeure == null)
		{
			return;
		}

		try
		{
			rsPerVeure.close();
		}
		catch (SQLException sqlEx)
		{
			// ignore
		}
	}
		
		
		/**
		 * Counts rows by calling <code>next()</code> on the result set until it
		 * returns false.
		 *
		 * The cursor is not moved back afterwards, so a caller that wants to read
		 * a row has to reposition the result set itself. Any exception raised
		 * while iterating is printed and the number of rows counted up to that
		 * point is returned.
		 *
		 * @param rs result set to walk through
		 * @return how many times <code>next()</code> returned true
		 */
		public int countItems(ResultSet rs)
		{
			int files=0;

			try
			{
				for(boolean hiHaFila=rs.next();hiHaFila;hiHaFila=rs.next())
				{
					files++;
				}
			}
			catch(Exception e)
			{
				System.out.println("ERROR: while counting items");
				e.printStackTrace();
			}

			return files;
		}
		
		//this method counts the words of a web page
		/**
		 * Returns how many words the given text contains, a word being a
		 * maximal run of characters matched by the regular expression
		 * <code>\w+</code>.
		 *
		 * @param ascii text to scan
		 * @return number of <code>\w+</code> matches found in the text
		 */
		public int wordCount(String ascii)
		{
			Matcher cercador=Pattern.compile("\\w+").matcher(ascii);
			int total=0;

			while(cercador.find())
			{
				total++;
			}

			return total;
		}

		//we find the words that are between <b></b>, <h1></h1> and <u></u> tags
		//if we find such words we increase the value of n_coincidencies in the table coincidencies
		//this way we give more importance to the words that are between <b></b>, <h1></h1> and <u></u> tags
		//
		//only words that are between tags and
		//shorter than 20 characters (not the whole page) are considered
		public void comprovaCoincidencies(String text,int id_pagina,String pattern)
		{
			Pattern p,p1;
			Matcher m,m1;
			boolean resultado;
			String paraula="",subParaula="";
			int id_paraula=0,coincidencies=0;
			
			try
			{
				text=text.toLowerCase();
				p = Pattern.compile(pattern);
			//	p1 = Pattern.compile("<b>.+[</b>]");
				m = p.matcher(text);
				
				resultado=m.find();
	
				while(resultado)
				{
					paraula= new String(text.subSequence(m.start(),m.end()).toString());
					paraula=paraula.replaceAll("\\<.*?\\>","");//delete <b> </b> tags
					paraula=paraula.replaceAll("!","");//delete the signs that won't be indexed
					paraula=paraula.replaceAll("
coincidencies.java - 源码说明

本页面展示了「一款很好的网络爬虫软件」中的 coincidencies.java 源码文件，采用 Java 编程语言编写，共 252 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与网络爬虫相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?