⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parser.java

📁 一个简单网络搜索引擎了
💻 JAVA
字号:
import   java.lang.*;
import   java.net.*;
import   java.sql.*;
import   java.io.*;   
import   java.sql.*;
import   java.util.*;   
import   javax.swing.text.*;   
import   javax.swing.text.html.*;   
import   javax.swing.text.html.parser.*; 
import   javax.swing.text.html.HTMLEditorKit.ParserCallback;

import com.sun.org.apache.xalan.internal.xsltc.compiler.sym;
/**
 * HTML parser callback that collects short absolute links, plus a small
 * two-level crawler ({@link #main}) that stores the collected links in the
 * {@code url1} database table.
 *
 * <p>Links are accumulated in the static {@link #element} list, which is
 * shared by every instance of this class.</p>
 */
public class Parser extends ParserCallback
{
    /** A link is collected only when its length is strictly below this value. */
    private static final int MAX_URL_LENGTH = 30;

    /** Character set used to decode every downloaded page. */
    private static final String PAGE_CHARSET = "GBK";

    /** Page whose links form the first crawl level. */
    private static final String START_URL = "http://www.sohu.com";

    /** Distinct links found so far, in discovery order; shared by all instances. */
    protected static ArrayList<String> element = new ArrayList<String>();

    /** Creates a callback that appends to the shared {@link #element} list. */
    public Parser()
    {
    }

    /**
     * Treats a simple (self-closing) tag exactly like a start tag.
     *
     * @param t   the tag that was encountered
     * @param a   the attributes of the tag
     * @param pos the position of the tag in the document
     */
    @Override
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        handleStartTag(t, a, pos);
    }

    /**
     * Records the {@code href} of an anchor tag in {@link #element} when it
     * starts with {@code "http"}, is shorter than {@value #MAX_URL_LENGTH}
     * characters and has not been recorded before. Every other tag, a missing
     * attribute set and a non-string {@code href} value are ignored.
     *
     * @param t   the tag that was encountered
     * @param a   the attributes of the tag; may be {@code null}
     * @param pos the position of the tag in the document (unused)
     */
    @Override
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        if (t != HTML.Tag.A || a == null)
        {
            return;
        }
        Object value = a.getAttribute(HTML.Attribute.HREF);
        if (!(value instanceof String))
        {
            return;
        }
        String href = (String) value;
        if (href.startsWith("http") && href.length() < MAX_URL_LENGTH && !element.contains(href))
        {
            element.add(href);
        }
    }

    /**
     * Parses the given HTML text with a fresh {@code Parser} callback, so that
     * the links it contains end up in {@link #element}. Any failure is printed
     * and otherwise ignored.
     *
     * @param sHtml the HTML text to parse
     */
    private static void startParse(String sHtml)
    {
        try
        {
            ParserDelegator delegator = new ParserDelegator();
            HTMLEditorKit.ParserCallback callback = new Parser();
            delegator.parse(new StringReader(sHtml), callback, true);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Returns the links found in one page as an independent list.
     *
     * <p>{@link #element} is static, so it is emptied before and after the
     * parse and the result is copied out; handing out the shared list itself
     * would let a later {@code clear()} wipe the caller's data as well.</p>
     *
     * @param html the HTML text of the page
     * @return a new list holding the links collected from {@code html}
     */
    private static ArrayList<String> extractLinks(String html)
    {
        element.clear();
        startParse(html);
        ArrayList<String> links = new ArrayList<String>(element);
        element.clear();
        return links;
    }

    /**
     * Creates (without yet connecting) an HTTP connection to the given address.
     *
     * @param address the absolute URL to open
     * @return the connection object for {@code address}
     * @throws IOException if the address is malformed or cannot be opened
     */
    private static HttpURLConnection openConnection(String address) throws IOException
    {
        return (HttpURLConnection) new URL(address).openConnection();
    }

    /**
     * Reads the complete response body of a connection as text.
     *
     * @param connection the connection to read from
     * @return every line of the body, each terminated by {@code '\n'}
     * @throws IOException if the body cannot be read or decoded
     */
    private static String readPage(HttpURLConnection connection) throws IOException
    {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
        try
        {
            StringBuilder page = new StringBuilder();
            String line;
            // readLine() is called exactly once per iteration so no line is skipped.
            while ((line = reader.readLine()) != null)
            {
                page.append(line).append('\n');
            }
            return page.toString();
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * Prints every URL and inserts it through the given one-parameter statement.
     *
     * @param insert the prepared insert statement
     * @param urls   the URLs to store
     * @throws SQLException if an insert fails
     */
    private static void insertAll(PreparedStatement insert, Collection<String> urls) throws SQLException
    {
        for (String url : urls)
        {
            System.out.println(url);
            insert.setString(1, url);
            insert.executeUpdate();
        }
    }

    /**
     * Loads the first column of every row of table {@code url1}.
     *
     * @param stmt the statement used to run the query
     * @return the values read, in result-set order
     * @throws SQLException if the query fails
     */
    private static ArrayList<String> loadStoredUrls(Statement stmt) throws SQLException
    {
        ArrayList<String> stored = new ArrayList<String>();
        ResultSet rows = stmt.executeQuery("select * from url1");
        try
        {
            while (rows.next())
            {
                stored.add(rows.getString(1));
            }
        }
        finally
        {
            rows.close();
        }
        return stored;
    }

    /**
     * Closes a statement, printing rather than propagating any failure.
     *
     * @param statement the statement to close; may be {@code null}
     */
    private static void closeQuietly(Statement statement)
    {
        if (statement == null)
        {
            return;
        }
        try
        {
            statement.close();
        }
        catch (SQLException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Reader for a local file with an explicitly named character set, so that
     * Chinese text is decoded correctly (for example a saved page read as
     * {@code new FileReader2("d:\\1.htm", "UTF-8")}).
     */
    static class FileReader2 extends InputStreamReader
    {
        /**
         * @param fileName    path of the file to read
         * @param charSetName name of the character set used to decode the file
         * @throws FileNotFoundException        if the file cannot be opened
         * @throws UnsupportedEncodingException if the character set is not supported
         */
        public FileReader2(String fileName, String charSetName) throws FileNotFoundException, UnsupportedEncodingException
        {
            super(new FileInputStream(fileName), charSetName);
        }
    }

    /**
     * Runs the two-level crawl:
     * <ol>
     *   <li>download the start page and insert its links into {@code url1};</li>
     *   <li>read every URL stored in {@code url1} and download each one,
     *       gathering the distinct links found on those pages;</li>
     *   <li>insert the gathered second-level links into {@code url1}.</li>
     * </ol>
     * A second-level page that cannot be downloaded is reported and skipped.
     * Any other failure is printed and ends the run; the database resources
     * are released in every case.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; failures are caught and printed
     */
    public static void main(String args[]) throws Exception
    {
        String dbUrl = "jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=TESTDB";
        String user = "sa";      // replace with your own database user name
        String password = "";    // replace with your own database password
        Connection con = null;
        Statement stmt = null;
        PreparedStatement psInsert = null;
        try
        {
            Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
            System.out.println( "类实例化成功!" );
            con = DriverManager.getConnection(dbUrl, user, password);
            System.out.println( "创建连接对像成功!" );
            stmt = con.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_UPDATABLE);
            psInsert = con.prepareStatement("Insert  INTO url1  Values (?)");

            // Level 1: links on the start page.
            HttpURLConnection uc = openConnection(START_URL);
            System.out.println("openConnection connect sucessful");
            ArrayList<String> link = extractLinks(readPage(uc));
            System.out.println("第一次清空成功");
            insertAll(psInsert, link);

            ArrayList<String> as = loadStoredUrls(stmt);
            System.out.println("第一次插入成功");
            System.out.println("最原始的是已经解析出来");
            System.out.println("已经导入放到AS容器中");
            System.out.println("已把element中的数据清空");

            // Level 2: links on every stored page, de-duplicated in discovery order.
            Set<String> allnum = new LinkedHashSet<String>();
            for (int i = 0; i < as.size(); i++)
            {
                String s3 = as.get(i);
                System.out.println("i等于"+i);
                System.out.println(s3);
                try
                {
                    HttpURLConnection uc2 = openConnection(s3);
                    System.out.println("openConnection1 connect sucessful");
                    ArrayList<String> link1 = extractLinks(readPage(uc2));
                    System.out.println("第二次解析成功");
                    allnum.addAll(link1);
                }
                catch (IOException e)
                {
                    // One dead or malformed link must not abort the whole crawl.
                    System.err.println("skipping " + s3 + ": " + e);
                }
                System.out.println("link已经清空"+i);
                System.out.println("element已经清空");
                System.out.println("link已经清空");
            }
            insertAll(psInsert, allnum);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            closeQuietly(psInsert);
            closeQuietly(stmt);
            if (con != null)
            {
                try
                {
                    con.close();
                }
                catch (SQLException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

}
//http://java.chinaitlab.com/base/732677.html

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -