readhtml.java

来自「此源代码是实现一个网页净化的功能」· Java 代码 · 共 143 行

JAVA

143 行

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.sun.org.apache.xerces.internal.dom.DocumentImpl;

/**
 * Small "web page purification" demo.
 *
 * <p>{@link #main(String[])} downloads a fixed news page, strips its markup,
 * then parses the local file {@code index.xml} and prints the text of its
 * first {@code <title>} element followed by the concatenated text found
 * directly inside every {@code <p>} element.
 */
public class ReadHtml
{
    /** Page downloaded by {@link #main(String[])}. */
    private static final String PAGE_URL =
            "http://news.hit.edu.cn/articles/2008/11-27/11105934.htm";

    /** XML file (relative to the working directory) parsed by {@link #main(String[])}. */
    private static final String XML_FILE = "index.xml";

    /** Length of the {@code charset=} prefix looked for in a Content-Type parameter. */
    private static final int CHARSET_PREFIX_LENGTH = "charset=".length();

    /** Creates a new instance; the class holds no state. */
    public ReadHtml()
    {
    }

    /**
     * Program entry point. Any failure (network, I/O, XML parsing) is reported
     * by printing the stack trace; nothing is re-thrown.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        ReadHtml rh = new ReadHtml();
        try
        {
            // The page is downloaded and stripped even though the stripped text
            // is not printed (the debug print of it is disabled). The call is
            // kept so that an unreachable page still aborts the run before the
            // XML file is processed.
            rh.htmlToStr(rh.getDocumentAt(PAGE_URL));

            Document document = loadXml(XML_FILE);

            System.out.print("标题: ");
            System.out.println(firstTitleText(document));
            System.out.println("正文: ");
            System.out.println(paragraphText(document));
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Parses an XML file into a DOM document, closing the file afterwards.
     *
     * @param path path of the XML file
     * @return the parsed document
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content is not well-formed XML
     * @throws ParserConfigurationException if no DOM parser can be created
     */
    private static Document loadXml(String path)
            throws IOException, SAXException, ParserConfigurationException
    {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        // NOTE(review): the file is decoded with the platform default charset,
        // which takes precedence over any encoding named in the XML declaration.
        // Confirm this matches how index.xml is written.
        BufferedReader xmlReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path)));
        try
        {
            return builder.parse(new InputSource(xmlReader));
        }
        finally
        {
            xmlReader.close();
        }
    }

    /**
     * Returns the value of the first child node of the first {@code <title>}
     * element.
     *
     * @param document parsed document
     * @return the title text, or an empty string when there is no
     *         {@code <title>} element, it has no children, or its first child
     *         carries no value
     */
    private static String firstTitleText(Document document)
    {
        NodeList titles = document.getElementsByTagName("title");
        if (titles.getLength() == 0)
        {
            return "";
        }
        Node firstChild = titles.item(0).getFirstChild();
        if (firstChild == null || firstChild.getNodeValue() == null)
        {
            return "";
        }
        return firstChild.getNodeValue();
    }

    /**
     * Concatenates, in document order, the non-null node values of the direct
     * children of every {@code <p>} element. Text nested inside child elements
     * of a paragraph is not included, because element nodes have no value.
     *
     * @param document parsed document
     * @return the concatenated text; empty when there are no {@code <p>} elements
     */
    private static String paragraphText(Document document)
    {
        StringBuilder body = new StringBuilder();
        NodeList paragraphs = document.getElementsByTagName("p");
        for (int i = 0; i < paragraphs.getLength(); i++)
        {
            NodeList children = paragraphs.item(i).getChildNodes();
            for (int j = 0; j < children.getLength(); j++)
            {
                String value = children.item(j).getNodeValue();
                if (value != null)
                {
                    body.append(value);
                }
            }
        }
        return body.toString();
    }

    /**
     * Downloads the resource at the given URL as text.
     *
     * <p>The body is decoded with the charset named in the response's
     * Content-Type header when one is present and supported, otherwise with
     * the platform default charset. Every line read is terminated with
     * {@code "\n"} in the result, whatever line terminator the server sent.
     *
     * @param urlString absolute URL of the page
     * @return the page content
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    private String getDocumentAt(String urlString) throws IOException
    {
        URLConnection connection = new URL(urlString).openConnection();
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(),
                charsetFromContentType(connection.getContentType())));
        try
        {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                content.append(line).append('\n');
            }
            return content.toString();
        }
        finally
        {
            // Close the stream even when reading fails part-way through.
            reader.close();
        }
    }

    /**
     * Extracts the charset from a Content-Type header value such as
     * {@code text/html; charset=GBK}.
     *
     * @param contentType header value, may be {@code null}
     * @return the named charset, or the platform default when the header is
     *         absent, names no charset, or names one this JVM does not support
     */
    private static Charset charsetFromContentType(String contentType)
    {
        if (contentType != null)
        {
            String[] parts = contentType.split(";");
            // parts[0] is the media type itself; parameters start at index 1.
            for (int i = 1; i < parts.length; i++)
            {
                String parameter = parts[i].trim();
                if (parameter.regionMatches(true, 0, "charset=", 0, CHARSET_PREFIX_LENGTH))
                {
                    String name = parameter.substring(CHARSET_PREFIX_LENGTH).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\""))
                    {
                        name = name.substring(1, name.length() - 1);
                    }
                    try
                    {
                        return Charset.forName(name);
                    }
                    catch (IllegalArgumentException ignored)
                    {
                        // Illegal or unsupported charset name: use the default below.
                        break;
                    }
                }
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Removes markup from an HTML string by dropping every character from a
     * {@code '<'} up to and including the next {@code '>'}.
     *
     * <p>This is a plain character scan, not an HTML parser: a stray
     * {@code '>'} outside a tag is dropped, everything after an unclosed
     * {@code '<'} is dropped, character entities are left undecoded, and text
     * between {@code <script>} or {@code <style>} tags is kept.
     *
     * @param htmlStr HTML text, may be {@code null}
     * @return the text with tags removed, or {@code null} when {@code htmlStr}
     *         is {@code null}
     */
    public String htmlToStr(String htmlStr)
    {
        if (htmlStr == null)
        {
            return null;
        }
        StringBuilder text = new StringBuilder(htmlStr.length());
        boolean outsideTag = true;
        for (int i = 0; i < htmlStr.length(); i++)
        {
            char c = htmlStr.charAt(i);
            if (c == '<')
            {
                outsideTag = false;
            }
            else if (c == '>')
            {
                outsideTag = true;
            }
            else if (outsideTag)
            {
                text.append(c);
            }
        }
        return text.toString();
    }

}

readhtml.java - 源码说明

本页面展示了「此源代码是实现一个网页净化的功能」中的 readhtml.java 源码文件，采用 Java 编程语言编写，共 143 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与源代码相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?