📄 filerlocal.java

📁 可以进行网页的过滤
💻 JAVA
字号:

	

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class FilerLocal {

	    /**
	     * Reads a saved news page, prints its title and body text, and finally
	     * prints the elapsed time in milliseconds.
	     *
	     * @param args optional arguments: {@code args[0]} is the path of the HTML
	     *             file and {@code args[1]} its character encoding; when they
	     *             are absent the built-in sample path and GBK are used
	     * @throws Exception if the page cannot be parsed
	     */
	    public static void main(String[] args) throws Exception
	    {
	        long startTime = System.currentTimeMillis(); // ms

	        // Command-line arguments override the built-in sample file and encoding.
	        String aFile = (args != null && args.length > 0) ? args[0] : "d:\\cs\\china\\6.htm";
	        String encoding = (args != null && args.length > 1) ? args[1] : "GBK";
	        String content = readTextFile(aFile, encoding);

	        System.out.println("新闻标题是：");
	        testtitle(content);
	        System.out.println("新闻内容是：");
	        testcontent(content);
	        // No extraction step runs for this last heading, so it is printed
	        // without any text below it.
	        System.out.println("核心内容是:");

	        long endTime = System.currentTimeMillis();
	        System.out.println("time is " + (endTime - startTime));
	    }
	    
	    
	    
	    
	    /**
	     * Prints the title of a page.
	     *
	     * <p>The markup of every {@code <title>} element is cleaned with
	     * {@code replaceTitle}, concatenated, and the result of {@code getTitle}
	     * on that text is written to standard output.
	     *
	     * <p>Extraction is best effort: if parsing fails, the problem is reported
	     * on standard error and whatever was collected so far is still printed.
	     *
	     * @param content HTML of the page
	     * @throws ParserException declared for callers; parse failures are handled
	     *                         inside this method
	     */
	    private static void testtitle(String content) throws ParserException{
	        StringBuffer titleText = new StringBuffer();
	        try
	        {
	            Parser myParser = new Parser(content);
	            NodeFilter filter = new TagNameFilter("title");
	            NodeList nl = myParser.extractAllNodesThatMatch(filter);
	            for (int i = 0; i < nl.size(); i++)
	            {
	                titleText.append(replaceTitle(nl.elementAt(i).toHtml()));
	            }
	        }
	        catch (Exception e)
	        {
	            // Keep going with what we have, but do not hide the failure.
	            System.err.println("Failed to extract <title>: " + e);
	        }

	        System.out.println(getTitle(titleText.toString()));
	    }
	  
	  /**
	   * Prints the body text of a page, followed by a separator line.
	   *
	   * <p>All {@code <p>} elements are examined first. A paragraph that contains
	   * no {@code href=} is always printed. A paragraph with links is printed only
	   * when it contains no {@code <IMG} and {@code cleanLink} returns a non-zero
	   * value for it.
	   *
	   * <p>When the page has no {@code <p>} element at all, every {@code <div>}
	   * whose markup contains exactly one {@code <div} (that is, no nested div)
	   * is printed instead.
	   *
	   * <p>Printed text is first stripped with {@code replaceHtml} and then
	   * wrapped with {@code showContent}.
	   *
	   * @param content HTML of the page
	   * @throws ParserException if the HTML cannot be parsed
	   */
	  private static void testcontent(String content) throws ParserException{
	      Parser myParser = new Parser(content);
	      NodeFilter filter = new TagNameFilter("p");
	      NodeList nl = myParser.extractAllNodesThatMatch(filter);

	      if (nl.size() != 0)
	      {
	          for (int i = 0; i < nl.size(); i++)
	          {
	              String html = nl.elementAt(i).toHtml();
	              boolean hasLink = html.indexOf("href=") != -1;
	              boolean hasImage = html.indexOf("<IMG") != -1;

	              // Link-only paragraphs (navigation, "related" lists) are skipped;
	              // cleanLink is consulted only when its answer can matter.
	              if (!hasLink || (!hasImage && cleanLink(html) != 0))
	              {
	                  System.out.println(showContent(replaceHtml(html)));
	              }
	          }
	      }
	      else
	      {
	          // No paragraphs: fall back to the innermost <div> elements.
	          myParser.reset();
	          filter = new TagNameFilter("div");
	          nl = myParser.extractAllNodesThatMatch(filter);
	          for (int i = 0; i < nl.size(); i++)
	          {
	              String html = nl.elementAt(i).toHtml();
	              // Exactly one "<div" means the element has no nested div.
	              if (count(html, "<div") == 1)
	              {
	                  System.out.println(showContent(replaceHtml(html)));
	              }
	          }
	      }

	      System.out.println("==============================");
	  }
	  
		//输出核心提示
/*	  private static void outline(String content) throws ParserException{
		     int href;
		     //String sbStr="";
			
			 Parser myParser=new Parser(content);
	         NodeFilter filter = new TagNameFilter("ul");//
	         NodeList nl = myParser.extractAllNodesThatMatch(filter);
		 		for (int i = 0; i < nl.size(); i++)
		 			{
		 			    
		 			   String t=nl.elementAt(i).toHtml();
		 			   System.out.println(t);
		 			   href=t.indexOf("href=");
		 			   if(href==-1) 
		 			   {
	 				    //sbStr=sbStr+replaceHtml(nl.elementAt(i).toHtml());
		 				 System.out.println(replaceHtml(nl.elementAt(i).toHtml()));
		 			   }
		            } 			
		 		
		        
		       // return getTitle(sbStr);
		       // System.out.println(getTitle(sbStr));
	        }
	  
	  
	*/  
	   /**
	    * Counts how many sides of the linked region of an HTML fragment contain
	    * non-ASCII text.
	    *
	    * <p>The linked region runs from the first {@code <a} / {@code <A} to the
	    * last {@code </a>} / {@code </A>}. The text before it and the text after
	    * it are each checked for a non-ASCII character, and every side that has
	    * one adds 1 to the result.
	    *
	    * <p>When the fragment has no opening anchor, the whole fragment counts as
	    * the "before" side. When it has no closing anchor, the "after" side is
	    * empty.
	    *
	    * @param html HTML fragment to inspect
	    * @return 0, 1 or 2
	    */
	   public static int cleanLink(String html){
	       // Tags are matched in either letter case directly, so no normalised
	       // copy of the fragment is needed.
	       int upperOpen = html.indexOf("<A");
	       int lowerOpen = html.indexOf("<a");
	       int firstOpen;
	       if (upperOpen < 0) {
	           firstOpen = lowerOpen;
	       } else if (lowerOpen < 0) {
	           firstOpen = upperOpen;
	       } else {
	           firstOpen = Math.min(upperOpen, lowerOpen);
	       }
	       int lastClose = Math.max(html.lastIndexOf("</A>"), html.lastIndexOf("</a>"));

	       String before = (firstOpen < 0) ? html : html.substring(0, firstOpen);
	       // 4 == "</A>".length()
	       String after = (lastClose < 0) ? "" : html.substring(lastClose + 4);

	       int sin = 0;
	       if (hasNonAscii(before)) {
	           sin++;
	       }
	       if (hasNonAscii(after)) {
	           sin++;
	       }
	       return sin;
	   }

	   /** Returns true when the text contains at least one character above U+007F. */
	   private static boolean hasNonAscii(String text){
	       // Checking code units directly keeps the result independent of the
	       // platform default charset.
	       for (int i = 0; i < text.length(); i++) {
	           if (text.charAt(i) > 0x7F) {
	               return true;
	           }
	       }
	       return false;
	   }
	  
	  
		/**
		 * Reads a text file into a single string, with every line followed by
		 * one space instead of its line terminator.
		 *
		 * <p>Reading is best effort: if the file cannot be opened, the encoding
		 * is not supported, or a read fails, the problem is reported on standard
		 * error and the text read so far (possibly empty) is returned. The file
		 * is always closed.
		 *
		 * @param sFileName path of the file to read
		 * @param sEncode   name of the character encoding of the file
		 * @return the file content with line breaks replaced by spaces
		 */
		public static String readTextFile(String sFileName, String sEncode)
		{
			StringBuilder sbStr = new StringBuilder();
			FileInputStream in = null;

			try
			{
				in = new FileInputStream(new File(sFileName));
				BufferedReader ins = new BufferedReader(new InputStreamReader(in, sEncode));

				String dataLine;
				while ((dataLine = ins.readLine()) != null)
				{
					sbStr.append(dataLine);
					sbStr.append(" ");
				}
			}
			catch (Exception e)
			{
				System.err.println("Failed to read " + sFileName + ": " + e);
			}
			finally
			{
				// Closing the underlying stream releases the file even when
				// the reader could not be constructed.
				if (in != null)
				{
					try
					{
						in.close();
					}
					catch (Exception ignored)
					{
						// Nothing useful can be done if close itself fails.
					}
				}
			}

			return sbStr.toString();
		}
	    
	    	/**
	    	 * Returns the leading part of a title: the text that precedes the
	    	 * first '-' once the title has been cleaned with {@code replaceTitle}.
	    	 *
	    	 * @param str raw title, possibly containing markup
	    	 * @return the text before the first '-'; the whole cleaned title when
	    	 *         it has no '-'; an empty string when it starts with one
	    	 */
	    	public static String getTitle(String str)
	    	{
	    		String cleaned = replaceTitle(str);
	    		// indexOf is used instead of split("-")[0] because split returns an
	    		// empty array for a title that consists only of separators.
	    		int dash = cleaned.indexOf('-');
	    		return (dash < 0) ? cleaned : cleaned.substring(0, dash);
	    	}
		
		 /**
		  * Hard-wraps text for display: the text is cut into consecutive pieces
		  * of at most 40 characters and each piece is followed by "\r\n".
		  *
		  * @param content text to wrap
		  * @return the wrapped text; an empty string when {@code content} is empty
		  */
		 public   static   String showContent(String content)
		 {
			 final int width = 40; // characters per output line
			 final int total = content.length();
			 StringBuilder wrapped = new StringBuilder();

			 for (int from = 0; from < total; from += width)
			 {
				 int to = Math.min(from + width, total);
				 wrapped.append(content, from, to).append("\r\n");
			 }

			 return wrapped.toString();
		 }
		 
		 /**
		  * Normalises anchor tags to upper case: every {@code <a} becomes
		  * {@code <A} and every {@code </a>} becomes {@code </A>}.
		  *
		  * @param html HTML fragment
		  * @return the fragment with upper-case anchor tags
		  */
		 public static String replaceLink(String html)
		   {
			 return html
					 .replaceAll("<a|<A", "<A")
					 .replaceAll("</a>|</A>","</A>");
		   }
		 /**
		  * Cleans title markup: removes everything that looks like a tag
		  * ({@code <...>}) and turns every '_' into '-'.
		  *
		  * @param html title markup
		  * @return the title text with '-' as the only separator
		  */
		 public   static   String replaceTitle(String   html)
	        {
	        String withoutTags = html.replaceAll("<.*?>","");
	        String unifiedSeparators = withoutTags.replaceAll("-|_", "-");
	        return unifiedSeparators;
	        }
		 
		 
         /**
          * Reduces an HTML fragment to its visible text.
          *
          * <p>The entities {@code &gt;}, {@code &nbsp;}, {@code &middot;} and
          * {@code &quot;} are deleted (not decoded). Script and style elements
          * are removed together with their content, comments are removed, and
          * every remaining tag is stripped.
          *
          * <p>Script and style elements are matched one at a time and in any
          * letter case, so text between two such elements is kept. All patterns
          * also match across line breaks.
          *
          * @param html HTML fragment
          * @return the text content of the fragment
          */
         public   static   String replaceHtml(String   html){
             html = html.replace("&gt;", "");
             html = html.replace("&gt", "");
             html = html.replace("&nbsp;", "");
             // (?is): case-insensitive, '.' also matches line terminators.
             // The reluctant ".*?" stops at the first closing tag.
             html = html.replaceAll("(?is)<script[^>]*>.*?</script>", "");
             html = html.replaceAll("(?is)<style[^>]*>.*?</style>", "");
             html = html.replaceAll("(?s)<!--.*?-->", "");
             html = html.replaceAll("(?s)<.*?>", "");
             html = html.replace("&middot;", "");
             html = html.replace("&quot;", "");
             return html;
         }
         /**
          * Counts the non-overlapping occurrences of one string inside another,
          * scanning from left to right.
          *
          * @param text string to search in
          * @param sub  string to look for
          * @return number of occurrences of {@code sub} in {@code text};
          *         0 when {@code sub} is empty
          * @author : zhuzhu
          */
         public static int count(String text,String sub){
             // An empty pattern matches at every position without advancing
             // the scan, so it is answered up front instead of looping forever.
             if (sub.length() == 0) {
                 return 0;
             }
             int count = 0;
             int start = 0;
             while ((start = text.indexOf(sub, start)) >= 0) {
                 start += sub.length();
                 count++;
             }
             return count;
         }
}
💿 文件大小 260 K
👤 上传用户 KuFly
📂 所属分类 Java编程
🏷️ 相关标签

#页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -