⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webparserfilter.java

📁 Lucene+nutch一书的全部源码 测试源码 和几个简单的项目
💻 JAVA
字号:

package chapter2;

import java.io.*;
import java.net.*;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;

/**
 * Strips HTML markup from a text file and writes a space-segmented plain-text
 * version of the remaining content.
 *
 * <p>Processing happens in two passes:
 * <ol>
 *   <li>Everything between {@code <} and {@code >} is removed, together with
 *       spaces, tabs, line breaks and {@code &nbsp;} entities.</li>
 *   <li>Runs of ASCII letters and digits are kept together, a small set of
 *       punctuation marks is replaced by a space, and every other character
 *       is surrounded by spaces.</li>
 * </ol>
 *
 * <p>Files are read and written with the platform default charset, because
 * {@link FileReader} and {@link FileWriter} are used without an explicit one.
 */
public class WebParserFilter {

    /** Default input file used by {@link #ParserFilter()}. */
    private static final String src_File_Path = "D:\\workshop\\ch2\\htmlsrc.html";

    /** Default output file used by {@link #ParserFilter()}. */
    private static final String dst_File_Path = "D:\\workshop\\ch2\\puresrc.txt";

    /** Entity that is dropped from the text in the first pass. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /** Number of characters requested from the reader per read call. */
    private static final int READ_CHUNK_SIZE = 8096 * 2;

    /**
     * Runs the filter on the default input and output files. On an I/O
     * failure an error message is printed to standard error and the JVM
     * exits with status 1.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; I/O failures are handled here
     */
    public static void main(String[] args) throws IOException {
        try {
            ParserFilter();
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            // Also report the underlying cause so the failure can be diagnosed.
            System.err.println(e);
            System.exit(1);
        }
    }

    /**
     * Filters the default source file into the default destination file.
     *
     * @throws IOException if the source cannot be read or the destination cannot be written
     */
    public static void ParserFilter() throws IOException {
        ParserFilter(src_File_Path, dst_File_Path);
    }

    /**
     * Reads {@code srcPath} completely, strips markup, segments the remaining
     * text, prints the result to standard output and writes it to
     * {@code dstPath}.
     *
     * @param srcPath path of the HTML file to read
     * @param dstPath path of the text file to create or overwrite
     * @throws IOException if the source cannot be read or the destination cannot be written
     */
    public static void ParserFilter(String srcPath, String dstPath) throws IOException {
        // The source is read (and its reader closed) before the destination is
        // opened, so a missing source never creates or truncates the output.
        String html = readAll(new File(srcPath));
        String segmented = segment(stripMarkup(html));

        FileWriter writer = new FileWriter(new File(dstPath));
        try {
            System.out.println(segmented);
            writer.write(segmented);
        } finally {
            writer.close();
        }
    }

    /** Reads the whole file into a string, looping until end of stream. */
    private static String readAll(File file) throws IOException {
        StringBuffer text = new StringBuffer(READ_CHUNK_SIZE);
        char[] chunk = new char[READ_CHUNK_SIZE];
        FileReader reader = new FileReader(file);
        try {
            int count;
            // A single read() may return fewer characters than the file holds.
            while ((count = reader.read(chunk)) != -1) {
                text.append(chunk, 0, count);
            }
        } finally {
            reader.close();
        }
        return text.toString();
    }

    /** Removes tags, whitespace and {@code &nbsp;} entities from {@code html}. */
    private static String stripMarkup(String html) {
        StringBuffer content = new StringBuffer(html.length());
        boolean insideTag = false;
        for (int i = 0; i < html.length(); i++) {
            char c = html.charAt(i);
            if (insideTag) {
                // Nothing inside a tag is kept, including the closing '>'.
                if (c == '>') {
                    insideTag = false;
                }
                continue;
            }
            if (c == '<') {
                insideTag = true;
                continue;
            }
            if (c == '\n' || c == '\r' || c == ' ' || c == '\t') {
                continue;
            }
            // startsWith is bounds-safe, so an '&' near the end of the input
            // is treated as ordinary content instead of reading past the data.
            if (html.startsWith(NBSP_ENTITY, i)) {
                i += NBSP_ENTITY.length() - 1;
                continue;
            }
            content.append(c);
        }
        return content.toString();
    }

    /** Inserts spaces so that ASCII alphanumeric runs stay whole and other characters stand alone. */
    private static String segment(String text) {
        StringBuffer out = new StringBuffer(text.length() * 2);
        // True at the start and after an ASCII letter/digit; false after a
        // character that was emitted on its own (which already has a
        // trailing space).
        boolean afterAlnum = true;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (isAsciiAlnum(c)) {
                // English letters and digits are not split apart.
                if (!afterAlnum) {
                    out.append(' ');
                }
                out.append(c);
                afterAlnum = true;
            } else if (isDroppedPunctuation(c)) {
                // Filtered punctuation becomes a space and leaves the state untouched.
                out.append(' ');
            } else {
                // Any other character (e.g. Chinese) is separated by spaces.
                if (afterAlnum) {
                    out.append(' ');
                }
                out.append(c);
                out.append(' ');
                afterAlnum = false;
            }
        }
        return out.toString();
    }

    /** Returns whether {@code c} is one of A-Z, a-z or 0-9. */
    private static boolean isAsciiAlnum(char c) {
        return (c >= 'A' && c <= 'Z')
            || (c >= 'a' && c <= 'z')
            || (c >= '0' && c <= '9');
    }

    /** Returns whether {@code c} is a punctuation mark that is replaced by a space. */
    private static boolean isDroppedPunctuation(char c) {
        return c == '、' || c == '|'
            || c == '”' || c == ':'
            || c == ';' || c == '.';
    }
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -