⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webparser.java

📁 java实现的web搜索引擎
💻 JAVA
字号:
/*
 * Strips tags, whitespace and some punctuation from the saved HTML source
 * and stores the remaining body text in a txt file.
 */
package com.catking.webSearcher;

import java.net.*;
import java.io.*;

/**
 * Extracts the text content of a saved HTML file and stores it in a plain-text file.
 *
 * <p>The input is the file named by {@code WebCrawler.htmlFileName}; the output is
 * written to {@code index.txt}. Everything before the first {@code <}, everything
 * between {@code <} and {@code >}, a fixed set of whitespace/punctuation characters,
 * the {@code &nbsp;} entity and the {@code //--} marker are dropped.
 */
public class WebParser {

	/** Name of the text file the extracted content is written to. */
	private static String txtFileName = "index.txt";

	/** Number of chars requested from the reader per read call. */
	private static final int READ_CHUNK_SIZE = 8096 * 2;

	/** HTML entity that is removed from the extracted text. */
	private static final String NBSP_ENTITY = "&nbsp;";

	/** Four-character marker that is removed from the extracted text. */
	private static final String SLASH_DASH_MARKER = "//--";

	/**
	 * Runs {@link #parser()}; on an I/O failure prints a message and exits with status 1.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		try {
			parser();
		} catch (IOException e) {
			System.out.println("下载失败");
			System.exit(1);
		}
	}

	/**
	 * Reads the whole HTML file, strips markup and filtered characters, and writes
	 * the remaining text to the output file.
	 *
	 * <p>NOTE(review): {@link FileReader}/{@link FileWriter} use the default charset;
	 * confirm that matches the encoding of the file produced by the crawler.
	 *
	 * @throws IOException if the HTML file cannot be read or the text file cannot be written
	 */
	public static void parser() throws IOException {
		String html = readAll(new File(WebCrawler.htmlFileName));
		String text = extractText(html);

		FileWriter writer = new FileWriter(new File(txtFileName));
		try {
			writer.write(text);
		} finally {
			// Close even when the write fails so the file handle is not leaked.
			writer.close();
		}
	}

	/** Reads the complete content of {@code file}, looping until end of stream. */
	private static String readAll(File file) throws IOException {
		StringBuilder content = new StringBuilder(READ_CHUNK_SIZE);
		char[] chunk = new char[READ_CHUNK_SIZE];
		FileReader reader = new FileReader(file);
		try {
			// A single read() may return fewer chars than the file holds, so keep
			// reading until -1 signals end of stream.
			int count;
			while ((count = reader.read(chunk)) != -1) {
				content.append(chunk, 0, count);
			}
		} finally {
			reader.close();
		}
		return content.toString();
	}

	/** Returns the characters of {@code html} that survive tag and character filtering. */
	private static String extractText(String html) {
		StringBuilder text = new StringBuilder(html.length());
		boolean beforeFirstTag = true;
		boolean insideTag = false;

		for (int i = 0; i < html.length(); ++i) {
			char c = html.charAt(i);

			if (c == '<') {
				// A tag opens; from now on leading non-markup no longer applies.
				insideTag = true;
				beforeFirstTag = false;
				continue;
			}
			if (beforeFirstTag) {
				continue; // nothing is kept until the first '<' has been seen
			}
			if (c == '>') {
				insideTag = false; // '>' itself is never part of the output
				continue;
			}
			if (insideTag) {
				continue;
			}
			if (isFilteredChar(c)) {
				continue;
			}
			// startsWith(prefix, offset) is bounds-safe, unlike indexing i+1..i+5 directly.
			if (html.startsWith(NBSP_ENTITY, i)) {
				i += NBSP_ENTITY.length() - 1;
				continue;
			}
			if (html.startsWith(SLASH_DASH_MARKER, i)) {
				i += SLASH_DASH_MARKER.length() - 1;
				continue;
			}

			// Per-character trace on stdout (char and its code point), kept so the
			// console output stays the same as before.
			System.out.print("*" + c);
			System.out.println((int) c + "*");
			text.append(c);
		}
		return text.toString();
	}

	/**
	 * Tells whether {@code c} is one of the characters removed from content:
	 * '.', '?', line feed, space, tab, U+3000 (ideographic space) or U+00B7 (middle dot).
	 */
	private static boolean isFilteredChar(char c) {
		return c == '.' || c == '?'
			|| c == '\n' || c == ' ' || c == '\t'
			|| c == '\u3000' || c == '\u00B7';
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -