⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filepreprocess.java

📁 luence索引抓取时
💻 JAVA
字号:
package preprocess;
import java.io.*;
import java.util.*;
/**
 * Text-file preprocessing utilities: normalizes full-width punctuation to its
 * ASCII (half-width) counterpart and splits a large text file into a series of
 * smaller files.
 *
 * <p>All file reads and writes use the platform default charset, because they go
 * through {@link FileReader} and {@link FileWriter}.
 */
public class FilePreprocess {

	/** A chunk is flushed once it holds at least this many bytes (platform default charset). */
	private static final int MAX_CHUNK_BYTES = 10240;

	/** Line terminator appended to every line copied into a chunk file. */
	private static final String CHUNK_LINE_END = "\r\n";

	/** Name of the intermediate normalized file; it is created in the working directory. */
	private static final String INTERMEDIATE_FILE = "output.all";

	/** Full-width / CJK punctuation mapped to its half-width replacement; built once and read-only. */
	private static final Map<Character, Character> FULL_TO_HALF = buildPunctuationTable();

	/**
	 * Single entry point for preprocessing: normalizes the punctuation of {@code file}
	 * into the intermediate file {@code output.all} and then splits that file into
	 * small files whose names start with {@code outputDir}.
	 *
	 * <p>Failures are reported on standard error via {@code printStackTrace()} and are
	 * not propagated to the caller.
	 *
	 * @param file      the text file to preprocess
	 * @param outputDir prefix for the generated chunk files; it is concatenated as-is,
	 *                  so it must end with a path separator to denote a directory
	 */
	public static void preprocess(File file, String outputDir){
		try{
			File normalized = charactorProcess(file, INTERMEDIATE_FILE);
			splitToSmallFiles(normalized, outputDir);
		}catch(Exception e){
			// Best-effort by design: report the problem and return normally.
			e.printStackTrace();
		}
	}

	/**
	 * Splits a large text file into several small files named
	 * {@code outputpath + "output" + N + ".txt"}, with N counting up from 0.
	 *
	 * <p>Lines are never broken apart: whole lines are accumulated (each terminated
	 * with CR LF) and the accumulated text is written out as soon as its encoded size
	 * reaches {@code MAX_CHUNK_BYTES}. Whatever is left after the last line is always
	 * written to one final file, even when it is empty, so at least one file is
	 * produced for every call.
	 *
	 * @param file       the file to split
	 * @param outputpath prefix of the generated file names; concatenated as-is
	 * @throws IOException if the source cannot be read or a chunk cannot be written
	 */
	public static void splitToSmallFiles(File file,String outputpath) throws IOException{
		int chunkIndex = 0;
		int pendingBytes = 0;
		StringBuilder pending = new StringBuilder();
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try{
			String line;
			while((line = reader.readLine()) != null){
				String entry = line + CHUNK_LINE_END;
				pending.append(entry);
				// Keep a running byte count instead of re-encoding the whole pending text per line.
				pendingBytes += entry.getBytes().length;
				if(pendingBytes >= MAX_CHUNK_BYTES){
					writeChunk(outputpath, chunkIndex, pending.toString());
					chunkIndex++;
					pending.setLength(0);
					pendingBytes = 0;
				}
			}
		}finally{
			// Release the source file even when reading or writing fails part-way.
			reader.close();
		}
		writeChunk(outputpath, chunkIndex, pending.toString());
	}

	/** Writes {@code content} to chunk file number {@code index}, closing the file even on failure. */
	private static void writeChunk(String outputpath, int index, String content) throws IOException{
		BufferedWriter writer = new BufferedWriter(
				new FileWriter(outputpath + "output" + index + ".txt"));
		try{
			writer.write(content);
		}finally{
			writer.close();
		}
	}

	/**
	 * Copies {@code file} to {@code destFile} line by line, converting full-width
	 * punctuation to half-width on the way. Every output line is terminated with the
	 * platform line separator.
	 *
	 * <p>An {@link IOException} is printed to standard error and not rethrown; in that
	 * case the returned file may be missing, empty or only partially written.
	 *
	 * @param file     the source text file
	 * @param destFile path of the file to create (an existing file is overwritten)
	 * @return a {@link File} for {@code destFile}
	 */
	public static File charactorProcess(File file,String destFile){
		try {
			// The destination is opened before the source, so an old destFile is
			// truncated even when the source turns out to be unreadable.
			BufferedWriter writer = new BufferedWriter(new FileWriter(destFile));
			try {
				BufferedReader reader = new BufferedReader(new FileReader(file));
				try {
					String line;
					while((line = reader.readLine()) != null){
						writer.write(replace(line));
						writer.newLine();
					}
				} finally {
					reader.close();
				}
			} finally {
				writer.close();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		return new File(destFile);
	}

	/**
	 * Converts full-width punctuation (and circled digits 1-9) to half-width ASCII.
	 * Every mapping is one character to one character, so the result always has the
	 * same length as the input; characters without a mapping are copied unchanged.
	 *
	 * @param line the text to convert; must not be null
	 * @return the converted text
	 */
	private static String replace(String line){
		StringBuilder converted = new StringBuilder(line.length());
		for(int i = 0 ; i < line.length(); i++){
			char current = line.charAt(i);
			Character half = FULL_TO_HALF.get(current);
			converted.append(half != null ? half.charValue() : current);
		}
		return converted.toString();
	}

	/**
	 * Builds the punctuation table. Keys are written as Unicode escapes so the table
	 * does not depend on the encoding used to read this source file.
	 */
	private static Map<Character, Character> buildPunctuationTable(){
		Map<Character, Character> table = new HashMap<Character, Character>();
		table.put('\uFF0C', ',');   // U+FF0C FULLWIDTH COMMA
		table.put('\u3002', '.');   // U+3002 IDEOGRAPHIC FULL STOP
		table.put('\u3008', '<');   // U+3008 LEFT ANGLE BRACKET
		table.put('\u3009', '>');   // U+3009 RIGHT ANGLE BRACKET
		table.put('\u2016', '|');   // U+2016 DOUBLE VERTICAL LINE
		table.put('\u300A', '<');   // U+300A LEFT DOUBLE ANGLE BRACKET
		table.put('\u300B', '>');   // U+300B RIGHT DOUBLE ANGLE BRACKET
		table.put('\u3014', '[');   // U+3014 LEFT TORTOISE SHELL BRACKET
		table.put('\u3015', ']');   // U+3015 RIGHT TORTOISE SHELL BRACKET
		table.put('\uFE56', '?');   // U+FE56 SMALL QUESTION MARK
		table.put('\uFF1F', '?');   // U+FF1F FULLWIDTH QUESTION MARK
		table.put('\u201C', '"');   // U+201C LEFT DOUBLE QUOTATION MARK
		table.put('\u201D', '"');   // U+201D RIGHT DOUBLE QUOTATION MARK
		table.put('\uFF1A', ':');   // U+FF1A FULLWIDTH COLON
		table.put('\u3001', ',');   // U+3001 IDEOGRAPHIC COMMA
		table.put('\uFF08', '(');   // U+FF08 FULLWIDTH LEFT PARENTHESIS
		table.put('\uFF09', ')');   // U+FF09 FULLWIDTH RIGHT PARENTHESIS
		table.put('\u3010', '[');   // U+3010 LEFT BLACK LENTICULAR BRACKET
		table.put('\u3011', ']');   // U+3011 RIGHT BLACK LENTICULAR BRACKET
		table.put('\u2014', '-');   // U+2014 EM DASH
		table.put('\uFF5E', '~');   // U+FF5E FULLWIDTH TILDE
		table.put('\uFF01', '!');   // U+FF01 FULLWIDTH EXCLAMATION MARK
		table.put('\u2035', '\'');  // U+2035 REVERSED PRIME
		// U+2460..U+2468 CIRCLED DIGIT ONE..NINE become the plain digits 1..9.
		for(int digit = 1; digit <= 9; digit++){
			table.put((char)('\u2460' + digit - 1), (char)('0' + digit));
		}
		return Collections.unmodifiableMap(table);
	}

	/**
	 * Command-line entry point.
	 *
	 * @param args optional overrides: {@code args[0]} is the input file and
	 *             {@code args[1]} the output directory (with trailing separator);
	 *             the built-in sample paths are used for whichever is absent
	 */
	public static void main(String args[]){
		String inputFile = (args != null && args.length > 0)
				? args[0] : "D:/excise/lucene/2558.txt";
		String outputDir = (args != null && args.length > 1)
				? args[1] : "D:/excise/lucene/testfolder/";

		File outputFolder = new File(outputDir);
		if(!outputFolder.exists()){
			outputFolder.mkdirs();
		}
		FilePreprocess.preprocess(new File(inputFile), outputDir);
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -