📄 filepreprocess.java
字号:
package preprocess;
import java.io.*;
import java.util.*;
/**
 * Pre-processes a text file in two steps: full-width (CJK) punctuation is converted to its
 * ASCII counterpart, then the converted text is split into a series of small files.
 *
 * <p>All reading and writing uses the platform default charset.
 */
public class FilePreprocess {

    /** A chunk is flushed to disk once its encoded size reaches this many bytes. */
    private static final int MAX_CHUNK_BYTES = 10240;

    /** Intermediate file (relative to the working directory) holding the converted text. */
    private static final String INTERMEDIATE_FILE = "output.all";

    /** Full-width / CJK punctuation mapped to the ASCII character that replaces it. */
    private static final Map<Character, Character> HALF_WIDTH = buildHalfWidthTable();

    /**
     * Single entry point: converts the punctuation of {@code file} and splits the result
     * into small files whose names start with {@code outputDir}.
     *
     * <p>Failures are reported on standard error and are not propagated to the caller.
     *
     * @param file      the text file to pre-process
     * @param outputDir path prefix for the generated files; must end with a file separator
     *                  when it denotes a directory, because it is concatenated as-is
     */
    public static void preprocess(File file, String outputDir) {
        try {
            File converted = charactorProcess(file, INTERMEDIATE_FILE);
            splitToSmallFiles(converted, outputDir);
        } catch (Exception e) {
            // Best-effort by design: report the problem and return normally.
            e.printStackTrace();
        }
    }

    /**
     * Splits one large file into several small files.
     *
     * <p>Lines are accumulated (each terminated with CR LF) until the accumulated text is at
     * least 10240 bytes in the platform default charset; the chunk is then written to
     * {@code outputpath + "output" + index + ".txt"}, with the index starting at 0. Whatever
     * remains after the last line is always written as a final chunk, even when it is empty.
     *
     * @param file       the file to split
     * @param outputpath prefix prepended verbatim to every generated file name
     * @throws IOException if the input cannot be read or a chunk cannot be written
     */
    public static void splitToSmallFiles(File file, String outputpath) throws IOException {
        int chunkIndex = 0;
        StringBuilder pending = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                pending.append(line).append("\r\n");
                String chunk = pending.toString();
                // The threshold is measured in encoded bytes, not in characters.
                if (chunk.getBytes().length >= MAX_CHUNK_BYTES) {
                    writeChunk(outputpath, chunkIndex, chunk);
                    chunkIndex++;
                    pending.setLength(0);
                }
            }
        } finally {
            // Release the input even when reading or writing a chunk fails.
            reader.close();
        }
        writeChunk(outputpath, chunkIndex, pending.toString());
    }

    /** Writes {@code content} to the chunk file numbered {@code index}, always closing it. */
    private static void writeChunk(String outputpath, int index, String content)
            throws IOException {
        BufferedWriter writer =
                new BufferedWriter(new FileWriter(outputpath + "output" + index + ".txt"));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }

    /**
     * Copies {@code file} to {@code destFile} line by line, converting full-width punctuation
     * to ASCII on the way. Every output line ends with the platform line separator.
     *
     * <p>An {@link IOException} is reported on standard error and not rethrown; in that case
     * the destination may be empty or incomplete.
     *
     * @param file     the file to read
     * @param destFile path of the file to create or overwrite
     * @return a {@link File} for {@code destFile}, whether or not the copy succeeded
     */
    public static File charactorProcess(File file, String destFile) {
        BufferedWriter writer = null;
        BufferedReader reader = null;
        try {
            // The destination is opened first so that it is truncated even when the input
            // is missing; a stale result from an earlier run is never left behind.
            writer = new BufferedWriter(new FileWriter(destFile));
            reader = new BufferedReader(new FileReader(file));
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(replace(line));
                writer.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(reader);
            closeAndReport(writer);
        }
        return new File(destFile);
    }

    /** Closes {@code resource} if it was opened, reporting (not throwing) a close failure. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            // Closing a writer flushes it, so a failure here can mean lost output.
            e.printStackTrace();
        }
    }

    /**
     * Converts full-width punctuation and circled digits in {@code line} to ASCII.
     *
     * @param line the text to convert
     * @return a string of the same length with every mapped character replaced
     */
    private static String replace(String line) {
        StringBuilder converted = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            char current = line.charAt(i);
            Character mapped = HALF_WIDTH.get(current);
            if (mapped != null) {
                converted.append(mapped.charValue());
            } else {
                converted.append(current);
            }
        }
        return converted.toString();
    }

    /**
     * Builds the conversion table once. Keys are written as Unicode escapes so the table
     * does not depend on the encoding the source file is saved or compiled in.
     */
    private static Map<Character, Character> buildHalfWidthTable() {
        Map<Character, Character> table = new HashMap<Character, Character>();
        table.put('\uFF0C', ',');  // fullwidth comma
        table.put('\u3002', '.');  // ideographic full stop
        table.put('\u3008', '<');  // left angle bracket
        table.put('\u3009', '>');  // right angle bracket
        table.put('\u2016', '|');  // double vertical line
        table.put('\u300A', '<');  // left double angle bracket
        table.put('\u300B', '>');  // right double angle bracket
        table.put('\u3014', '[');  // left tortoise shell bracket
        table.put('\u3015', ']');  // right tortoise shell bracket
        table.put('\uFE56', '?');  // small question mark
        table.put('\uFF1F', '?');  // fullwidth question mark
        table.put('\u201C', '"');  // left double quotation mark
        table.put('\u201D', '"');  // right double quotation mark
        table.put('\uFF1A', ':');  // fullwidth colon
        table.put('\u3001', ',');  // ideographic comma
        table.put('\uFF08', '(');  // fullwidth left parenthesis
        table.put('\uFF09', ')');  // fullwidth right parenthesis
        table.put('\u3010', '[');  // left black lenticular bracket
        table.put('\u3011', ']');  // right black lenticular bracket
        table.put('\u2014', '-');  // em dash
        table.put('\uFF5E', '~');  // fullwidth tilde
        table.put('\uFF01', '!');  // fullwidth exclamation mark
        table.put('\u2035', '\''); // reversed prime
        table.put('\u2460', '1');  // circled digit one
        table.put('\u2461', '2');  // circled digit two
        table.put('\u2462', '3');  // circled digit three
        table.put('\u2463', '4');  // circled digit four
        table.put('\u2464', '5');  // circled digit five
        table.put('\u2465', '6');  // circled digit six
        table.put('\u2466', '7');  // circled digit seven
        table.put('\u2467', '8');  // circled digit eight
        table.put('\u2468', '9');  // circled digit nine
        return table;
    }

    /** Runs the pre-processing on a fixed sample input and output directory. */
    public static void main(String args[]) {
        String inputFile = "D:/excise/lucene/2558.txt";
        String outputDir = "D:/excise/lucene/testfolder/";
        File outputFolder = new File(outputDir);
        if (!outputFolder.exists() && !outputFolder.mkdirs()) {
            System.err.println("Could not create output directory: " + outputDir);
            return;
        }
        FilePreprocess.preprocess(new File(inputFile), outputDir);
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -