⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filepreprocess.java

📁 对网页进行解析并抓取
💻 JAVA
字号:
package ch2.lucenedemo.preprocess;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

/**
 * Prepares a text file for later processing in two steps: CJK / fullwidth
 * punctuation is replaced by ASCII equivalents, and the normalised text is
 * then split into numbered chunk files of roughly {@code MAX_CHUNK_BYTES}
 * bytes each.
 *
 * <p>All file I/O uses the platform default charset, exactly as
 * {@link FileReader}, {@link FileWriter} and {@link String#getBytes()} do.
 */
public class FilePreprocess {

	/** Name of the intermediate file; it is appended directly to the output directory string. */
	private static final String TEMP_FILE_NAME = "output.all";

	/** Buffered size, in bytes of the platform default charset, at which a chunk is flushed. */
	private static final int MAX_CHUNK_BYTES = 10240;

	/**
	 * Punctuation translation table (source character to ASCII replacement).
	 * Keys are written as Unicode escapes so the table does not depend on the
	 * encoding the compiler assumes for this source file.
	 *
	 * <p>NOTE(review): in the reviewed copy of this file the keys for U+FF0C,
	 * U+FF1F, U+FF1A, U+FF08, U+FF09, U+FF5E and U+FF01 appeared as their ASCII
	 * twins, so those entries mapped a character to itself and had no effect.
	 * They are assumed to have been fullwidth forms damaged by transcoding -
	 * confirm against the original source.
	 */
	private static final Map<Character, Character> PUNCTUATION = new HashMap<>();

	static {
		PUNCTUATION.put('\uFF0C', ',');  // fullwidth comma
		PUNCTUATION.put('\u3002', '.');  // ideographic full stop
		PUNCTUATION.put('\u3008', '<');  // left angle bracket
		PUNCTUATION.put('\u3009', '>');  // right angle bracket
		PUNCTUATION.put('\u2016', '|');  // double vertical line
		PUNCTUATION.put('\u300A', '<');  // left double angle bracket
		PUNCTUATION.put('\u300B', '>');  // right double angle bracket
		PUNCTUATION.put('\u3014', '[');  // left tortoise shell bracket
		PUNCTUATION.put('\u3015', ']');  // right tortoise shell bracket
		PUNCTUATION.put('\uFE56', '?');  // small question mark
		PUNCTUATION.put('\uFF1F', '?');  // fullwidth question mark
		PUNCTUATION.put('\u201C', '"');  // left double quotation mark
		PUNCTUATION.put('\u201D', '"');  // right double quotation mark
		PUNCTUATION.put('\uFF1A', ':');  // fullwidth colon
		PUNCTUATION.put('\u3001', ',');  // ideographic comma
		PUNCTUATION.put('\uFF08', '(');  // fullwidth left parenthesis
		PUNCTUATION.put('\uFF09', ')');  // fullwidth right parenthesis
		PUNCTUATION.put('\u3010', '[');  // left black lenticular bracket
		PUNCTUATION.put('\u3011', ']');  // right black lenticular bracket
		PUNCTUATION.put('\u2014', '-');  // em dash
		PUNCTUATION.put('\uFF5E', '~');  // fullwidth tilde
		PUNCTUATION.put('\uFF01', '!');  // fullwidth exclamation mark
		PUNCTUATION.put('\u2035', '\''); // reversed prime
		PUNCTUATION.put('\u2460', '1');  // circled digit one
		PUNCTUATION.put('\u2461', '2');  // circled digit two
		PUNCTUATION.put('\u2462', '3');  // circled digit three
		PUNCTUATION.put('\u2463', '4');  // circled digit four
		PUNCTUATION.put('\u2464', '5');  // circled digit five
		PUNCTUATION.put('\u2465', '6');  // circled digit six
		PUNCTUATION.put('\u2466', '7');  // circled digit seven
		PUNCTUATION.put('\u2467', '8');  // circled digit eight
		PUNCTUATION.put('\u2468', '9');  // circled digit nine
	}

	/**
	 * Normalises the punctuation of {@code file} and splits the result into
	 * chunk files named {@code output<N>.txt}.
	 *
	 * <p>Any failure is reported with {@code printStackTrace()} and not
	 * rethrown. The intermediate {@code output.all} file is removed whether or
	 * not processing succeeded.
	 *
	 * @param file      the text file to process
	 * @param outputDir prefix for every generated path; file names are appended
	 *                  to it verbatim, so it must end with a path separator to
	 *                  denote a directory
	 */
	public static void preprocess(File file, String outputDir) {
		String tempPath = outputDir + TEMP_FILE_NAME;
		try {
			splitToSmallFiles(charactorProcess(file, tempPath), outputDir);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Clean up in finally so a failed run does not leave the temp file behind.
			File tempFile = new File(tempPath);
			if (tempFile.exists() && !tempFile.delete()) {
				System.err.println("Could not delete temporary file: " + tempPath);
			}
		}
	}

	/**
	 * Copies {@code file} to {@code destFile} line by line, replacing every
	 * character found in the punctuation table with its ASCII equivalent. Each
	 * output line is terminated with the platform line separator.
	 *
	 * @param file     the text file to read
	 * @param destFile path of the file to create or overwrite
	 * @return a {@link File} for {@code destFile}
	 * @throws Exception if the input cannot be read or the output cannot be written
	 */
	public static File charactorProcess(File file, String destFile)
			throws Exception {

		// The reader is opened first so that a missing input does not create an
		// empty destination file; both streams are closed even when I/O fails.
		try (BufferedReader reader = new BufferedReader(new FileReader(file));
				BufferedWriter writer = new BufferedWriter(new FileWriter(destFile))) {
			String line;
			while ((line = reader.readLine()) != null) {
				writer.write(replace(line));
				writer.newLine();
			}
		}

		return new File(destFile);
	}

	/**
	 * Splits {@code file} into files named {@code output0.txt},
	 * {@code output1.txt}, ... Lines are joined with {@code "\r\n"} and a chunk
	 * is written as soon as the buffered text reaches {@code MAX_CHUNK_BYTES}
	 * bytes, so lines are never broken across chunks. Whatever remains after
	 * the last line goes into a final chunk. An empty input still produces an
	 * empty {@code output0.txt}, but no empty trailing chunk is created after
	 * at least one chunk has been written.
	 *
	 * @param file       the text file to split
	 * @param outputpath prefix prepended verbatim to each chunk file name
	 * @throws IOException if reading the input or writing a chunk fails
	 */
	public static void splitToSmallFiles(File file, String outputpath) throws IOException {

		int chunkIndex = 0;
		StringBuilder chunk = new StringBuilder();

		try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
			String line;
			while ((line = reader.readLine()) != null) {
				chunk.append(line).append("\r\n");
				// The threshold is measured in encoded bytes, not in characters.
				if (chunk.toString().getBytes().length >= MAX_CHUNK_BYTES) {
					writeChunk(outputpath, chunkIndex, chunk.toString());
					chunkIndex++;
					chunk.setLength(0);
				}
			}
		}

		// Skip the remainder when it is empty and a chunk already exists.
		if (chunk.length() > 0 || chunkIndex == 0) {
			writeChunk(outputpath, chunkIndex, chunk.toString());
		}
	}

	/**
	 * Writes {@code content} to {@code outputpath + "output" + index + ".txt"},
	 * closing the file even if the write fails.
	 */
	private static void writeChunk(String outputpath, int index, String content)
			throws IOException {
		try (BufferedWriter writer = new BufferedWriter(
				new FileWriter(outputpath + "output" + index + ".txt"))) {
			writer.write(content);
		}
	}

	/**
	 * Returns {@code line} with every character present in the punctuation
	 * table replaced by its ASCII equivalent; other characters are copied
	 * unchanged. Runs in a single pass over the line.
	 */
	private static String replace(String line) {
		StringBuilder result = new StringBuilder(line.length());
		for (int i = 0; i < line.length(); i++) {
			char current = line.charAt(i);
			Character mapped = PUNCTUATION.get(current);
			result.append(mapped != null ? mapped.charValue() : current);
		}
		return result.toString();
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -