⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 txthandler.java

📁 java实现的全文搜索引擎
💻 JAVA
字号:
/**
 * Reads .txt files with java.io and adds their contents to the search index.
 */
package cn.edu.nju.software.ruse;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import org.tartarus.snowball.SnowballStemmer;

/**
 * Indexes the {@code .txt} files known to an {@link Index}.
 *
 * <p>{@link #processTxt()} selects the {@code .txt} files from the array
 * returned by {@code index.getFileNameIndex()} and fills three of the index's
 * maps:
 * <ul>
 *   <li>the modification-time index (formatted time -&gt; files),</li>
 *   <li>the file-size index (length in bytes -&gt; files),</li>
 *   <li>the contents index (term -&gt; file -&gt; term positions).</li>
 * </ul>
 *
 * <p>Stemming (Snowball) is currently disabled: terms are indexed exactly as
 * they appear in the text, lower-cased.
 *
 * @author spring
 */
public class TxtHandler implements ProcessTxt {

	/** A term is a maximal run of ASCII letters. */
	private static final Pattern TERM_PATTERN = Pattern.compile("[a-zA-Z]+");

	/** Accepts file names that end in ".txt" (case-sensitive). */
	private static final Pattern TXT_NAME_PATTERN = Pattern.compile(".*\\.txt");

	/**
	 * Candidate files, captured from the index when this handler is created.
	 * NOTE(review): getFileNameIndex() is defined elsewhere; a null array is
	 * tolerated defensively in parseFiles() -- confirm whether it can occur.
	 */
	private final File[] candidates;

	/**
	 * A list to hold all the .txt files.
	 */
	private final List<File> txtFiles = new LinkedList<File>();

	private final Index index;

	/**
	 * Creates a handler working on the given index.
	 *
	 * @param index source of the candidate files and owner of the maps that
	 *              {@link #processTxt()} fills
	 */
	public TxtHandler(Index index) {
		this.candidates = index.getFileNameIndex();
		this.index = index;
	}

	/**
	 * Selects the .txt files, then fills the modification-time, file-size and
	 * contents indexes for them.
	 *
	 * <p>In the contents index every term maps to the files containing it, and
	 * for each file to the set of zero-based term positions. Positions are
	 * counted over the whole file, not per line. A file that cannot be read is
	 * reported on standard error and skipped; the remaining files are still
	 * processed.
	 */
	public void processTxt() {
		parseFiles();
		initModTimeIndex();
		initFileSizeIndex();

		HashMap<String, HashMap<File,HashSet<Integer>>> fileContentsIndex = index.getFileContentsIndex();
		for(File file:txtFiles) {
			indexFileContents(file, fileContentsIndex);
		}
	}

	/**
	 * Cull other files but .txt.
	 *
	 * <p>The list of .txt files is rebuilt from scratch on every call, so
	 * calling this method more than once (directly or through
	 * {@link #processTxt()}) does not produce duplicate entries.
	 */
	public void parseFiles() {
		txtFiles.clear();
		if(candidates == null) {
			return;
		}
		for(File candidate:candidates) {
			if(candidate != null && TXT_NAME_PATTERN.matcher(candidate.getName()).matches()) {
				txtFiles.add(candidate);
			}
		}
	}

	/**
	 * Reads one file line by line and records every term occurrence in the
	 * contents index. The reader is always closed, even when reading fails.
	 */
	private static void indexFileContents(File file,
			HashMap<String, HashMap<File,HashSet<Integer>>> fileContentsIndex) {
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(file));
			int position = 0;
			String line;
			while((line = reader.readLine()) != null) {
				// Fixed locale: the default one may map ASCII letters outside
				// [a-z] (e.g. Turkish 'I' -> dotless i) and split terms.
				Matcher matcher = TERM_PATTERN.matcher(line.toLowerCase(Locale.ENGLISH));
				while(matcher.find()) {
					addPosition(fileContentsIndex, matcher.group(), file, position);
					position++;
				}
			}
		} catch (IOException e) {
			// Best effort: report the failing file and go on with the next one.
			// FileNotFoundException is an IOException and is handled here too.
			System.err.println("TxtHandler: could not index " + file);
			e.printStackTrace();
		} finally {
			if(reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
		}
	}

	/**
	 * Records that the term occurs in the file at the given position, creating
	 * the per-term map and the per-file set on first use.
	 */
	private static void addPosition(HashMap<String, HashMap<File,HashSet<Integer>>> fileContentsIndex,
			String term, File file, int position) {
		HashMap<File,HashSet<Integer>> filesForTerm = fileContentsIndex.get(term);
		if(filesForTerm == null) {
			filesForTerm = new HashMap<File,HashSet<Integer>>();
			fileContentsIndex.put(term, filesForTerm);
		}
		HashSet<Integer> positions = filesForTerm.get(file);
		if(positions == null) {
			positions = new HashSet<Integer>();
			filesForTerm.put(file, positions);
		}
		positions.add(Integer.valueOf(position));
	}

	/**
	 * Adds the file to the set stored under the key, creating the set on first
	 * use. Adding a file that is already present has no effect.
	 */
	private static <K> void addFile(HashMap<K, HashSet<File>> target, K key, File file) {
		HashSet<File> files = target.get(key);
		if(files == null) {
			files = new HashSet<File>();
			target.put(key, files);
		}
		files.add(file);
	}

	/**
	 * Initial the ModTimeIndex
	 * Map(modTime, Set[filename])
	 *
	 * <p>The key is the file's last-modified time as formatted by the
	 * formatter obtained from {@code DateFormat.getDateFormat()}.
	 * NOTE(review): DateFormat is not imported from java.text here, so it is
	 * presumably a class of this package -- confirm the format it produces.
	 */
	private void initModTimeIndex() {
		DateFormat df = DateFormat.getDateFormat();
		HashMap<String, HashSet<File>> modTimeIndex = index.getModTimeIndex();
		for(File file:txtFiles) {
			String dateString = df.format(file.lastModified());
			addFile(modTimeIndex, dateString, file);
		}
	}

	/**
	 * Initial the FileSizeIndex
	 * Map(size, Set[filename])
	 *
	 * <p>The key is the value of {@link File#length()} for each .txt file.
	 */
	private void initFileSizeIndex() {
		HashMap<Long, HashSet<File>> fileSizeIndex = index.getFileSizeIndex();
		for(File file:txtFiles) {
			addFile(fileSizeIndex, Long.valueOf(file.length()), file);
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -