📄 dochandler.java

📁 java实现的全文搜索引擎
💻 JAVA
字号:
/**
 * Uses Apache POI to read Word .doc files.
 */
package cn.edu.nju.software.ruse;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.hwpf.extractor.WordExtractor;

/**
 * Indexes Word {@code .doc} files with Apache POI's {@link WordExtractor}.
 *
 * <p>From the file list supplied by the {@link Index}, the handler keeps the
 * files whose name ends in {@code .doc} and fills three of the index's maps:
 * <ul>
 *   <li>contents index: term -&gt; (file -&gt; set of term positions),</li>
 *   <li>modification-time index: formatted date -&gt; set of files,</li>
 *   <li>file-size index: size in bytes -&gt; set of files.</li>
 * </ul>
 *
 * @author spring
 */
public class DocHandler implements ProcessDoc {

	/** One indexed term: a maximal run of ASCII letters. */
	private static final Pattern TERM_PATTERN = Pattern.compile("[a-zA-Z]+");

	/** File names ending in ".doc" (case-sensitive, as in the original filter). */
	private static final Pattern DOC_NAME_PATTERN = Pattern.compile(".*\\.doc");

	/**
	 * The whole file list; the .doc files are picked out of it by
	 * {@link #parseFiles()}.
	 */
	private final File[] allFiles;

	/**
	 * The .doc files selected from {@link #allFiles}.
	 */
	private final List<File> docFiles = new LinkedList<File>();

	/** The index whose maps are filled by this handler. */
	private final Index index;

	/**
	 * Creates a handler working on the file list of the given index.
	 *
	 * @param index the index that supplies the file list and receives the results
	 */
	public DocHandler(Index index) {
		this.allFiles = index.getFileNameIndex();
		this.index = index;
	}

	/**
	 * Selects the .doc files, records their modification time and size, and
	 * then adds every term of every document to the contents index.
	 *
	 * <p>A file that cannot be read is reported on standard error and skipped;
	 * the remaining files are still processed.
	 */
	public void processDoc() {
		parseFiles();
		initModTimeIndex();
		initFileSizeIndex();

		HashMap<String, HashMap<File, HashSet<Integer>>> fileContentsIndex = index.getFileContentsIndex();
		for (File item : docFiles) {
			indexFileContents(item, fileContentsIndex);
		}
	}

	/**
	 * Extracts the text of one .doc file and records the position of each term.
	 *
	 * <p>Terms are lower-cased and indexed verbatim (no stemming is applied).
	 * The position of a term is its zero-based ordinal among the terms of the
	 * document, not a character offset.
	 *
	 * @param file the document to read
	 * @param fileContentsIndex the term index to update
	 */
	private void indexFileContents(File file,
			HashMap<String, HashMap<File, HashSet<Integer>>> fileContentsIndex) {
		BufferedInputStream in = null;
		try {
			in = new BufferedInputStream(new FileInputStream(file));
			WordExtractor extractor = new WordExtractor(in);
			// Locale.ROOT keeps lower-casing independent of the default locale
			// (e.g. Turkish maps 'I' to a dotless i that TERM_PATTERN rejects).
			String text = extractor.getText().toLowerCase(java.util.Locale.ROOT);
			Matcher matcher = TERM_PATTERN.matcher(text);
			int position = 0;
			while (matcher.find()) {
				addPosition(fileContentsIndex, matcher.group(), file, position);
				position++;
			}
		} catch (IOException e) {
			// Also covers FileNotFoundException; best effort: report and go on.
			e.printStackTrace();
		} finally {
			// Close on every path so a failed extraction does not leak the handle.
			closeAndReport(in);
		}
	}

	/**
	 * Records that {@code term} occurs in {@code file} at {@code position},
	 * creating the intermediate map and set on first use.
	 */
	private static void addPosition(HashMap<String, HashMap<File, HashSet<Integer>>> contentsIndex,
			String term, File file, int position) {
		HashMap<File, HashSet<Integer>> postings = contentsIndex.get(term);
		if (postings == null) {
			postings = new HashMap<File, HashSet<Integer>>();
			contentsIndex.put(term, postings);
		}
		HashSet<Integer> positions = postings.get(file);
		if (positions == null) {
			positions = new HashSet<Integer>();
			postings.put(file, positions);
		}
		positions.add(Integer.valueOf(position));
	}

	/**
	 * Closes the stream if it was opened; a failure to close is reported on
	 * standard error like any other I/O problem.
	 */
	private static void closeAndReport(BufferedInputStream in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Cull other files but .doc.
	 *
	 * <p>The selection is rebuilt from scratch on every call, so calling this
	 * method (or {@link #processDoc()}) repeatedly does not duplicate entries.
	 */
	public void parseFiles() {
		docFiles.clear();
		if (allFiles == null) {
			// Nothing to select from; presumably the index had no file list.
			return;
		}
		for (File candidate : allFiles) {
			if (candidate != null && DOC_NAME_PATTERN.matcher(candidate.getName()).matches()) {
				docFiles.add(candidate);
			}
		}
	}

	/**
	 * Initial the ModTimeIndex
	 * Map(modTime, Set[filename])
	 */
	private void initModTimeIndex() {
		// NOTE(review): DateFormat is not imported in this file and
		// getDateFormat() is not a java.text.DateFormat method, so this is
		// presumably a project-local helper in this package - confirm.
		DateFormat df = DateFormat.getDateFormat();
		HashMap<String, HashSet<File>> modTimeIndex = index.getModTimeIndex();
		for (File item : docFiles) {
			String dateString = df.format(item.lastModified());
			addToSetIndex(modTimeIndex, dateString, item);
		}
	}

	/**
	 * Initial the FileSizeIndex
	 * Map(size, Set[filename])
	 */
	private void initFileSizeIndex() {
		HashMap<Long, HashSet<File>> fileSizeIndex = index.getFileSizeIndex();
		for (File item : docFiles) {
			addToSetIndex(fileSizeIndex, Long.valueOf(item.length()), item);
		}
	}

	/**
	 * Adds {@code file} to the set stored under {@code key}, creating the set
	 * when the key is not present yet.
	 */
	private static <K> void addToSetIndex(HashMap<K, HashSet<File>> target, K key, File file) {
		HashSet<File> files = target.get(key);
		if (files == null) {
			files = new HashSet<File>();
			target.put(key, files);
		}
		files.add(file);
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -