⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 invertedfile.java

📁 自己写的search engine, 有 boolean search, fuzzy search
💻 JAVA
字号:
package searchingEngine.dataPreprocessing.invertedFile;

import searchingEngine.utilites.dataConverter.RawConverter;
import searchingEngine.dataPreprocessing.wordPosition.*;

import java.io.*;
import java.util.*;

/**
 * In-memory inverted file: a hashtable from term string to a {@code TermNode}
 * carrying the term's idf and a list of {@code DocNode} entries (file id,
 * normalised term frequency and tf-idf weight).
 *
 * <p>While the table is built, the class also accumulates, per file id, the
 * sum of the squared tf-idf weights of every term seen in that file. Note that
 * the stored value is the sum of squares; no square root is taken here.
 */
public class InvertedFile 
{
	/** Initial capacity used for the term hashtable. */
	private static final int HMAX = 500000;
	/** Load factor used for the term hashtable. */
	private static final float HLOAD = 0.5f;
	/** Document count used in the idf formula and as the size of the per-document array. */
	public static final int TOTAL_NUMBER_OF_FILES = 64813;
	/** Path handed to {@code RawConverter.loadFileNodeAt} to look up a document's length. */
	private static final String FILE_TXT_PATH = "file.txt";

	/** term -> TermNode (idf + DocNode list). */
	private Hashtable<String,TermNode> inverted_file = null;
	/** Per file id: sum of squared tf-idf weights; null until a build has started. */
	private double[] doc_vector_length_array = null;

	/**
	 * Wraps an already built table. No per-document array is created, so
	 * {@link #getDocVectorLengthArray()} returns {@code null} until
	 * {@link #buildInvertedFile(String)} is called on this instance.
	 *
	 * @param loaded previously built term table
	 */
	@SuppressWarnings({"rawtypes", "unchecked"})
	public InvertedFile(Hashtable loaded) throws Exception 
	{
		inverted_file = loaded;
	}
	
	/**
	 * Builds the table from a text file and then prints every non-zero entry of
	 * the per-document array to standard output.
	 *
	 * @param txt_db_path file to read, one term record per line
	 */
	public InvertedFile(String txt_db_path) throws Exception 
	{
		inverted_file = new Hashtable<String, TermNode>(HMAX, HLOAD);
		// A freshly allocated double[] is already filled with 0.0.
		doc_vector_length_array = new double[TOTAL_NUMBER_OF_FILES];
		buildInvertedFile(txt_db_path);
		for (int i = 0; i < doc_vector_length_array.length; i++) 
		{
			if (doc_vector_length_array[i] > 0)
				System.out.println(doc_vector_length_array[i]);
		}
	}

	/**
	 * Reads {@code input} line by line, converts each line to a term record with
	 * {@code CombineDocNodeByLine.loadTerm} and inserts it into the table.
	 *
	 * @param input path of the text file to read
	 * @throws Exception on I/O failure or if a record cannot be inserted
	 */
	public void buildInvertedFile(String input) throws Exception 
	{
		BufferedReader br = new BufferedReader(new FileReader(input));
		try 
		{
			CombineDocNodeByLine cbnl = new CombineDocNodeByLine("","","");
			String line;
			while ((line = br.readLine()) != null) 
			{
				insert(cbnl.loadTerm(line));
			}
		} 
		finally 
		{
			// Close the reader even when parsing or inserting fails.
			br.close();
		}
	}

	/**
	 * Converts one term record (documents with word positions) into a table
	 * entry without word positions, computing for every document
	 * ntf = occurrences / doclen and weight = ntf * idf, where
	 * idf = ln(TOTAL_NUMBER_OF_FILES / df) and df is the length of the
	 * record's document list.
	 *
	 * @param term_node record whose doc_list holds DocNodeWpos entries
	 * @throws IllegalArgumentException if the record has an empty document list
	 */
	@SuppressWarnings({"rawtypes", "unchecked"})
	private void insert(TermNode term_node) throws Exception 
	{
		//term string is the hashtable key
		String key = term_node.term;
		//list of DocNodeWpos
		LinkedList doc_list = term_node.doc_list;
		int df = doc_list.size();
		if (df == 0) 
		{
			throw new IllegalArgumentException(
				"term \"" + key + "\" has an empty document list; idf is undefined");
		}
		// Divide as double: int / int would truncate the ratio before the
		// logarithm and collapse idf to 0 for every term with df > N/2.
		double idf = Math.log((double) TOTAL_NUMBER_OF_FILES / df);

		// Instances created from a loaded table have no array yet.
		if (doc_vector_length_array == null) 
		{
			doc_vector_length_array = new double[TOTAL_NUMBER_OF_FILES];
		}

		//new doc list holding DocNode (no word positions)
		LinkedList doc_list_new = new LinkedList<DocNode>();
		TermNode term_node_new = new TermNode(key, doc_list_new);
		term_node_new.setIdf(idf);

		// Iterate instead of calling get(i): indexed access on a LinkedList is
		// linear, which made the original loop quadratic in df.
		for (Object entry : doc_list) 
		{
			DocNodeWpos doc_node_wpos = (DocNodeWpos) entry;
			LinkedList wpos_list = doc_node_wpos.wpos_list;
			//normalised term frequency: occurrences in this doc / doc length
			double ntf = (double) wpos_list.size() / (double) (RawConverter.loadFileNodeAt(FILE_TXT_PATH, doc_node_wpos.fileid)).doclen;
			//tf-idf
			double term_doc_wt = ntf * idf;

			//add weight^2 to the accumulator of this file id
			doc_vector_length_array[doc_node_wpos.fileid] += (term_doc_wt * term_doc_wt);

			DocNode doc_node = new DocNode(doc_node_wpos.fileid);
			doc_node.setTf(ntf);
			doc_node.setTermDocWt(term_doc_wt);
			doc_list_new.add(doc_node);
		}

		inverted_file.put(key, term_node_new);
	}
	
	/** @return the underlying term table (not a copy) */
	public Hashtable<String,TermNode> getTable(){
		return inverted_file;
	}
	
	/**
	 * Loads a term table through {@code RawConverter.loadObject}.
	 *
	 * @param path file to load from
	 * @return the loaded object cast to a term table
	 */
	@SuppressWarnings("unchecked")
	public static Hashtable<String,TermNode> loadHashTable(String path)throws IOException{
		return (Hashtable<String,TermNode>)RawConverter.loadObject(path);
	}

	/**
	 * Writes the array as text, one value per line.
	 *
	 * @param doc_vector_length_array values to write
	 * @param output path of the file to create or overwrite
	 * @throws IOException if the file cannot be written
	 */
	public static void saveVLArray(double[] doc_vector_length_array,String output)throws IOException
	{
		BufferedWriter bw = new BufferedWriter(new FileWriter(output));
		try 
		{
			for (int i = 0 ; i< doc_vector_length_array.length; i++) 
			{
				bw.write("" + doc_vector_length_array[i]);
				bw.newLine();
			}
		} 
		finally 
		{
			bw.close();
		}
	}

	/**
	 * Reads an array written by {@link #saveVLArray(double[], String)}.
	 *
	 * @param input path of the file to read, one value per line
	 * @return array of length {@link #TOTAL_NUMBER_OF_FILES}; slots beyond the
	 *         last line of the file stay 0.0
	 * @throws IOException if the file cannot be read or holds more than
	 *         {@link #TOTAL_NUMBER_OF_FILES} lines
	 * @throws NumberFormatException if a line is not a parsable double
	 */
	public static double[] loadVLArray(String input)throws IOException
	{
		double[] result = new double[TOTAL_NUMBER_OF_FILES];
		BufferedReader br = new BufferedReader(new FileReader(input));
		try 
		{
			String line;
			int count = 0;
			while ((line = br.readLine()) != null) 
			{
				if (count >= result.length) 
				{
					// Fail with a clear message instead of an index-out-of-bounds error.
					throw new IOException(input + " holds more than "
						+ TOTAL_NUMBER_OF_FILES + " entries");
				}
				result[count] = Double.parseDouble(line);
				count++;
			}
		} 
		finally 
		{
			br.close();
		}
		return result;
	}

	/** @return the per-document sum-of-squared-weights array (not a copy); may be null */
	public double[] getDocVectorLengthArray() 
	{
		return doc_vector_length_array;
	}

	/**
	 * Builds the table from "combineXX64.txt" and saves the table to
	 * "combInvertedFile.dat" and the per-document array to
	 * "combDocVectorLengthArray.dat".
	 */
	public static void main(String[] args) throws Exception{
		InvertedFile inv = new InvertedFile("combineXX64.txt");
		RawConverter.saveObject(inv.getTable(),"combInvertedFile.dat");
		InvertedFile.saveVLArray(inv.getDocVectorLengthArray(),"combDocVectorLengthArray.dat");
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -