📄 invertedfile.java
字号:
package searchingEngine.dataPreprocessing.invertedFile;
import searchingEngine.utilites.dataConverter.RawConverter;
import searchingEngine.dataPreprocessing.wordPosition.*;
import java.io.*;
import java.util.*;
/**
 * Inverted index that maps each term to a {@code TermNode} whose document
 * list holds, per document, the normalised term frequency and the tf-idf weight.
 *
 * While the index is built, the squared tf-idf weights are also accumulated
 * per document id in {@code doc_vector_length_array}. No square root is taken
 * in this class, so the stored value is the squared vector norm.
 * NOTE(review): presumably the consumer applies sqrt when ranking - confirm.
 */
public class InvertedFile
{
    /*object variables*/
    /** Initial capacity passed to the term hash table. */
    private final int HMAX = 500000;
    /** Load factor passed to the term hash table. */
    private final float HLOAD = 0.5f;
    /** Corpus size: N in the idf formula and the length of the per-document array. */
    public static final int TOTAL_NUMBER_OF_FILES = 64813;
    //private final String TXT_DB_PATH = "combined.dat";
    /** Path handed to RawConverter.loadFileNodeAt to look up a document's length. */
    private final String FILE_TXT_PATH = "file.txt";
    private Hashtable<String,TermNode> inverted_file = null;
    private LinkedList<TermNode> txt_db = null;
    /** Sum of squared tf-idf weights, indexed by file id; null until an index is built. */
    private double[] doc_vector_length_array = null;

    /*constructor*/
    /**
     * Wraps an already built term table. The per-document array is left
     * unallocated, so {@link #getDocVectorLengthArray()} returns null until
     * {@link #buildInvertedFile(String)} is called on this instance.
     *
     * @param loaded term table to expose through {@link #getTable()}
     */
    @SuppressWarnings("unchecked") // raw parameter kept for source compatibility with existing callers
    public InvertedFile(Hashtable loaded) throws Exception
    {
        inverted_file = loaded;
    }

    /**
     * Builds the index from a text file and then prints every non-zero entry
     * of the per-document array to standard output.
     *
     * @param txt_db_path path of the input file, one term record per line
     * @throws Exception if reading or parsing the input fails
     */
    public InvertedFile(String txt_db_path) throws Exception
    {
        inverted_file = new Hashtable<String, TermNode>(HMAX, HLOAD);
        // Java zero-initialises new arrays, so no explicit clearing loop is needed.
        doc_vector_length_array = new double[TOTAL_NUMBER_OF_FILES];
        //txt_db = (LinkedList<TermNode>)RawConverter.loadObject(txt_db_path);
        buildInvertedFile(txt_db_path);
        for (int i = 0; i < doc_vector_length_array.length; i++)
        {
            if (doc_vector_length_array[i] > 0)
            {
                System.out.println(doc_vector_length_array[i]);
            }
        }
    }

    /**
     * Reads the input line by line, converts each line to a term record with
     * {@code CombineDocNodeByLine.loadTerm} and inserts it into the index.
     *
     * @param input path of the file to read
     * @throws Exception if the file cannot be read or a record cannot be inserted
     */
    public void buildInvertedFile(String input) throws Exception
    {
        // Instances created from a preloaded table have no accumulator yet;
        // allocate it here so insert() cannot hit a null array.
        if (doc_vector_length_array == null)
        {
            doc_vector_length_array = new double[TOTAL_NUMBER_OF_FILES];
        }
        BufferedReader br = new BufferedReader(new FileReader(input));
        try
        {
            CombineDocNodeByLine cbnl = new CombineDocNodeByLine("","","");
            String line;
            while ((line = br.readLine()) != null)
            {
                insert(cbnl.loadTerm(line));
            }
        }
        finally
        {
            // Close the reader even when parsing or insertion throws.
            br.close();
        }
    }

    /**
     * Converts one positional term record into a weighted one and stores it.
     * For every document in the record it computes
     * ntf = (number of positions) / (document length) and weight = ntf * idf,
     * adds weight^2 to the document's accumulator, and appends a DocNode
     * carrying ntf and weight to the new term's document list.
     *
     * @param term_node term with a list of DocNodeWpos entries
     * @throws IllegalArgumentException if the term has an empty document list
     */
    private void insert(TermNode term_node) throws Exception
    {
        //get term string as key
        String key = term_node.term;
        //get DocNodeWpos list
        LinkedList doc_list = term_node.doc_list;
        int df = doc_list.size();
        if (df == 0)
        {
            // The idf is undefined for a term that occurs in no document.
            throw new IllegalArgumentException("term '" + key + "' has an empty document list");
        }
        // Divide in floating point: int / int would truncate the ratio before
        // the natural logarithm is taken and distort every weight.
        double idf = Math.log((double) TOTAL_NUMBER_OF_FILES / (double) df);

        //create new doc_list for DocNode
        LinkedList doc_list_new = new LinkedList<DocNode>();
        //create new term_node without wpos_list
        TermNode term_node_new = new TermNode(key, doc_list_new);
        term_node_new.setIdf(idf);

        // Iterate rather than calling get(i): indexed access on a LinkedList
        // is linear per call, which made this loop quadratic.
        for (Object entry : doc_list)
        {
            DocNodeWpos doc_node_wpos = (DocNodeWpos) entry;
            //get wpos_list from DocNodeWpos
            LinkedList wpos_list = doc_node_wpos.wpos_list;
            //calc ntf
            // NOTE(review): a doclen of 0 would yield Infinity/NaN here - confirm it cannot occur.
            double ntf = (double) wpos_list.size()
                    / (double) (RawConverter.loadFileNodeAt(FILE_TXT_PATH, doc_node_wpos.fileid)).doclen;
            //calc tf-idf
            double term_doc_wt = ntf * idf;
            //add the weight^2 to doc_vector_length of the specific fileid
            doc_vector_length_array[doc_node_wpos.fileid] += (term_doc_wt * term_doc_wt);
            //create new doc_node
            DocNode doc_node = new DocNode(doc_node_wpos.fileid);
            //set ntf
            doc_node.setTf(ntf);
            //set tf-idf
            doc_node.setTermDocWt(term_doc_wt);
            //add the new doc_node to new doc_list
            doc_list_new.add(doc_node);
        }
        inverted_file.put(key, term_node_new);
    }

    /** @return the term table backing this index (the live object, not a copy) */
    public Hashtable<String,TermNode> getTable(){
        return inverted_file;
    }

    /**
     * Loads a term table through {@code RawConverter.loadObject}.
     *
     * @param path location handed to RawConverter.loadObject
     * @return the loaded object cast to a term table
     */
    @SuppressWarnings("unchecked") // the stored object's type cannot be checked at runtime
    public static Hashtable<String,TermNode> loadHashTable(String path)throws IOException{
        return (Hashtable<String,TermNode>)RawConverter.loadObject(path);
    }

    /**
     * Writes the array as text, one value per line, in index order.
     *
     * @param doc_vector_length_array values to write
     * @param output path of the file to create or overwrite
     * @throws IOException if writing fails
     */
    public static void saveVLArray(double[] doc_vector_length_array,String output)throws IOException
    {
        BufferedWriter bw = new BufferedWriter(new FileWriter(output));
        try
        {
            for (int i = 0; i < doc_vector_length_array.length; i++)
            {
                bw.write(Double.toString(doc_vector_length_array[i]));
                bw.newLine();
            }
        }
        finally
        {
            // Close (and flush) the writer even when a write fails.
            bw.close();
        }
    }

    /**
     * Reads an array written by {@link #saveVLArray(double[], String)}.
     * The result always has TOTAL_NUMBER_OF_FILES entries; entries beyond the
     * last line of the file stay 0.
     *
     * @param input path of the file to read
     * @return the parsed values, indexed by line number
     * @throws IOException if reading fails or the file has more lines than
     *         TOTAL_NUMBER_OF_FILES
     * @throws NumberFormatException if a line is not a parsable double
     */
    public static double[] loadVLArray(String input)throws IOException
    {
        double[] result = new double[TOTAL_NUMBER_OF_FILES];
        BufferedReader br = new BufferedReader(new FileReader(input));
        try
        {
            String line;
            int count = 0;
            while ((line = br.readLine()) != null)
            {
                if (count >= result.length)
                {
                    // Report oversize input clearly instead of failing with a bare index error.
                    throw new IOException(input + " holds more than " + result.length + " entries");
                }
                result[count] = Double.parseDouble(line);
                count++;
            }
        }
        finally
        {
            br.close();
        }
        return result;
    }

    /** @return the per-document accumulator (the live array, not a copy); may be null */
    public double[] getDocVectorLengthArray()
    {
        return doc_vector_length_array;
    }

    /**
     * Builds the index from "combineXX64.txt" and saves the term table to
     * "combInvertedFile.dat" and the per-document array to
     * "combDocVectorLengthArray.dat".
     */
    public static void main(String[] args) throws Exception{
        InvertedFile inv = new InvertedFile("combineXX64.txt");
        RawConverter.saveObject(inv.getTable(),"combInvertedFile.dat");
        InvertedFile.saveVLArray(inv.getDocVectorLengthArray(),"combDocVectorLengthArray.dat");
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -