// vectorspacemodel.java
package searchingEngine.VectorSpaceModel;
import java.io.*;
import java.util.*;
import searchingEngine.dataPreprocessing.invertedFile.*;
import searchingEngine.utilites.dataConverter.RawConverter;
/**
 * Vector-space-model retrieval over a prebuilt inverted file.
 *
 * Scores documents against a bag-of-words query by accumulating
 * inner products of tf-idf weights (Wij * Wiq); optionally normalizes
 * to cosine similarity using precomputed document vector lengths.
 */
public class VectorSpaceModel
{
    private InvertedFile hash_table;                 // term -> TermNode (posting list + idf)
    private String docNameList[];                    // fileid -> document name
    private double[] doc_vector_length_array;        // fileid -> squared document vector length

    /**
     * @param hash_table              inverted file mapping term -> TermNode
     * @param docNameList             document names indexed by fileid
     * @param doc_vector_length_array squared vector length per fileid (used for cosine normalization)
     */
    public VectorSpaceModel(InvertedFile hash_table, String[] docNameList, double[] doc_vector_length_array) throws Exception
    {
        this.hash_table = hash_table;
        this.docNameList = docNameList;
        this.doc_vector_length_array = doc_vector_length_array;
    }

    /**
     * Retrieves documents matching the query terms, scored by the vector model.
     *
     * @param queryTerms       query tokens (duplicates allowed; counted as term frequency)
     * @param cosineSimilarity if true, scores are cosine similarities; otherwise raw inner products
     * @return matched documents sorted in descending score order
     *         (ordering defined by MatchedDocNode.compareTo)
     */
    public LinkedList<MatchedDocNode> retrieveDocs(String[] queryTerms, boolean cosineSimilarity)
    {
        double queryVectorLength = 0;   // sum of squared query term weights (squared query norm)
        /** hardcode the hashtable size for storing matched documents... **/
        Hashtable<Integer, MatchedDocNode> matchedDocs =
                new Hashtable<Integer, MatchedDocNode>(10000, 0.5f); // should be (#doc/2, 0.5)

        /** count the frequency of each unique query term **/
        Hashtable<String, Integer> queryTermFreqTable =
                new Hashtable<String, Integer>(queryTerms.length * 4 + 1, 0.25f);
        for (int i = 0; i < queryTerms.length; i++)
        {
            Integer freq = queryTermFreqTable.get(queryTerms[i]);
            queryTermFreqTable.put(queryTerms[i],
                    (freq == null) ? Integer.valueOf(1) : Integer.valueOf(freq.intValue() + 1));
        }

        /** accumulate per-document scores, one unique query term at a time **/
        for (Enumeration<String> e = queryTermFreqTable.keys(); e.hasMoreElements(); )
        {
            String queryTerm = e.nextElement();
            int queryTermFreq = queryTermFreqTable.get(queryTerm).intValue();
            // normalized tf of the term within the query
            double normQueryTermFreq = (double) queryTermFreq / (double) queryTerms.length;

            TermNode tNode = (TermNode) hash_table.getTable().get(queryTerm);
            // BUG FIX: the original printed tNode.term BEFORE this null check,
            // throwing NullPointerException for any term absent from the index.
            if (tNode != null)
            {
                double idf = tNode.getIdf();
                double queryTermWeight = normQueryTermFreq * idf; // Wiq
                // accumulate squared query norm (used only for cosine normalization)
                queryVectorLength += queryTermWeight * queryTermWeight;

                LinkedList<DocNode> docList = tNode.doc_list;
                /** inner product: sum Wij * Wiq over shared terms **/
                for (int i = 0; i < docList.size(); i++)
                {
                    DocNode doc = docList.get(i);
                    double innerTermWeight = doc.getTermDocWt() * queryTermWeight; // Wij * Wiq
                    Integer fileKey = Integer.valueOf(doc.fileid);
                    MatchedDocNode matchedDoc = matchedDocs.get(fileKey);
                    if (matchedDoc != null)
                    {
                        // already matched on another term: add this term's contribution
                        matchedDoc.setScore(matchedDoc.getScore() + innerTermWeight);
                    }
                    else
                    {
                        // first match for this document
                        matchedDoc = new MatchedDocNode(doc);
                        matchedDoc.setScore(innerTermWeight);
                        matchedDoc.setTermDocWt(doc.getTermDocWt());
                        matchedDocs.put(fileKey, matchedDoc);
                    }
                }
            }
        }

        // loop-invariant: query vector norm (originally recomputed per document)
        double queryVectorNorm = Math.sqrt(queryVectorLength);
        LinkedList<MatchedDocNode> resultList = new LinkedList<MatchedDocNode>();
        for (Enumeration<MatchedDocNode> e = matchedDocs.elements(); e.hasMoreElements(); )
        {
            MatchedDocNode matched = e.nextElement();
            if (cosineSimilarity)
            {
                // cos = innerProduct / (|q| * |d|); doc_vector_length_array holds |d|^2
                matched.setScore(matched.getScore()
                        / (queryVectorNorm * Math.sqrt(doc_vector_length_array[matched.fileid])));
            }
            resultList.add(matched);
        }

        /** in descending order (refer to compareTo method of MatchedDocNode.java) **/
        Collections.sort(resultList);
        return resultList;
    }

    public static void main(String[] args) throws Exception
    {
        InvertedFile invertedFile = new InvertedFile(InvertedFile.loadHashTable("invertedFile.dat"));
        String docNameList[] = (String[]) RawConverter.loadObject("fileNameList.dat");
        double[] doc_vector_length_array = (double[]) InvertedFile.loadVLArray("combDocVectorLengthArray.dat");
        VectorSpaceModel vsm = new VectorSpaceModel(invertedFile, docNameList, doc_vector_length_array);

        String[] query = {"subsoiL", "subsoiL", "opportunisT"};
        LinkedList<MatchedDocNode> result = vsm.retrieveDocs(query, true);
        System.out.println("==============================");
        for (int i = 0; i < result.size(); i++)
        {
            System.out.println("file: " + result.get(i).fileid + "\tscore: " + result.get(i).getScore());
        }
    }
}
// (removed: code-hosting site UI residue — keyboard-shortcut help text, not part of the source)