⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vectorspacemodel.java

📁 自己写的search engine, 有 boolean search, fuzzy search
💻 JAVA
字号:
package searchingEngine.VectorSpaceModel;

import java.io.*;
import java.util.*;

import searchingEngine.dataPreprocessing.invertedFile.*;
import searchingEngine.utilites.dataConverter.RawConverter;

/**
 * Scores documents against a bag-of-words query using an inverted file.
 *
 * <p>For every distinct query term found in the inverted file, the query-side weight is
 * {@code (term frequency in query / number of query terms) * idf}; each document in the
 * term's posting list accumulates {@code documentTermWeight * queryTermWeight}. When cosine
 * similarity is requested, that inner product is divided by the product of the query norm
 * and the document norm.
 */
public class VectorSpaceModel
{
	private InvertedFile hash_table;
	private String docNameList[];
	/**
	 * Indexed by file id. The value is square-rooted before it is used as a norm, so the
	 * entries are presumably squared vector lengths -- confirm against the code that
	 * produces the array.
	 */
	private double[] doc_vector_length_array;

	/**
	 * Stores the supplied index structures; nothing is copied or validated here.
	 *
	 * @param hash_table              inverted file used to look up query terms
	 * @param docNameList             document names (kept but not read by this class)
	 * @param doc_vector_length_array per-document vector length values, indexed by file id
	 * @throws Exception never thrown by this body; retained so existing callers still compile
	 */
	public VectorSpaceModel(InvertedFile hash_table, String[] docNameList, double[] doc_vector_length_array) throws Exception
	{
		this.hash_table = hash_table;
		this.docNameList = docNameList;
		this.doc_vector_length_array = doc_vector_length_array;
	}

	/**
	 * Retrieves and ranks every document that contains at least one of the query terms.
	 *
	 * <p>Query terms that are absent from the inverted file are skipped. Progress and
	 * intermediate values are printed to standard output.
	 *
	 * @param queryTerms       query words; duplicates raise that term's frequency. Must not be
	 *                         {@code null} and must not contain {@code null} elements
	 *                         ({@link Hashtable} rejects null keys).
	 * @param cosineSimilarity {@code true} to normalise each score into a cosine similarity,
	 *                         {@code false} to return the raw inner product
	 * @return the matched documents, sorted with {@link Collections#sort(java.util.List)} using
	 *         {@code MatchedDocNode}'s natural ordering (per the original author's note this
	 *         is descending by score); empty when nothing matches
	 */
	public LinkedList<MatchedDocNode> retrieveDocs(String[] queryTerms, boolean cosineSimilarity)
	{
		// Accumulates the SQUARED query vector length; the square root is taken once below.
		double queryVectorLength = 0;

		/** hardcoded hashtable size for storing matched documents; ideally (#doc/2, 0.5) **/
		Hashtable<Integer, MatchedDocNode> matchedDocs = new Hashtable<Integer, MatchedDocNode>(10000, (float)0.5);

		/** frequency of each distinct query term **/
		Hashtable<String, Integer> queryTermFreqTable = new Hashtable<String, Integer>((queryTerms.length*4), (float)0.25);

		for (int i=0; i<queryTerms.length; i++)
		{
			System.out.println(i+". "+queryTerms[i]);
			/** add the term with frequency 1, or bump the frequency if already present **/
			Integer seen = queryTermFreqTable.get(queryTerms[i]);
			int updatedFreq = (seen == null) ? 1 : seen.intValue()+1;
			queryTermFreqTable.put(queryTerms[i], Integer.valueOf(updatedFreq));
			System.out.println("entryset: "+queryTermFreqTable.entrySet());
		}

		/** Accumulate, per distinct query term, the score contribution for each document **/
		for (Map.Entry<String, Integer> entry : queryTermFreqTable.entrySet())
		{
			String queryTerm = entry.getKey();
			int queryTermFreq = entry.getValue().intValue();
			double normQueryTermFreq = (double) queryTermFreq / (double) queryTerms.length;

			TermNode tNode = (TermNode)hash_table.getTable().get(queryTerm);
			if (tNode == null)
			{
				// The term is not indexed: it contributes nothing. (Previously the debug
				// print below dereferenced tNode before this check and threw a
				// NullPointerException for any unknown query word.)
				continue;
			}
			System.out.println("tNode: "+tNode.term);

			double idf = tNode.getIdf();
			System.out.println("idf = "+idf);
			double queryTermWeight = normQueryTermFreq * idf;

			// Only needed for the cosine normalisation further down.
			queryVectorLength += (queryTermWeight * queryTermWeight);

			LinkedList<DocNode> docList = tNode.doc_list;
			/** Calculate inner product **/
			// Iterate rather than index: get(i) on a LinkedList walks the list every call.
			for (DocNode doc : docList)
			{
				double innerTermWeight = doc.getTermDocWt() * queryTermWeight;	// Wij * Wiq

				System.out.println("doc = "+doc.fileid+"\tinner term weight = "+innerTermWeight);

				Integer fileKey = Integer.valueOf(doc.fileid);
				MatchedDocNode matchedDoc = matchedDocs.get(fileKey);
				if (matchedDoc != null)
				{
					/** document already matched by an earlier term: add to its running sum **/
					matchedDoc.setScore(matchedDoc.getScore()+innerTermWeight);
				}
				else
				{
					/** first match for this document **/
					matchedDoc = new MatchedDocNode(doc);
					matchedDoc.setScore(innerTermWeight);
					matchedDoc.setTermDocWt(doc.getTermDocWt());
					matchedDocs.put(fileKey, matchedDoc);
				}
			}
		}

		System.out.println("queryVectorLength = "+queryVectorLength);

		double queryNorm = Math.sqrt(queryVectorLength);
		LinkedList<MatchedDocNode> resultList = new LinkedList<MatchedDocNode>();
		for (Enumeration<MatchedDocNode> e=matchedDocs.elements(); e.hasMoreElements(); )
		{
			MatchedDocNode matchedDoc = e.nextElement();
			resultList.add(matchedDoc);
			// always print the raw inner-product score
			System.out.println("....doc = "+matchedDoc.fileid+"\tscore = "+matchedDoc.getScore());
			if (cosineSimilarity)
			{
				double denominator = queryNorm * Math.sqrt((double) doc_vector_length_array[matchedDoc.fileid]);
				// A zero denominator (e.g. every matched term has idf 0) used to produce
				// NaN/Infinity scores, which then went into the sort. Treat the
				// similarity of a zero-length vector as 0 instead.
				double cosine = (denominator == 0) ? 0 : matchedDoc.getScore() / denominator;
				System.out.println("cos = "+cosine);
				matchedDoc.setScore(cosine);
			}
		}

		/** ordering comes from MatchedDocNode.compareTo **/
		Collections.sort(resultList);

		return resultList;
	}

	/**
	 * Manual smoke test: loads the serialized index files from the working directory, runs a
	 * fixed query with cosine similarity enabled and prints the ranked file ids and scores.
	 *
	 * @param args unused
	 * @throws Exception if loading any of the data files fails
	 */
	public static void main(String[] args) throws Exception
	{
		InvertedFile invertedFile = new InvertedFile(InvertedFile.loadHashTable("invertedFile.dat"));
		String docNameList[] = (String[])RawConverter.loadObject("fileNameList.dat");
		double[] doc_vector_length_array = (double[])InvertedFile.loadVLArray("combDocVectorLengthArray.dat");
		VectorSpaceModel vsm = new VectorSpaceModel(invertedFile, docNameList, doc_vector_length_array);
		String[] query = {"subsoiL", "subsoiL", "opportunisT"};
		LinkedList<MatchedDocNode> result = vsm.retrieveDocs(query, true);
		System.out.println("==============================");
		for (MatchedDocNode hit : result)
		{
			System.out.println("file: "+hit.fileid+"\tscore: "+hit.getScore());
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -