⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 querylsi.cpp

📁 根据潜在语义分析进行查询。将文本中的特征集合做LSI变换。
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//	File: QueryLSI.cpp
//	Purpose: LSI SDD project Query Processor  
//	Date written: October 2002
//	Author: Jason Dowling (jdowling@pcug.org.au)
//
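//  Inputs: TermDoc.sdd (output from SDDPACK)
//          termCount.dat (output from Preparation program)
//          termIndex.dat (output from Preparation program)
//          MED_DOCS.REL (relevance file: query no. / document no. pairs)
//          Query text, read as one line from standard input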
//
//	Outputs:
//			Displays ranked list of matching documents and precision measures
//	
//	Usage:  This version is set up for the MEDLINE collection.
//			Run this program in a batch file with the query no., e.g.:
//			querylsi -q 1 < medQry1.txt > runDetails.log
//	
//	Please see my BComp(Hons) thesis for further details.
//
#include <time.h>
#include <iostream.h>
#include <fstream.h>
#include <cstdlib>
#include <ctype.h>
#include <iomanip.h>
#include "tree.h"    // Search tree class 
#include "string1.h" // String class  
#include "matrix.h"  // Matrix class 	 
#include "list.h"    // Linked List class
#include "porter.h"  // Porter Stemmer 
#include "math.h"	 	


// Forward declarations of helpers used by main(). Their definitions are not
// in this part of the file.

// Porter stemmer entry point, declared with C linkage.
extern "C" int stem(char *word);  //Porter stemmer

// Converts a query term to uppercase. main() calls this on each query token
// before looking the token up in the term list.
// NOTE(review): main() keeps using the original buffer after the call, so this
// presumably uppercases `term` in place and returns it -- confirm against the definition.
char* convertCase(char *term);    // convert string to uppercase

// Writes the date and time of the current run to the output.
void printDateTime();             // output date time of run

// Sorts an array of doubles; the int argument is presumably the element
// count -- TODO confirm, along with the sort direction.
void sort (double [],int);        // bubble sort 

// NOTE(review): presumably returns the cosine similarity between a vector taken
// from each matrix, with the two ints selecting which ones -- the definition is
// not visible here, so confirm the meaning and base (0 or 1) of the indices.
double calculateCosine(const Matrix<double>&, const Matrix<double>&, const int, const int );

int main(int argc, char* argv[])
{

	String strTerm, strTest;    
	char term2[100];            
	int intNoDocs = 0;          //total documents this collection
	int indexNo = 0 ;           //Stre current term's index value
	List< String > strList;     //Linked list for term index values
	List< int > relList;		//Linked list for hold relevance information
	IRT_Porter *p;				//Porter stemmer
	p = new IRT_Porter;			//Instantiate Porter stemmer
	int intTermCount = 0 ;		//Counter variable for looping
	int intCurrentTerm = 0;		//Counter variable for looping
    int intCurrentRank = 0;		//Counter variable for looping
	int intValue =0;            //Holds each element of integer matices Xk, Yk 
	int intCurrentQuery = 0;	//Counter variable for looping
	int intCurrentDoc=0;		//Counter variable for looping

	//open the relevant document file to calculate precision/recall
	ifstream inRelFile("MED_DOCS.REL",ios::in);
	if (!inRelFile){
		cerr << "MED_DOCS.REL File could not be opened"  << endl;
		exit(1);
	}

	//Get the Query number we are processing.  The user needs to 
	//pass the query value in as 'queryLSI -q x', where x is the 
	//query number
    if (argv[2] > 0)
		intCurrentQuery = atoi(argv[2]);
	else
	{
		cout << "I cannot identify query number.  Please enter [QueryLSI -q n], where n is the query number (between 1 and 30 for Medline)" << endl; 
		return 0;
	}
	cout << "Testing Medline Query No: " << intCurrentQuery  << endl;

	//Read in the Medline relevance file and insert values into
	//linked list
	int intRelDocTotal = 0;
	int intQry = 0;
	int intDoc = 0;
	while (inRelFile >> intQry >> intDoc)
	{
		if (intQry == intCurrentQuery) 
		{	intRelDocTotal++;
			relList.insertAtBack( intDoc );
		}
	}

	inRelFile.close();	
	cout << "No of relevant docs for this query: " << intRelDocTotal << endl;
	
	ifstream inSDDFile("TermDoc.sdd",ios::in);
	if (!inSDDFile){
		cerr << "TermDoc.sdd File could not be opened"  << endl;
		exit(1);
	}

	cout << "Medline SDD LSI engine results.  Jason Dowling 2002" << endl;
	printDateTime();
	cout << "Reading in term document matrix (may take a few seconds)..." << endl;
	char string[1000];
	
	//ignore the first two comment lines in the sdd file
	inSDDFile.getline(string,999,'\n');
	inSDDFile.getline(string,999,'\n');
	
	int intRank=0;
	int intNoTerms=0;
	intNoDocs=0;
	double dblValue =0;
	char tempDVal[27];
	char charExp[3];
	int intExp = 0;

	//the third sdd file line contains rank, terms and documents
	inSDDFile >> intRank >> intNoTerms >> intNoDocs;
  	cout << endl << "This matrix has: rank: " << intRank << " m:" << intNoTerms << " n:" << intNoDocs << endl;

	Matrix<double> SDD_X(intNoTerms,intRank);
	Matrix<double> SDD_D(intRank,intRank);
	Matrix<double> SDD_Y(intNoDocs,intRank);
	Matrix<double> SDD_Y_t(intRank, intNoDocs);
	Matrix<double> SDD_X_t(intRank,intNoTerms);

	int i, j = 1;
	int intCheckValues = 0;
	//read in the values for the Dk matrix.  These values are formatted
	//in scientific notation, and require a little processing.  Note the
	//Dk matrix entries are inserted diagonally.
	i = 1;
	cout << "Reading in Dk matrix" << endl;
	for (i; i<= intRank; i++)
		{   inSDDFile >> string;
			for (int j=0;j<27;j++)
				tempDVal[j] = string[j];
			//extract exponent for this value
			charExp[0] = string[29];
			charExp[1] = string[30];
			charExp[2] = string[31];
			intExp = atoi(charExp);
				dblValue = atof(tempDVal);
			if (intExp>0) 
				dblValue = dblValue/pow(10,intExp);
			SDD_D(i,i) = dblValue;
			intCheckValues++;
			dblValue = 0;
			intExp = 0;
		}
	cout << "Read in " << intCheckValues << " values for Dk matrix. "  << endl;
	intCheckValues=0;

	//Read in values for the Xk matrix - (Term x Rank)
	//Each line is one column of entries in the matrix
	intValue = 0;
	cout << "Reading in X matrix" << endl;
	for (i=1; i <= intRank; i++) {
		for (j=1; j<=intNoTerms; j++)
			{inSDDFile >> intValue;
			intCheckValues++;	
			SDD_X(j,i) = intValue;	
		}
	}
	cout << "Read in " << intCheckValues << " values for Xk matrix."  << endl;
	intCheckValues=0;

	//Read in values for the Y matrix  (Rank * Docs)
	//Each line is one column of entries in the matrix
	cout << "Reading in Y matrix" << endl;
	intValue = 0;
	for (i=1; i <= intRank; i++)
		for (j=1; j<=intNoDocs; j++)
		{
		inSDDFile >> intValue;
		SDD_Y(j,i) = intValue;
		intCheckValues++;	
		}

	cout << "Resad in " << intCheckValues << " values for Yk matrix."  << endl;
	inSDDFile.close();	
	
	//Transpose X to get X_t which is required for query processing. 
	for (i=1; i <= intRank; i++) 
		for (j=1; j<=intNoTerms; j++)
			if 	(SDD_X(j,i) != 0)
				SDD_X_t(i,j) = SDD_X(j,i);

	//transpose Y to get Y_t which is required for query processing. 
	for (i=1; i <= intRank; i++) 
		for (j=1; j<=intNoDocs; j++)
			if 	(SDD_Y(j,i) != 0)
				SDD_Y_t(i,j) = SDD_Y(j,i);

	//The following three lines can be used to output the reconstructed
	//rank k matrix A_k.  This can be useful for testing and comparing the 
    //approximation with the original term document matrix (or SVD):
	//Matrix<double> SDD_All(intNoTerms,intNoDocs);
	//SDD_All = SDD_X * SDD_D * SDD_Y_t;
	//SDD_All.saveSparseMatrix("testout.sdd");

	//Read in the term index and insert values into linked list
	cout << "Reading in the list of terms..." << endl;   
	ifstream inTermFile("termIndex.dat",ios::in);
	if (!inTermFile){
		cerr << "termIndex.dat could not be opened"  << endl;
		exit(1);
	}
	
	cout << "Reading termIndex.dat..." << endl;
	while (inTermFile >> term2 >> indexNo)
	{
		strList.insertAtBack( term2 );
	}
    inTermFile.close();	

	//read in the number of documents in which term appears - this 
	//information is needed for global weighting.  
	int* intTermTotal = new int[intNoTerms+1];
	//initialise array
	for (intCurrentTerm=1;intCurrentTerm<=intNoTerms; intCurrentTerm++)
		intTermTotal[intCurrentTerm]=0;

	ifstream inTermCountFile("termCount.dat",ios::in); 
	if (!inTermCountFile){
		cerr << "termCount.dat file could not be opened"  << endl;
		exit(1); }
	
	cout << "Reading termCount.dat..." << endl;
	while (inTermCountFile >> intCurrentTerm >> intTermCount)
		intTermTotal[intCurrentTerm]=intTermCount;

	inTermCountFile.close();	
	
	//Please note that this program can be used interactively,
	//however query strings are usually passed in from the command
	//line.  eg. 'queryLSI -q 1 < query1.txt > resultQ1.log'
	//cout << endl << "Please enter query terms: " << endl;
	
	char input_string[1000];    //max 1000 characters for query 
	char *tokenPtr;
	cin.getline(input_string,1000);
	cout << endl << "The entered text was: " << input_string << endl;

	Matrix<double> SDD_Q(intNoTerms,1);    //used as a weighted query vector
	Matrix<double> 	rawQuery(intNoTerms,1);//query vector with raw term freq only.	
	tokenPtr = strtok(input_string, " ");  //used to parse query terms
	
	//parse the user entered query.  Need to update to handle illegal chars.
	int termID = 0;
	while (tokenPtr!=NULL) {
		termID = 0;
		cout << tokenPtr << " ";
		//p->Porter(tokenPtr);   //Porter stemming.
		strTerm = convertCase(tokenPtr);  //make all terms uppercase
		cout << tokenPtr << " ";
		
		//see if this term can be matched with the term list
		//if there is a match, increment the query vector
		termID = strList.findTerm(tokenPtr);
		if (termID > 0) {
			cout << " TermNo is " << termID << endl;
			rawQuery(termID,   1) += 1 ;
		}
		else
			cout << " This term has been ignored " << endl;

		tokenPtr = strtok(NULL, " ");
	}


	//logarithmic local weighting 
	for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
		SDD_Q(intCurrentTerm,1)=log10(1 + rawQuery(intCurrentTerm,1));

	//global weighting routine - Inverse Document Frequency
	for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
			if (rawQuery(intCurrentTerm,1)!=0)
			{
				  SDD_Q(intCurrentTerm,1)=

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -