📄 querylsi.cpp
字号:
// File: QueryLSI.cpp
// Purpose: LSI SDD project Query Processor
// Date written: October 2002
// Author: Jason Dowling (jdowling@pcug.org.au)
//
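// Inputs: TermDoc.sdd (output from SDDPACK)
// termCount.dat (output from Preparation program)
// termIndex.dat (output from Preparation program)
// MED_DOCS.REL (query number / relevant document number pairs)
// Query text is read from standard input.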
//
// Outputs:
// Displays ranked list of matching documents and precision measures
//
// Usage: This version is set up for the MEDLINE collection.
// Run this program from a batch file, passing the query number
// (between 1 and 30 for MEDLINE) after -q, e.g.:
// querylsi -q 1 < medQry1.txt > runDetails.log
//
// Please see my BComp(Hons) thesis for further details.
//
#include <time.h>
#include <iostream.h>
#include <fstream.h>
#include <cstdlib>
#include <ctype.h>
#include <iomanip.h>
#include "tree.h" // Search tree class
#include "string1.h" // String class
#include "matrix.h" // Matrix class
#include "list.h" // Linked List class
#include "porter.h" // Porter Stemmer
#include "math.h"
// Forward declarations of helper functions.
extern "C" int stem(char *word); // Porter stemmer (declared with C linkage)
// NOTE(review): main() calls this on a token and afterwards looks the term up
// through the original pointer, so it presumably upper-cases the buffer in
// place as well as returning it - confirm against the definition.
char* convertCase(char *term); // convert string to uppercase
void printDateTime(); // output date and time of the run
// NOTE(review): parameters are presumably (values, number of values) - confirm.
void sort (double [],int); // bubble sort
// NOTE(review): parameters are unnamed in this declaration; presumably computes
// the cosine between two matrices/vectors, with the two ints selecting
// rows/columns or sizes - confirm against the definition.
double calculateCosine(const Matrix<double>&, const Matrix<double>&, const int, const int );
int main(int argc, char* argv[])
{
String strTerm, strTest;
char term2[100];
int intNoDocs = 0; //total documents this collection
int indexNo = 0 ; //Stre current term's index value
List< String > strList; //Linked list for term index values
List< int > relList; //Linked list for hold relevance information
IRT_Porter *p; //Porter stemmer
p = new IRT_Porter; //Instantiate Porter stemmer
int intTermCount = 0 ; //Counter variable for looping
int intCurrentTerm = 0; //Counter variable for looping
int intCurrentRank = 0; //Counter variable for looping
int intValue =0; //Holds each element of integer matices Xk, Yk
int intCurrentQuery = 0; //Counter variable for looping
int intCurrentDoc=0; //Counter variable for looping
//open the relevant document file to calculate precision/recall
ifstream inRelFile("MED_DOCS.REL",ios::in);
if (!inRelFile){
cerr << "MED_DOCS.REL File could not be opened" << endl;
exit(1);
}
//Get the Query number we are processing. The user needs to
//pass the query value in as 'queryLSI -q x', where x is the
//query number
if (argv[2] > 0)
intCurrentQuery = atoi(argv[2]);
else
{
cout << "I cannot identify query number. Please enter [QueryLSI -q n], where n is the query number (between 1 and 30 for Medline)" << endl;
return 0;
}
cout << "Testing Medline Query No: " << intCurrentQuery << endl;
//Read in the Medline relevance file and insert values into
//linked list
int intRelDocTotal = 0;
int intQry = 0;
int intDoc = 0;
while (inRelFile >> intQry >> intDoc)
{
if (intQry == intCurrentQuery)
{ intRelDocTotal++;
relList.insertAtBack( intDoc );
}
}
inRelFile.close();
cout << "No of relevant docs for this query: " << intRelDocTotal << endl;
ifstream inSDDFile("TermDoc.sdd",ios::in);
if (!inSDDFile){
cerr << "TermDoc.sdd File could not be opened" << endl;
exit(1);
}
cout << "Medline SDD LSI engine results. Jason Dowling 2002" << endl;
printDateTime();
cout << "Reading in term document matrix (may take a few seconds)..." << endl;
char string[1000];
//ignore the first two comment lines in the sdd file
inSDDFile.getline(string,999,'\n');
inSDDFile.getline(string,999,'\n');
int intRank=0;
int intNoTerms=0;
intNoDocs=0;
double dblValue =0;
char tempDVal[27];
char charExp[3];
int intExp = 0;
//the third sdd file line contains rank, terms and documents
inSDDFile >> intRank >> intNoTerms >> intNoDocs;
cout << endl << "This matrix has: rank: " << intRank << " m:" << intNoTerms << " n:" << intNoDocs << endl;
Matrix<double> SDD_X(intNoTerms,intRank);
Matrix<double> SDD_D(intRank,intRank);
Matrix<double> SDD_Y(intNoDocs,intRank);
Matrix<double> SDD_Y_t(intRank, intNoDocs);
Matrix<double> SDD_X_t(intRank,intNoTerms);
int i, j = 1;
int intCheckValues = 0;
//read in the values for the Dk matrix. These values are formatted
//in scientific notation, and require a little processing. Note the
//Dk matrix entries are inserted diagonally.
i = 1;
cout << "Reading in Dk matrix" << endl;
for (i; i<= intRank; i++)
{ inSDDFile >> string;
for (int j=0;j<27;j++)
tempDVal[j] = string[j];
//extract exponent for this value
charExp[0] = string[29];
charExp[1] = string[30];
charExp[2] = string[31];
intExp = atoi(charExp);
dblValue = atof(tempDVal);
if (intExp>0)
dblValue = dblValue/pow(10,intExp);
SDD_D(i,i) = dblValue;
intCheckValues++;
dblValue = 0;
intExp = 0;
}
cout << "Read in " << intCheckValues << " values for Dk matrix. " << endl;
intCheckValues=0;
//Read in values for the Xk matrix - (Term x Rank)
//Each line is one column of entries in the matrix
intValue = 0;
cout << "Reading in X matrix" << endl;
for (i=1; i <= intRank; i++) {
for (j=1; j<=intNoTerms; j++)
{inSDDFile >> intValue;
intCheckValues++;
SDD_X(j,i) = intValue;
}
}
cout << "Read in " << intCheckValues << " values for Xk matrix." << endl;
intCheckValues=0;
//Read in values for the Y matrix (Rank * Docs)
//Each line is one column of entries in the matrix
cout << "Reading in Y matrix" << endl;
intValue = 0;
for (i=1; i <= intRank; i++)
for (j=1; j<=intNoDocs; j++)
{
inSDDFile >> intValue;
SDD_Y(j,i) = intValue;
intCheckValues++;
}
cout << "Resad in " << intCheckValues << " values for Yk matrix." << endl;
inSDDFile.close();
//Transpose X to get X_t which is required for query processing.
for (i=1; i <= intRank; i++)
for (j=1; j<=intNoTerms; j++)
if (SDD_X(j,i) != 0)
SDD_X_t(i,j) = SDD_X(j,i);
//transpose Y to get Y_t which is required for query processing.
for (i=1; i <= intRank; i++)
for (j=1; j<=intNoDocs; j++)
if (SDD_Y(j,i) != 0)
SDD_Y_t(i,j) = SDD_Y(j,i);
//The following three lines can be used to output the reconstructed
//rank k matrix A_k. This can be useful for testing and comparing the
//approximation with the original term document matrix (or SVD):
//Matrix<double> SDD_All(intNoTerms,intNoDocs);
//SDD_All = SDD_X * SDD_D * SDD_Y_t;
//SDD_All.saveSparseMatrix("testout.sdd");
//Read in the term index and insert values into linked list
cout << "Reading in the list of terms..." << endl;
ifstream inTermFile("termIndex.dat",ios::in);
if (!inTermFile){
cerr << "termIndex.dat could not be opened" << endl;
exit(1);
}
cout << "Reading termIndex.dat..." << endl;
while (inTermFile >> term2 >> indexNo)
{
strList.insertAtBack( term2 );
}
inTermFile.close();
//read in the number of documents in which term appears - this
//information is needed for global weighting.
int* intTermTotal = new int[intNoTerms+1];
//initialise array
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms; intCurrentTerm++)
intTermTotal[intCurrentTerm]=0;
ifstream inTermCountFile("termCount.dat",ios::in);
if (!inTermCountFile){
cerr << "termCount.dat file could not be opened" << endl;
exit(1); }
cout << "Reading termCount.dat..." << endl;
while (inTermCountFile >> intCurrentTerm >> intTermCount)
intTermTotal[intCurrentTerm]=intTermCount;
inTermCountFile.close();
//Please note that this program can be used interactively,
//however query strings are usually passed in from the command
//line. eg. 'queryLSI -q 1 < query1.txt > resultQ1.log'
//cout << endl << "Please enter query terms: " << endl;
char input_string[1000]; //max 1000 characters for query
char *tokenPtr;
cin.getline(input_string,1000);
cout << endl << "The entered text was: " << input_string << endl;
Matrix<double> SDD_Q(intNoTerms,1); //used as a weighted query vector
Matrix<double> rawQuery(intNoTerms,1);//query vector with raw term freq only.
tokenPtr = strtok(input_string, " "); //used to parse query terms
//parse the user entered query. Need to update to handle illegal chars.
int termID = 0;
while (tokenPtr!=NULL) {
termID = 0;
cout << tokenPtr << " ";
//p->Porter(tokenPtr); //Porter stemming.
strTerm = convertCase(tokenPtr); //make all terms uppercase
cout << tokenPtr << " ";
//see if this term can be matched with the term list
//if there is a match, increment the query vector
termID = strList.findTerm(tokenPtr);
if (termID > 0) {
cout << " TermNo is " << termID << endl;
rawQuery(termID, 1) += 1 ;
}
else
cout << " This term has been ignored " << endl;
tokenPtr = strtok(NULL, " ");
}
//logarithmic local weighting
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
SDD_Q(intCurrentTerm,1)=log10(1 + rawQuery(intCurrentTerm,1));
//global weighting routine - Inverse Document Frequency
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
if (rawQuery(intCurrentTerm,1)!=0)
{
SDD_Q(intCurrentTerm,1)=
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -