📄 querylsi.cpp
字号:
(SDD_Q(intCurrentTerm,1)*((log10((float)intNoDocs/intTermTotal[intCurrentTerm]))));
}
double cosine = 0.0; //similarity result
int intRankSize=1033; //how many documents do we want in query results
double* dblResults = new double[intRankSize]; //ranked list of top cosine values
int* intResultDoc = new int[intRankSize]; //ranked list of top documents
double dblThreshold=-100.0; //any cosine less than this value will not be displayed
int intMatch = 0; // contains array index where new value needs to be entered
//initialise ranking arrays
for (i = 0; i<(intRankSize-1); i++)
{
dblResults[i]=0.000000;
intResultDoc[i]=0;
}
Matrix<double> SDD_A(intRank, intNoDocs);
Matrix<double> SDD_newQ(intRank,1);
Matrix<double> SDD_newQ_t(1,intRank);
Matrix<double> SDD_Cosine(intNoDocs,1);
Matrix<double> SDD_Result(1,1);
//first matrix required to compute similarity with query vector
SDD_A = SDD_D * SDD_Y_t;
//normalize Matrix SDD_A (intRank x intNoDocs)
double dblSumTermFreq=0;
double dblNormValue=0;
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
{
dblSumTermFreq = 0;
dblNormValue=0;
for (intCurrentRank = 1; intCurrentRank<=intRank; intCurrentRank++)
dblSumTermFreq=dblSumTermFreq+(SDD_A(intCurrentRank,intCurrentDoc) * SDD_A(intCurrentRank,intCurrentDoc));
dblNormValue= sqrt(dblSumTermFreq);
for (intCurrentRank = 1; intCurrentRank<=intRank; intCurrentRank++)
if (SDD_A(intCurrentRank,intCurrentDoc) != 0)
SDD_A(intCurrentRank,intCurrentDoc)=
(SDD_A(intCurrentRank,intCurrentDoc) / dblNormValue);
}
//project query vector in the same space as term-doc matrix
SDD_newQ = SDD_X_t * SDD_Q;
//transpose SDD_newQ vector
for (intCurrentRank=1;intCurrentRank<=intRank;intCurrentRank++)
SDD_newQ_t(1,intCurrentRank) = SDD_newQ(intCurrentRank,1);
//calculate similarity for each document
SDD_Cosine = SDD_newQ_t * SDD_A;
//Now output the cosine for each document
for (intCurrentDoc = 1; intCurrentDoc<=intNoDocs; intCurrentDoc++)
{
cosine = SDD_Cosine(1,intCurrentDoc);
//insert cosine into result list in rank order
if (cosine > dblThreshold)
{
intMatch = -1;
for (i=0; i<intRankSize; i++)
{
if (cosine >= dblResults[i])
{ intMatch=i;
break;
}
}
if (intMatch >= 0)
{
//shift elements to the right and insert the new value
for (j=intRankSize-1; j>=intMatch;j--)
{
dblResults[j+1]=dblResults[j];
intResultDoc[j+1] = intResultDoc[j];
} //insert new value
dblResults[intMatch]=cosine;
intResultDoc[intMatch] = intCurrentDoc;
}
}
}
//The following code is used to calculate various precision measures.
int intRelID = 0;
int intHitCount= 0; //counter for the number of correct docs retrieved
double dblRecall = 0.0;
double* dblPrecisionPercent = new double[11]; //for 11 point int-ave-precision
double* dbl25_50_75Percent = new double[3]; //for 25/50/75 average precision
//initialise arrays and local variables
for (i = 0; i<11; i++)
dblPrecisionPercent[i] = 0;
for (i = 0; i<3; i++)
dbl25_50_75Percent[i] = 0;
double dblPrecision = 0.0;
double dblAvePrecision = 0.0;
double dblSumPrecision = 0.0;
double dblRPrecision = 0.0;
//Output details for each document in descending
//order of similarity.
for (i = 0; i<intRankSize ; i++)
{
cout << "Rank : " << i+1 << " Document: " << intResultDoc[i] << " Cosine: " << dblResults[i];
intRelID = relList.find(intResultDoc[i]);
if (intRelID > 0)
{
intHitCount++;
}
if (intHitCount> 0 && intHitCount <= intRelDocTotal)
{
dblRecall = (double)intHitCount/(double)intRelDocTotal;
dblPrecision = (double)intHitCount/((double)i+1);
if (intRelID > 0)
{
dblSumPrecision += dblPrecision;
}
}
cout << " P= " << dblPrecision;
cout << " R= " << dblRecall ;
cout << " Hit total: " << intHitCount << endl;
if (dblRecall <= .25)
dbl25_50_75Percent[0] = dblPrecision;
else if (dblRecall <= .50)
dbl25_50_75Percent[1] = dblPrecision;
else if (dblRecall <= .75)
dbl25_50_75Percent[2] = dblPrecision;
//100% recall used for R-Precision
if (intHitCount == intRelDocTotal)
if ( dblPrecision > dblPrecisionPercent[10] )
dblPrecisionPercent[10] = dblPrecision;
//set 11 point interpolated average components
if (dblRecall <= .10)
{ if ( dblPrecision > dblPrecisionPercent[0] )
dblPrecisionPercent[0] = dblPrecision; }
else if (dblRecall > .10 && dblRecall <= .20)
{ if ( dblPrecision > dblPrecisionPercent[1] )
dblPrecisionPercent[1] = dblPrecision;}
else if (dblRecall > .20 && dblRecall <= .30)
{ if ( dblPrecision > dblPrecisionPercent[2] )
dblPrecisionPercent[2] = dblPrecision;}
else if (dblRecall > .30 && dblRecall <= .40)
{ if ( dblPrecision > dblPrecisionPercent[3] )
dblPrecisionPercent[3] = dblPrecision;}
else if (dblRecall > .40 && dblRecall <= .50)
{ if ( dblPrecision > dblPrecisionPercent[4] )
dblPrecisionPercent[4] = dblPrecision;}
else if (dblRecall > .50 && dblRecall <= .60)
{ if ( dblPrecision > dblPrecisionPercent[5] )
dblPrecisionPercent[5] = dblPrecision;}
else if (dblRecall > .60 && dblRecall <= .70)
{ if ( dblPrecision > dblPrecisionPercent[6] )
dblPrecisionPercent[6] = dblPrecision;}
else if (dblRecall > .70 && dblRecall <= .80)
{ if ( dblPrecision > dblPrecisionPercent[7] )
dblPrecisionPercent[7] = dblPrecision;}
else if (dblRecall > .80 && dblRecall <= .90)
{ if ( dblPrecision > dblPrecisionPercent[8] )
dblPrecisionPercent[8] = dblPrecision;}
else if(dblRecall > .90 && dblRecall <= 1)
{ if ( dblPrecision > dblPrecisionPercent[9] )
dblPrecisionPercent[9] = dblPrecision;}
}
//calculate interpolated average precision and output results
//for this query.
for (i = 0; i< 11; i++)
dblAvePrecision = dblAvePrecision + dblPrecisionPercent[i];
dblAvePrecision = dblAvePrecision /11.0;
cout << "Interpolated Average Precision for Query " << intCurrentQuery << " = " << dblAvePrecision << endl;
cout << "Non-InterpolatedPrecision for for Query " << intCurrentQuery << " = " << dblSumPrecision / (double) intRelDocTotal << endl;
cout << "R_Precision = " << dblPrecisionPercent[10] << endl;
cout << "Average based on mean of 25,50 and 75 percent recall: " << ((dbl25_50_75Percent[0] + dbl25_50_75Percent[1] + dbl25_50_75Percent[2])/3.0) << endl;
cout << "Precision at Recall levels 0 to 10" << endl;
cout << "******************************************" << endl;
for (i = 0; i< 11; i++)
cout << dblPrecisionPercent[i] << endl;
cout << "******************************************" << endl;
return 0;
}
// Computes the cosine between the query vector and one document column of a
// plain (unprojected) vector-space-model term-document matrix.
// Parameters:
//   termDocMatrix - the term-document matrix; read at (row, intCurrentDoc)
//   queryVector   - the query vector; read at (row, 1)
//   intCurrentDoc - the document (column) being compared with the query
//   intNoTerms    - the number of rows in the term-document matrix; rows
//                   1..intNoTerms are included in the sums
// Returns the cosine between the query vector and the chosen document, or 0
// when the product of the two vector norms is not greater than zero.
double calculateCosine(const Matrix<double>& termDocMatrix, const Matrix<double>& queryVector, const int intCurrentDoc, const int intNoTerms)
{
    double docSquares = 0;    // sum of squared document weights
    double querySquares = 0;  // sum of squared query weights
    double dotProduct = 0;    // sum of document weight * query weight

    for (int row = 1; row <= intNoTerms; row++)
    {
        const double docWeight = termDocMatrix(row, intCurrentDoc);
        const double queryWeight = queryVector(row, 1);

        docSquares = docSquares + (docWeight * docWeight);
        querySquares = querySquares + (queryWeight * queryWeight);
        dotProduct = dotProduct + (docWeight * queryWeight);
    }

    const double normProduct = sqrt(docSquares) * sqrt(querySquares);

    // Guard against division by zero when either vector has no weight.
    if (!(normProduct > 0))
        return 0;

    return dotProduct / normProduct;
}
// Utility routine for converting a C string into uppercase characters, in place.
// At most the first 100 characters are converted (the limit this routine has
// always used). Conversion stops at the terminating '\0', so bytes lying past
// the end of a shorter string are never read or modified.
// Parameters:
//   inTerm - NUL-terminated string to convert; a null pointer is returned as is
// Returns inTerm, so the call can be used directly inside an expression.
char* convertCase(char *inTerm)
{
    const int maxChars = 100;

    if (!inTerm)
        return inTerm;

    for (int i = 0; i < maxChars && inTerm[i] != '\0'; i++)
    {
        // toupper is only defined for values representable as unsigned char
        // (or EOF), so go through unsigned char before widening to int.
        inTerm[i] = static_cast<char>(toupper(static_cast<unsigned char>(inTerm[i])));
    }
    return inTerm;
}
// Utility routine for sorting an array into ascending order (insertion sort).
// Parameters are an array of doubles and the number of elements in the array.
// The array is sorted in place; when n is less than 2 nothing is changed.
void sort (double* a,int n)
{
    for (int i = 1; i < n; i++)
    {
        const double temp = a[i];

        // j is declared outside the inner loop because it is still needed
        // afterwards: it ends up as the slot where temp belongs.
        int j = i;

        // Shift every larger element of the sorted prefix one place right.
        for (; j > 0 && a[j-1] > temp; j--)
            a[j] = a[j-1];

        a[j] = temp;
    }
}
// Prints the current local time and date to standard output in the form
//   Run time: hour:minute:second
//   Run date: month/day/year
// The numeric fields are printed as plain integers (no zero padding).
// If the current time cannot be converted to local time, a single
// "Run time: unavailable" line is printed instead.
void printDateTime()
{
    // Use the following lines in any *function* where you access date or time.
    // They create objects to hold the time as formatted in Unix and to provide
    // access to the time field-by-field.
    // This routine has been adapted from Eugene Wallingford system_time_and_date.C
    // code available at http://www.cs.uni.edu/~wallingf/teaching/052/code/
    tm * local_time;                        // pointer to a tm object
    time_t current_time;                    // time_t object to hold current time

    time( &current_time );                  // get the current date
    local_time = localtime( &current_time ); // break it up into a tm object

    // localtime returns a null pointer when the conversion fails; do not
    // dereference it in that case.
    if (local_time == 0)
    {
        cout << "Run time: unavailable" << endl;
        return;
    }

    // Here's how you access month, day, and year:
    int day = local_time->tm_mday;
    int month = local_time->tm_mon + 1;     // month in tm is in range [0..11]
    int year = local_time->tm_year + 1900;  // year in tm is number since 1900

    // Here's how you access hours, minutes, and seconds:
    int hour = local_time->tm_hour;
    int minute = local_time->tm_min;
    int second = local_time->tm_sec;

    //--------------------------------------------------------------------------
    cout << "Run time: "
         << hour << ':' << minute << ':' << second << endl
         << "Run date: "
         << month << '/' << day << '/' << year << endl;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -