// File: Preparation.cpp
//
// Purpose: Generates term-document matrix files for a Latent Semantic Indexing
// engine based on the Semi-Discrete Decomposition (SDD).
// Also produces a term index and term counts for query processing.
// Inputs: 1. MEDTEST.DAT (file produced by the lex processing in jdgetterms.exe)
// 2. ENGLISH.STOP (stop-word file from the SMART system)
// 3. UNIQUETERMSTOP.TXT (list of terms appearing only once in the document collection)
//
// Outputs: 1. STEMLIST.DAT (temporary file for term processing)
// 2. TERMDOC.MTX (the term-doc matrix in 'MatrixMarket matrix coordinate real
// general format')
// 3. TERMCOUNT.DAT (number of documents in which each term appears)
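//
// For reference, a sketch of the MatrixMarket coordinate layout that
// TERMDOC.MTX is expected to follow (assuming the Matrix class's
// saveSparseMatrix() writes the standard format):
//   %%MatrixMarket matrix coordinate real general
//   <rows> <cols> <non-zero entries>
//   <termID> <docID> <weight>      (one line per non-zero entry)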
//
// Author: Jason Dowling August 2002
//
// For more details please see my BComp(Hons) thesis at www.pcug.org.au/~jdowling
#include <iostream.h>
#include <fstream.h>
#include <cstdlib>
#include <ctype.h>
#include <iomanip.h>
//#include <string.h> // String handling routines
#include "tree.h" // Search tree class
#include "string1.h" // String class
#include "matrix.h" // Matrix class
#include "list.h" // Linked List class
#include "porter.h" // Porter Stemmer
#include "math.h" // required for
//extern "C" int stem(char *word);
char* convertCase(char *term); // convert string to uppercase
int main()
{
//Read in data files required to create the term-document matrix
//1. Read in terms extracted from document collection using Lex
// this file should have been created in the lexical analysis program
ifstream inDataFile("medTest.dat",ios::in);
if (!inDataFile){
cerr << "File could not be opened" << endl;
exit(1);
}
// 2. Read in the SMART list of stop words. These words will not
// be used in the term document matrix
ifstream inStopFile("english.stop",ios::in);
if (!inStopFile){
cerr << "english.stop could not be opened" << endl;
exit(1);
}
//3. Read in the list of terms which have been identified as
//unique to this collection. We don't want to use these in
//the term document matrix
ifstream inStopUniqueFile("uniqueTermStop.txt",ios::in);
if (!inStopUniqueFile){
cerr << "uniqueTermStop.txt could not be opened" << endl;
exit(1);
}
//Temporary output file used to store stemmed and
//uppercase terms during operation
ofstream outStemFile("stemList.dat",ios::out);
if (!outStemFile){
cerr << "StemList.dat File could not be opened" << endl;
exit(1);
}
String strTerm, strTest;
char term[100];
char *newTerm;
char stopTerm[100];
char uniqueTerm[100]; //used when reading the unique-term stop list
char term2[100];
int lineNo = 0;
int docID = 0;
int intNoDocs = 0;
Tree< String > intTree; //Search tree for term processing
int indexNo = 0 ;
List< String > strList; //linked list for the term index (term -> termID lookup)
List< String > stopList; //linked list for SMART stop words
List< String > stopUniqueList; //linked list for unique words
IRT_Porter *p; //Pointer to Porter Stemming code
p = new IRT_Porter; //instantiate Porter Stemmer
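//Note: the Porter algorithm strips common English suffixes so that related
//word forms share a single index term - e.g. "connections", "connected" and
//"connecting" all reduce to "connect". Stemming is optional here; see the
//commented-out call in the main term-reading loop below.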
//read in the stop list and add to a list
while (inStopFile >> stopTerm)
{
stopList.insertAtFront(stopTerm);
}
inStopFile.close();
//Read in the unique term stop list and add to a linked list.
//This list is used to remove terms which only occur once in
//the entire collection. These make no difference to the
//precision results, however they slow down query processing
//and SDD processing.
int intTerm = 0;
newTerm = "";
while (inStopUniqueFile >> uniqueTerm)
{
stopUniqueList.insertAtFront(uniqueTerm);
}
inStopUniqueFile.close();
newTerm = "";
while (inDataFile >> term >> lineNo >> docID)
{
newTerm = convertCase(term);
if(stopList.findTerm(newTerm) ==0 )
{
//p->Porter(newTerm); // uncomment to use stemming
outStemFile << convertCase(newTerm) << " " << lineNo << " " << docID << endl;
}
intNoDocs = docID;
}
inDataFile.close();
outStemFile.close();
//read in the list of uppercase terms (which may be stemmed)
//and add them to a binary search tree. This enables us to
//sort the unique terms in this collection, and get a count of
//the number of terms we will be using.
ifstream inStemFile("stemList.dat",ios::in);
if (!inStemFile){
cerr << "File could not be opened" << endl;
exit(1);
}
while (inStemFile >> term >> lineNo >> docID)
{
//add the stemmed term to the list, but check
//if it has been identified as a unique term which
//requires removal. Remove the if test below if you
//need to output all terms.
if (strlen(term) > 0)
{
if (stopUniqueList.findTerm(term) ==0 )
intTree.insertNode( term );
}
}
inStemFile.close();
//the traversal increments a counter for the number
//of terms in this search tree.
intTree.inOrderTraversal();
int intNoTerms = intTree.getNodeCount();
//read in the index of terms and their termID and add to
//list.
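//(termIndex.dat is expected to contain one "<term> <termID>" pair per line,
//with IDs assigned in sorted term order; it is assumed to be written by the
//in-order traversal above, or by a companion utility.)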
ifstream inTermFile("termIndex.dat",ios::in);
if (!inTermFile){
cerr << "termIndex.dat could not be opened" << endl;
exit(1);
}
cout << "Reading termIndex.dat..." << endl;
while (inTermFile >> term2 >> indexNo)
{
strList.insertAtBack( term2 );
}
inTermFile.close();
cout << "Number of documents: " << intNoDocs << endl;
//set up matrices for updating:
//the termMatrix holds raw term frequencies only;
//the weightedMatrix holds the log-weighted, normalised values saved to TermDoc.mtx
cout << "Creating matrix: " << intNoTerms << " by " << intNoDocs << endl;
Matrix<double> termMatrix(intNoTerms,intNoDocs);
Matrix<double> weightedMatrix(intNoTerms,intNoDocs);
//now we need to read in the stemmed list again
//and set up the term/document matrix.
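//Each occurrence increments the raw frequency cell - e.g. if the term with
//termID 42 appears three times in document 7, termMatrix(42,7) ends up as 3.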
inStemFile.open("stemList.dat");
int termID = 0;
while (inStemFile >> term >> lineNo >> docID)
{
if (strlen(term) > 0)
{
termID = strList.findTerm(term);
if (termID > 0 && docID > 0)
termMatrix(termID, docID) +=1;
}
}
inStemFile.close();
int intCurrentDoc = 0; //counter for each document
int intCurrentTerm =0; //counter for each term
int* intTermTotal = new int[intNoTerms+1]; //holds the number of documents containing each term
//Calculate the number of documents in which each term appears -
//this information is needed for global weighting (in query vector also)
//first initialise array
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms; intCurrentTerm++)
intTermTotal[intCurrentTerm]=0;
// then loop through the term-doc matrix and count the documents in which each term occurs
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms; intCurrentTerm++)
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
{ if (termMatrix(intCurrentTerm,intCurrentDoc)!=0)
intTermTotal[intCurrentTerm]++;
}
//save the term totals.
ofstream outTermCountFile("termCount.dat",ios::out);
if (!outTermCountFile)
{
cerr << "termCount.dat file could not be opened" << endl;
exit(1);
}
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms;intCurrentTerm++)
outTermCountFile << intCurrentTerm << " " << intTermTotal[intCurrentTerm] << endl;
outTermCountFile.close();
// local weighting (log) for lxm.lfx
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
weightedMatrix(intCurrentTerm,intCurrentDoc)= log10(1 + termMatrix(intCurrentTerm,intCurrentDoc));
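//i.e. w(t,d) = log10(1 + tf(t,d)); for example a raw frequency of 3
//becomes log10(4) ~= 0.602, and a zero frequency stays zero.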
//global weighting routine - Inverse Document Frequency
//for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
// if (weightedMatrix(intCurrentTerm,1)!=0)
// {
// weightedMatrix(intCurrentTerm,1)=
// (weightedMatrix(intCurrentTerm,1)*((log10((float)intNoDocs/intTermTotal[intCurrentTerm]))));
// }
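//(If enabled, this would scale each term by the inverse document frequency
//g(t) = log10(N / df(t)), where N is the number of documents and df(t) is the
//count saved in termCount.dat. As written it touches only column 1, so it
//would also need an inner loop over documents to weight the whole matrix.)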
//normalisation routine
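//Each document column is scaled to unit Euclidean length:
//w'(t,d) = w(t,d) / sqrt( sum over terms k of w(k,d)^2 )
//e.g. a column holding weights (3, 4) has length 5 and becomes (0.6, 0.8).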
double dblSumTermFreq=0;
double dblNormValue=0;
intCurrentDoc = 0;
intCurrentTerm = 0;
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
{ //sum the squared weights in this document's column (squared Euclidean length)
dblSumTermFreq = 0;
dblNormValue=0;
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
dblSumTermFreq=dblSumTermFreq+(weightedMatrix(intCurrentTerm,intCurrentDoc)*weightedMatrix(intCurrentTerm,intCurrentDoc));
//dblNormValue= sqrt(dblSumTermFreq);
if (dblSumTermFreq > 0) //guard against a document column with no terms
dblNormValue= ((sqrt(dblSumTermFreq))/dblSumTermFreq); //equivalent to 1/sqrt(dblSumTermFreq)
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
if (weightedMatrix(intCurrentTerm,intCurrentDoc)!=0)
weightedMatrix(intCurrentTerm,intCurrentDoc)=weightedMatrix(intCurrentTerm,intCurrentDoc) * dblNormValue;
}
//save the matrices
//termMatrix.saveSparseMatrix("RawTermDoc.mtx"); //no longer used.
weightedMatrix.saveSparseMatrix("TermDoc.mtx");
cout << "Term Document Matrix preparation has completed successfully" << endl;
return 0;
}
//utility routine for converting a string to uppercase
char* convertCase(char *inTerm)
{
for(int i = 0; inTerm[i] != '\0'; i++ )
inTerm[i] = toupper(inTerm[i]);
return inTerm;
}