// File: Preparation.cpp
//
// Purpose: Generates term-document matrix files for a Latent Semantic Indexing
// engine based on the Semi-Discrete Decomposition (SDD).
// Also produces a term index and term counts for query processing.
// Inputs: 1. MEDTEST.DAT (file produced by the lex processing in jdgetterms.exe)
// 2. ENGLISH.STOP (stop-word file from the SMART system)
// 3. UNIQUETERMSTOP.TXT (list of terms appearing only once in the document collection)
//
// Outputs: 1. STEMLIST.DAT (temporary file for term processing)
// 2. TERMDOC.MTX (the term-doc matrix in 'MatrixMarket matrix coordinate real
// general format')
// 3. TERMCOUNT.DAT (number of documents in which each term appears)
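//
// For reference, a sketch of the MatrixMarket coordinate layout that
// TERMDOC.MTX is expected to follow (assuming the Matrix class's
// saveSparseMatrix() writes the standard format):
//   %%MatrixMarket matrix coordinate real general
//   <rows> <cols> <non-zero entries>
//   <termID> <docID> <weight>      (one line per non-zero entry)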
//
// Author: Jason Dowling August 2002
//
// For more details please see my BComp(Hons) thesis at www.pcug.org.au/~jdowling
#include <iostream.h>
#include <fstream.h>
#include <cstdlib>
#include <ctype.h>
#include <iomanip.h>
//#include <string.h> // String handling routines
#include "tree.h" // Search tree class
#include "string1.h" // String class
#include "matrix.h" // Matrix class
#include "list.h" // Linked List class
#include "porter.h" // Porter Stemmer
#include "math.h" // required for
//extern "C" int stem(char *word);
char* convertCase(char *term); // convert string to uppercase
int main()
{
//Read in data files required to create the term-document matrix
//1. Read in terms extracted from document collection using Lex
// this file should have been created in the lexical analysis program
ifstream inDataFile("medTest.dat",ios::in);
if (!inDataFile){
cerr << "File could not be opened" << endl;
exit(1);
}
// 2. Read in the SMART list of stop words. These words will not
// be used in the term document matrix
ifstream inStopFile("english.stop",ios::in);
if (!inStopFile){
cerr << "english.stop could not be opened" << endl;
exit(1);
}
//3. Read in the list of terms which have been identified as
//unique to this collection. We don't want to use these in
//the term document matrix
ifstream inStopUniqueFile("uniqueTermStop.txt",ios::in);
if (!inStopUniqueFile){
cerr << "uniqueTermStop.txt could not be opened" << endl;
exit(1);
}
//Temporary output file used to store stemmed and
//uppercase terms during operation
ofstream outStemFile("stemList.dat",ios::out);
if (!outStemFile){
cerr << "StemList.dat File could not be opened" << endl;
exit(1);
}
String strTerm, strTest;
char term[100];
char *newTerm;
char stopTerm[100];
char uniqueTerm[100]; //used when reading the unique-term stop list
char term2[100];
int lineNo = 0;
int docID = 0;
int intNoDocs = 0;
Tree< String > intTree; //Search tree for term processing
int indexNo = 0 ;
List< String > strList; //linked list for the term index (term -> termID lookup)
List< String > stopList; //linked list for SMART stop words
List< String > stopUniqueList; //linked list for unique words
IRT_Porter *p; //Pointer to Porter Stemming code
p = new IRT_Porter; //instantiate Porter Stemmer
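//Note: the Porter algorithm strips common English suffixes so that related
//word forms share a single index term - e.g. "connections", "connected" and
//"connecting" all reduce to "connect". Stemming is optional here; see the
//commented-out call in the main term-reading loop below.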
//read in the stop list and add to a list
while (inStopFile >> stopTerm)
{
stopList.insertAtFront(stopTerm);
}
inStopFile.close();
//Read in the unique term stop list and add to a linked list.
//This list is used to remove terms which only occur once in
//the entire collection. These make no difference to the
//precision results, however they slow down query processing
//and SDD processing.
int intTerm = 0;
newTerm = "";
while (inStopUniqueFile >> uniqueTerm)
{
stopUniqueList.insertAtFront(uniqueTerm);
}
inStopUniqueFile.close();
newTerm = "";
while (inDataFile >> term >> lineNo >> docID)
{
newTerm = convertCase(term);
if(stopList.findTerm(newTerm) ==0 )
{
//p->Porter(newTerm); // uncomment to use stemming
outStemFile << convertCase(newTerm) << " " << lineNo << " " << docID << endl;
}
intNoDocs = docID;
}
inDataFile.close();
outStemFile.close();
//read in the list of uppercase terms (which may be stemmed)
//and add them to a binary search tree. This enables us to
//sort the unique terms in this collection, and get a count of
//the number of terms we will be using.
ifstream inStemFile("stemList.dat",ios::in);
if (!inStemFile){
cerr << "File could not be opened" << endl;
exit(1);
}
while (inStemFile >> term >> lineNo >> docID)
{
//add the stemmed term to the list, but check
//if it has been identified as a unique term which
//requires removal. Remove the if test below if you
//need to output all terms.
if (strlen(term) > 0)
{
if (stopUniqueList.findTerm(term) ==0 )
intTree.insertNode( term );
}
}
inStemFile.close();
//the traversal increments a counter for the number
//of terms in this search tree.
intTree.inOrderTraversal();
int intNoTerms = intTree.getNodeCount();
//read in the index of terms and their termID and add to
//list.
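//(termIndex.dat is expected to contain one "<term> <termID>" pair per line,
//with IDs assigned in sorted term order; it is assumed to be written by the
//in-order traversal above, or by a companion utility.)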
ifstream inTermFile("termIndex.dat",ios::in);
if (!inTermFile){
cerr << "termIndex.dat could not be opened" << endl;
exit(1);
}
cout << "Reading termIndex.dat..." << endl;
while (inTermFile >> term2 >> indexNo)
{
strList.insertAtBack( term2 );
}
inTermFile.close();
cout << "Number of documents: " << intNoDocs << endl;
//set up matrices for updating:
//the termMatrix holds raw term frequencies only;
//the weightedMatrix holds the log-weighted, normalised values saved to TermDoc.mtx
cout << "Creating matrix: " << intNoTerms << " by " << intNoDocs << endl;
Matrix<double> termMatrix(intNoTerms,intNoDocs);
Matrix<double> weightedMatrix(intNoTerms,intNoDocs);
//now we need to read in the stemmed list again
//and set up the term/document matrix.
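//Each occurrence increments the raw frequency cell - e.g. if the term with
//termID 42 appears three times in document 7, termMatrix(42,7) ends up as 3.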
inStemFile.open("stemList.dat");
int termID = 0;
while (inStemFile >> term >> lineNo >> docID)
{
if (strlen(term) > 0)
{
termID = strList.findTerm(term);
if (termID > 0 && docID > 0)
termMatrix(termID, docID) +=1;
}
}
inStemFile.close();
int intCurrentDoc = 0; //counter for each document
int intCurrentTerm =0; //counter for each term
int* intTermTotal = new int[intNoTerms+1]; //holds the number of documents containing each term
//Calculate the number of documents in which each term appears -
//this information is needed for global weighting (in query vector also)
//first initialise array
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms; intCurrentTerm++)
intTermTotal[intCurrentTerm]=0;
// then loop through the term-doc matrix and count the documents in which each term occurs
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms; intCurrentTerm++)
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
{ if (termMatrix(intCurrentTerm,intCurrentDoc)!=0)
intTermTotal[intCurrentTerm]++;
}
//save the term totals.
ofstream outTermCountFile("termCount.dat",ios::out);
if (!outTermCountFile)
{
cerr << "termCount.dat file could not be opened" << endl;
exit(1);
}
for (intCurrentTerm=1;intCurrentTerm<=intNoTerms;intCurrentTerm++)
outTermCountFile << intCurrentTerm << " " << intTermTotal[intCurrentTerm] << endl;
outTermCountFile.close();
// local weighting (log) for lxm.lfx
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
weightedMatrix(intCurrentTerm,intCurrentDoc)= log10(1 + termMatrix(intCurrentTerm,intCurrentDoc));
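//i.e. w(t,d) = log10(1 + tf(t,d)); for example a raw frequency of 3
//becomes log10(4) ~= 0.602, and a zero frequency stays zero.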
//global weighting routine - Inverse Document Frequency
//for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
// if (weightedMatrix(intCurrentTerm,1)!=0)
// {
// weightedMatrix(intCurrentTerm,1)=
// (weightedMatrix(intCurrentTerm,1)*((log10((float)intNoDocs/intTermTotal[intCurrentTerm]))));
// }
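//(If enabled, this would scale each term by the inverse document frequency
//g(t) = log10(N / df(t)), where N is the number of documents and df(t) is the
//count saved in termCount.dat. As written it touches only column 1, so it
//would also need an inner loop over documents to weight the whole matrix.)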
//normalisation routine
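//Each document column is scaled to unit Euclidean length:
//w'(t,d) = w(t,d) / sqrt( sum over terms k of w(k,d)^2 )
//e.g. a column holding weights (3, 4) has length 5 and becomes (0.6, 0.8).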
double dblSumTermFreq=0;
double dblNormValue=0;
intCurrentDoc = 0;
intCurrentTerm = 0;
for (intCurrentDoc = 1; intCurrentDoc <=intNoDocs; intCurrentDoc++)
{ //sum the squared weights in this document's column (squared Euclidean length)
dblSumTermFreq = 0;
dblNormValue=0;
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
dblSumTermFreq=dblSumTermFreq+(weightedMatrix(intCurrentTerm,intCurrentDoc)*weightedMatrix(intCurrentTerm,intCurrentDoc));
//dblNormValue= sqrt(dblSumTermFreq);
if (dblSumTermFreq > 0) //guard against a document column with no terms
dblNormValue= ((sqrt(dblSumTermFreq))/dblSumTermFreq); //equivalent to 1/sqrt(dblSumTermFreq)
for (intCurrentTerm = 1; intCurrentTerm<=intNoTerms; intCurrentTerm++)
if (weightedMatrix(intCurrentTerm,intCurrentDoc)!=0)
weightedMatrix(intCurrentTerm,intCurrentDoc)=weightedMatrix(intCurrentTerm,intCurrentDoc) * dblNormValue;
}
//save the matrices
//termMatrix.saveSparseMatrix("RawTermDoc.mtx"); //no longer used.
weightedMatrix.saveSparseMatrix("TermDoc.mtx");
cout << "Term Document Matrix preparation has completed successfully" << endl;
return 0;
}
//utility routine for converting a string to uppercase
char* convertCase(char *inTerm)
{
for(int i = 0; inTerm[i] != '\0'; i++ )
inTerm[i] = toupper(inTerm[i]);
return inTerm;
}