📄 invert.cpp

📁 用于信息检索方面
💻 CPP
字号:
/***
   Implemented by Miaomiao Shen
   ms894@cornell.edu
   2027282
   */
#include <iostream>
#include <string>
#include <fstream>
#include <map>
#include <algorithm>
#include <math.h>
using namespace std;


struct Term
{
	string name;
	string DocId;
	int Loc;
};

typedef struct TreeNode
{
Term wordTerm;
struct TreeNode *left;
struct TreeNode *right;

}treeNode;

class BiSortTree
{
public:
    BiSortTree();
	void desplayTree(void);//to show the tree
	void writeTree(void);  //write the whole tree in file "wordlist.txt"
	void insertTree(Term key);//insert a value to the tree
	
    treeNode* searchTree(string key,int show,int docu);//search a value in the tree
    //int treeMaxFr(void);// calculate the max frequency

	~BiSortTree();

private:
	
	treeNode*  buildTree(treeNode* head,Term number);//set up the tree
	treeNode* search(treeNode* head ,string key,int show,int docu);//search the tree
    void showTree(treeNode* head);//show
	void write(treeNode* head);//write tree info into a file
    void destroyTree(treeNode* head);//delete
    //int MaxFr(treeNode* head);// calculate the max frequency
    treeNode *Head;
    
};

BiSortTree tree;
BiSortTree treeArray[21];
string DocToInt[21]={"NULL","file01.txt","file02.txt","file03.txt","file04.txt","file05.txt","file06.txt","file07.txt","file08.txt","file09.txt","file10.txt","file11.txt"
,"file12.txt","file13.txt","file14.txt","file15.txt","file16.txt","file17.txt","file18.txt","file19.txt","file20.txt"};
map<string,int> DocuMax;
int firstPos;
double fre;
double OutputFre[21];
double ni=0; //the number of documents that contains the input term
double idf=0;
double tf=0;
double OutputTF[21];
string Info1;
string Info2;
string *OutputInfo1;
string *OutputInfo2;
int posting=0;
int N=20;
double tf_idf=0;

/**************to construct a B tree****************/
BiSortTree::BiSortTree()
{
	//to build an empty tree at first
   Head=NULL;
   Term number;
}

treeNode* BiSortTree::buildTree(treeNode* head,Term number)
{
	treeNode *p;    
	p=new treeNode;
    p->wordTerm=number;
	p->left =p->right=NULL;

	if(head==NULL)
	{     
      return p;
	}
	else
	{
  
       if(_stricmp(p->wordTerm.name.c_str(),head->wordTerm.name.c_str())<0)
		   head->left=buildTree(head->left,number);
	   else
		   //to do more to dealwith duplicate nodes
		   head->right=buildTree(head->right,number);
  
       return head;
	}
}
/*****************to insert a value to the tree***********************************/
void BiSortTree::insertTree(Term key)
{
	Head=buildTree(Head,key);
}
/*****************to seach a tree*************************/
treeNode*  BiSortTree::searchTree(string key,int show,int docu)
{
	return search(Head,key,show,docu);
}

treeNode* BiSortTree::search(treeNode* head ,string key,int show,int docu)
{  
	Info1="";
	Info2="";
	if(head==NULL)
		return NULL;
	if(_stricmp(head->wordTerm.name.c_str(),key.c_str())==0)
	{
		if(fre==0)
		{
			firstPos=head->wordTerm.Loc;
		}
		fre++;
		posting++;
		//cout<<posting<<endl;
		if(show==1)
		{
			//cout<<"loc"<<fre<<" in "<<head->wordTerm.DocId<<" :"<<head->wordTerm.Loc<<", ";
			Info1.append("DocId: ");
			Info1.append(head->wordTerm.DocId);
			Info1.append(", Loc: ");
            char ch[10];
		    itoa(head->wordTerm.Loc,ch,10);
			Info1.append(ch);
			Info1.append("\r\n");
			OutputInfo1[posting]=Info1;

      FILE *fid=fopen(DocToInt[docu].c_str(),"r");
	  char buff[20000];
	  while(!feof(fid))
	  {
		fgets(buff,20000,fid);
	  }	
	  fclose(fid);
	  string strBuff(buff);// strBuff is the origin string from text
	  
	  int pos1=head->wordTerm.Loc-40;
	  if(pos1<0) pos1=0;
	  string preStr=strBuff.substr(0,pos1);
	  //cout<<preStr<<endl;
	  pos1=preStr.find_last_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.1234567890'");
      //cout<<pos1<<endl;

	  int pos2=head->wordTerm.Loc+40;
	  if(pos2>=strBuff.length()-1) pos2=strBuff.length()-1;
	  string nexStr=strBuff.substr(pos2);
	  //cout<<nexStr<<endl;
	  pos2=nexStr.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.1234567890'");
	  pos2=head->wordTerm.Loc+40+pos2;
      //cout<<pos2<<endl;

	  //cout<<"some relational text: "<<strBuff.substr(pos1+1,pos2-pos1-1)<<endl;
	  //cout<<endl;
	  Info2=strBuff.substr(pos1+1,pos2-pos1-1);	  
	  OutputInfo2[posting]=Info2;

			//cout<<Info1;			
		}
	     search(head->right,key,show,docu);
         return head;
	}
	else 
	{
		if(_stricmp(key.c_str(),head->wordTerm.name.c_str())<0)
			return search( head->left,key,show,docu);
		else
			return search(head->right,key,show,docu);
	}

}

/*****************to show the tree ********************************************/
void BiSortTree::desplayTree(void)
{
	showTree(Head);
	cout<<endl;
}
void BiSortTree::showTree(treeNode* Head)
{
	
	
	if(Head!=NULL)
	{
		showTree(Head->left ) ;
		cout<<Head->wordTerm.name<<' ';
		showTree(Head->right) ;
	}
	

}
/***********************MaxFr****************************************************/
string comp="";
string docId="";
int termLoc=-1;
int docFre=0;
int termFre=0;
int postLink=0;
void BiSortTree::writeTree(void)
{
	
	write(Head);
}

void BiSortTree::write(treeNode* Head)
{
	if(Head!=NULL)
	{
		write(Head->left );

		string temp=Head->wordTerm.name;
		if(_stricmp(comp.c_str(),temp.c_str())!=0)  //different term so we need provide the last term's document frequency,this term's name and postLink
		{
			docId=Head->wordTerm.DocId;
			comp=temp;
			postLink++;  //have a new link number

		    FILE *fid=fopen("1a.txt","a");
		//fputs(temp.c_str(),fid);
		    if(docFre!=0) // to print the last terms'document frequency 
		   {
		     char bh[10];
		     itoa(docFre,bh,10);
		     fputs(bh,fid);
		     fputs("\r\n",fid);		
		   }

		  docFre=1;//reset the docFre
          

		  fputs(temp.c_str(),fid);  //to print out this term's name and the posting link number.
	      char ah[10];
		  itoa(postLink,ah,10);
		  fputs(", link to the posting: ",fid);
		  fputs(ah,fid);
		  fputs(", Document Frequency: ",fid);		
		  fclose(fid);		

          //to do with the 1b.txt
		  FILE *fd=fopen("1b.txt","a");
		   if(termFre!=0) // to print the last terms'document frequency 
		   {
		     char dh[10];
		     itoa(termFre,dh,10);
			 fputs("term frequency in this document = ",fid);
		     fputs(dh,fid);
		     fputs("\r\n",fid);		
		   }
          termFre=1;//reset the termFre
		  fputs("\r\n",fd);
		  fputs("No: ",fd);
		  fputs(ah,fd); //print out the postlink
		  fputs("\r\n",fd);
		  fputs(temp.c_str(),fd);
		  fputs(" ",fd);
		  fputs(Head->wordTerm.DocId.c_str(),fd);
		  fputs(" Location info: ",fd);
		  char ch[10];
		  itoa(Head->wordTerm.Loc,ch,10);
		  fputs(ch,fd);
		  fputs("\r\n",fd);
		  fclose(fd);
		}
		else // just need to add up the document number if there documentnumber is not the same;
		{
			if((_stricmp(docId.c_str(),Head->wordTerm.DocId.c_str())!=0))
			{
				docFre++;
				docId=Head->wordTerm.DocId;

				FILE *fiid=fopen("1b.txt","a");
				char eh[10];
		        itoa(termFre,eh,10);
				fputs("term frequency in this document = ",fiid);
		        fputs(eh,fiid);
		        fputs("\r\n",fiid);		
				termFre=1;
				fputs(temp.c_str(),fiid);
		        fputs(" ",fiid);
		        fputs(Head->wordTerm.DocId.c_str(),fiid);
		        fputs(" Location info: ",fiid);
		        char ch[10];
		        itoa(Head->wordTerm.Loc,ch,10);
		        fputs(ch,fiid);
		        fputs("\r\n",fiid);
				fclose(fiid);
			}
			else
			{
				termFre++;
				FILE *fiid=fopen("1b.txt","a");
				fputs(temp.c_str(),fiid);
		        fputs(" ",fiid);
		        fputs(Head->wordTerm.DocId.c_str(),fiid);
		        fputs(" Location info: ",fiid);
		        char ch[10];
		        itoa(Head->wordTerm.Loc,ch,10);
		        fputs(ch,fiid);
		        fputs("\r\n",fiid);
				fclose(fiid);
			}
			
		}

		write(Head->right) ;
	}
}

/*****************to delete a tree************************************/
BiSortTree::~BiSortTree()
{
destroyTree(Head);
}
void BiSortTree::destroyTree(treeNode* head )
{

if(head!=NULL)
{
destroyTree(head->left );
destroyTree(head->right );
delete head;

}

}

// above content is to construct a tree

string filter("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.1234567890'");// to seperate the word by tokens
string stopList[45];

//when the program is ruuning, it will run the Setup funtion to read data and construct 1a and 1b files.
void Setup(string s)
{
	FILE *fid=fopen(s.c_str(),"r");
	char buff[200000];
	while(!feof(fid))
	{
		fgets(buff,200000,fid);
	}	
	fclose(fid);
	string strBuff(buff);// strBuff is the origin string from text
	
	string::size_type pos=0;
	string::size_type prev_pos=strBuff.find_first_of(filter);
	string::size_type temp_pos=strBuff.find_first_of(filter);

	bool onoff=false;
	int fg=0;
	string SeperateWord;
	//use a map to count word frequency
	map<string,int> wordCount;

	while((pos=strBuff.find_first_of(filter,pos))!=string::npos)   
    {   
		fg=0;
          if (onoff)   
		  {   
			  prev_pos=temp_pos-1;
			  onoff=false;   
          }   
		  ++pos;   
		  if((pos -temp_pos)!=1)   
		  {   
             onoff   =   true;   
			 //output the seperated word to do more
			 
			 SeperateWord=strBuff.substr(prev_pos,temp_pos-prev_pos);
			 // to check the word if it is in the stoplist
			 for(int j=0;j<=39;j++)
			 {
				 if(_stricmp(SeperateWord.c_str(),stopList[j].c_str())==0)
					 fg=1;
			 }

			 if(fg==0)
			 {
				 //cout<<SeperateWord<<endl;
				 Term newTerm;
				 newTerm.name=SeperateWord;
				 newTerm.DocId=s;
				 newTerm.Loc=prev_pos;
				 //using map
				 string str;
				 str=SeperateWord;
				 transform(str.begin(),str.end(),str.begin(),tolower);
				 ++wordCount[str];

				 tree.insertTree(newTerm);
				 for(int i=1;i<=20;i++)
				 {
					 if(DocToInt[i]==s)
						 treeArray[i].insertTree(newTerm);
				 }


				 //calculate the frequency

			 }

		  }   
		  temp_pos=pos;   
	} //end of while

	fg=0;
	//the last word
	if(prev_pos!=string::npos)   
	{
		//to do more.
		    
		     SeperateWord=strBuff.substr(prev_pos,temp_pos-prev_pos);
			 for(int j=0;j<=39;j++)
			 {
				 if(_stricmp(SeperateWord.c_str(),stopList[j].c_str())==0)
					 fg=1;
			 }

			 if(fg==0)
			 {
				 //cout<<SeperateWord<<endl;
			     Term newTerm;
				 newTerm.name=SeperateWord;
				 newTerm.DocId=s;
				 newTerm.Loc=prev_pos;

				 string str;
				 str=SeperateWord;
				 
				 transform(str.begin(),str.end(),str.begin(),tolower);
				 ++wordCount[str];

				 tree.insertTree(newTerm);
				 for(int i=1;i<=20;i++)
				 {
					  if(DocToInt[i]==s)
						  treeArray[i].insertTree(newTerm);
				 }

			 }
			 
	}
	for(map<string,int>::iterator it=wordCount.begin();it!=wordCount.end();it++)
	{
		if(DocuMax[s]<(*it).second)
			DocuMax[s]=(*it).second;
	}
	//cout<<s<<" "<<DocuMax[s]<<endl;
}

void Initiate()
{
	int posStop=0;

	FILE *fid=fopen("stoplist.txt","r");
	while(!feof(fid))
	{
		char s[20];
		fgets(s,20,fid);
		stopList[posStop].empty();
		stopList[posStop]=string(s);
		stopList[posStop]=stopList[posStop].substr(0,stopList[posStop].length()-1);
		posStop++;
	}
    fclose(fid);

    //Setup(string("test.txt"));
	cout<<"constructing dababase, please wait..."<<endl;
	Setup(string("file01.txt"));
	Setup(string("file02.txt"));
	Setup(string("file03.txt"));
	Setup(string("file04.txt"));
	Setup(string("file05.txt"));
	Setup(string("file06.txt"));
	Setup(string("file07.txt"));
	Setup(string("file08.txt"));
	Setup(string("file09.txt"));
	Setup(string("file10.txt"));
	Setup(string("file11.txt"));
	Setup(string("file12.txt"));
	Setup(string("file13.txt"));
	Setup(string("file14.txt"));
	Setup(string("file15.txt"));
	Setup(string("file16.txt"));
	Setup(string("file17.txt"));
	Setup(string("file18.txt"));
	Setup(string("file19.txt"));
	Setup(string("file20.txt"));

}

//according to the tree
void WordList()
{
	tree.writeTree();
}

void SearchTerm(string s,int docu)
{
	fre=0;
	
	//firstPos=0;
	Info1="";
	treeArray[docu].searchTree(s,1,docu);
	
	OutputTF[docu]=fre/DocuMax[DocToInt[docu]];
	OutputFre[docu]=fre;
	if(fre!=0)
	{
	  ni++;
	}
}


int main()
{
	OutputInfo1=new string[5000];
	OutputInfo2=new string[5000];
	//to construct a tree for these data
	Initiate();

	//to set up the wordlist and the posting file name "1a.txt" and "1b.txt"
	WordList();

	string input;
	cout<<"Please input the term you want to search,use ZZZ to end up."<<endl;
	cin>>input;
	while(strcmp(input.c_str(),"ZZZ")!=0)
	{
		tf=0;
		ni=0;
        idf=0;
		posting=0;
        for(int i=1;i<=20;i++)
		{
			SearchTerm(input,i);
		}
		if(ni==0)
			cout<<"No term is founded,it maybe in StopList or not exist in the database."<<endl;
		else
		{
			double x=N/ni;
			double y=2.0;
			//the idf for all the documents is the same;
			idf=log(x)/log(y)+1;
			cout<<posting<<" postings are found."<<endl;	
			cout<<endl;
			int pos=1;
			for(int k=1;k<=20;k++)
			{
				if(OutputFre[k]>0)
				{
				    for(int m=1;m<=OutputFre[k];m++)
					{
					cout<<"DocuID: "<<k<<endl;
					cout<<"term weightings: "<<OutputFre[k]<<", term tf: "<<OutputTF[k]<<", term idf: "<<idf<<", term tf*idf: "<<OutputTF[k]*idf<<endl;
					cout<<OutputInfo1[pos];
					cout<<"relational test: "<<OutputInfo2[pos]<<endl;
					cout<<endl;
					pos++;
					}
				}
			}
		}
		cout<<"Please input the term you want to search,use ZZZ to end up."<<endl;
		cin>>input;
	}
	return 0;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -