📄 invert.cpp
字号:
/***
Implemented by Miaomiao Shen
ms894@cornell.edu
2027282
*/
#include <iostream>
#include <string>
#include <fstream>
#include <map>
#include <algorithm>
#include <math.h>
using namespace std;
struct Term
{
string name;
string DocId;
int Loc;
};
typedef struct TreeNode
{
Term wordTerm;
struct TreeNode *left;
struct TreeNode *right;
}treeNode;
class BiSortTree
{
public:
BiSortTree();
void desplayTree(void);//to show the tree
void writeTree(void); //write the whole tree in file "wordlist.txt"
void insertTree(Term key);//insert a value to the tree
treeNode* searchTree(string key,int show,int docu);//search a value in the tree
//int treeMaxFr(void);// calculate the max frequency
~BiSortTree();
private:
treeNode* buildTree(treeNode* head,Term number);//set up the tree
treeNode* search(treeNode* head ,string key,int show,int docu);//search the tree
void showTree(treeNode* head);//show
void write(treeNode* head);//write tree info into a file
void destroyTree(treeNode* head);//delete
//int MaxFr(treeNode* head);// calculate the max frequency
treeNode *Head;
};
BiSortTree tree;
BiSortTree treeArray[21];
string DocToInt[21]={"NULL","file01.txt","file02.txt","file03.txt","file04.txt","file05.txt","file06.txt","file07.txt","file08.txt","file09.txt","file10.txt","file11.txt"
,"file12.txt","file13.txt","file14.txt","file15.txt","file16.txt","file17.txt","file18.txt","file19.txt","file20.txt"};
map<string,int> DocuMax;
int firstPos;
double fre;
double OutputFre[21];
double ni=0; //the number of documents that contains the input term
double idf=0;
double tf=0;
double OutputTF[21];
string Info1;
string Info2;
string *OutputInfo1;
string *OutputInfo2;
int posting=0;
int N=20;
double tf_idf=0;
/**************to construct a B tree****************/
BiSortTree::BiSortTree()
{
//to build an empty tree at first
Head=NULL;
Term number;
}
treeNode* BiSortTree::buildTree(treeNode* head,Term number)
{
treeNode *p;
p=new treeNode;
p->wordTerm=number;
p->left =p->right=NULL;
if(head==NULL)
{
return p;
}
else
{
if(_stricmp(p->wordTerm.name.c_str(),head->wordTerm.name.c_str())<0)
head->left=buildTree(head->left,number);
else
//to do more to dealwith duplicate nodes
head->right=buildTree(head->right,number);
return head;
}
}
/*****************to insert a value to the tree***********************************/
void BiSortTree::insertTree(Term key)
{
Head=buildTree(Head,key);
}
/*****************to seach a tree*************************/
treeNode* BiSortTree::searchTree(string key,int show,int docu)
{
return search(Head,key,show,docu);
}
treeNode* BiSortTree::search(treeNode* head ,string key,int show,int docu)
{
Info1="";
Info2="";
if(head==NULL)
return NULL;
if(_stricmp(head->wordTerm.name.c_str(),key.c_str())==0)
{
if(fre==0)
{
firstPos=head->wordTerm.Loc;
}
fre++;
posting++;
//cout<<posting<<endl;
if(show==1)
{
//cout<<"loc"<<fre<<" in "<<head->wordTerm.DocId<<" :"<<head->wordTerm.Loc<<", ";
Info1.append("DocId: ");
Info1.append(head->wordTerm.DocId);
Info1.append(", Loc: ");
char ch[10];
itoa(head->wordTerm.Loc,ch,10);
Info1.append(ch);
Info1.append("\r\n");
OutputInfo1[posting]=Info1;
FILE *fid=fopen(DocToInt[docu].c_str(),"r");
char buff[20000];
while(!feof(fid))
{
fgets(buff,20000,fid);
}
fclose(fid);
string strBuff(buff);// strBuff is the origin string from text
int pos1=head->wordTerm.Loc-40;
if(pos1<0) pos1=0;
string preStr=strBuff.substr(0,pos1);
//cout<<preStr<<endl;
pos1=preStr.find_last_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.1234567890'");
//cout<<pos1<<endl;
int pos2=head->wordTerm.Loc+40;
if(pos2>=strBuff.length()-1) pos2=strBuff.length()-1;
string nexStr=strBuff.substr(pos2);
//cout<<nexStr<<endl;
pos2=nexStr.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.1234567890'");
pos2=head->wordTerm.Loc+40+pos2;
//cout<<pos2<<endl;
//cout<<"some relational text: "<<strBuff.substr(pos1+1,pos2-pos1-1)<<endl;
//cout<<endl;
Info2=strBuff.substr(pos1+1,pos2-pos1-1);
OutputInfo2[posting]=Info2;
//cout<<Info1;
}
search(head->right,key,show,docu);
return head;
}
else
{
if(_stricmp(key.c_str(),head->wordTerm.name.c_str())<0)
return search( head->left,key,show,docu);
else
return search(head->right,key,show,docu);
}
}
/*****************to show the tree ********************************************/
void BiSortTree::desplayTree(void)
{
showTree(Head);
cout<<endl;
}
void BiSortTree::showTree(treeNode* Head)
{
if(Head!=NULL)
{
showTree(Head->left ) ;
cout<<Head->wordTerm.name<<' ';
showTree(Head->right) ;
}
}
/***********************MaxFr****************************************************/
string comp="";
string docId="";
int termLoc=-1;
int docFre=0;
int termFre=0;
int postLink=0;
void BiSortTree::writeTree(void)
{
write(Head);
}
void BiSortTree::write(treeNode* Head)
{
if(Head!=NULL)
{
write(Head->left );
string temp=Head->wordTerm.name;
if(_stricmp(comp.c_str(),temp.c_str())!=0) //different term so we need provide the last term's document frequency,this term's name and postLink
{
docId=Head->wordTerm.DocId;
comp=temp;
postLink++; //have a new link number
FILE *fid=fopen("1a.txt","a");
//fputs(temp.c_str(),fid);
if(docFre!=0) // to print the last terms'document frequency
{
char bh[10];
itoa(docFre,bh,10);
fputs(bh,fid);
fputs("\r\n",fid);
}
docFre=1;//reset the docFre
fputs(temp.c_str(),fid); //to print out this term's name and the posting link number.
char ah[10];
itoa(postLink,ah,10);
fputs(", link to the posting: ",fid);
fputs(ah,fid);
fputs(", Document Frequency: ",fid);
fclose(fid);
//to do with the 1b.txt
FILE *fd=fopen("1b.txt","a");
if(termFre!=0) // to print the last terms'document frequency
{
char dh[10];
itoa(termFre,dh,10);
fputs("term frequency in this document = ",fid);
fputs(dh,fid);
fputs("\r\n",fid);
}
termFre=1;//reset the termFre
fputs("\r\n",fd);
fputs("No: ",fd);
fputs(ah,fd); //print out the postlink
fputs("\r\n",fd);
fputs(temp.c_str(),fd);
fputs(" ",fd);
fputs(Head->wordTerm.DocId.c_str(),fd);
fputs(" Location info: ",fd);
char ch[10];
itoa(Head->wordTerm.Loc,ch,10);
fputs(ch,fd);
fputs("\r\n",fd);
fclose(fd);
}
else // just need to add up the document number if there documentnumber is not the same;
{
if((_stricmp(docId.c_str(),Head->wordTerm.DocId.c_str())!=0))
{
docFre++;
docId=Head->wordTerm.DocId;
FILE *fiid=fopen("1b.txt","a");
char eh[10];
itoa(termFre,eh,10);
fputs("term frequency in this document = ",fiid);
fputs(eh,fiid);
fputs("\r\n",fiid);
termFre=1;
fputs(temp.c_str(),fiid);
fputs(" ",fiid);
fputs(Head->wordTerm.DocId.c_str(),fiid);
fputs(" Location info: ",fiid);
char ch[10];
itoa(Head->wordTerm.Loc,ch,10);
fputs(ch,fiid);
fputs("\r\n",fiid);
fclose(fiid);
}
else
{
termFre++;
FILE *fiid=fopen("1b.txt","a");
fputs(temp.c_str(),fiid);
fputs(" ",fiid);
fputs(Head->wordTerm.DocId.c_str(),fiid);
fputs(" Location info: ",fiid);
char ch[10];
itoa(Head->wordTerm.Loc,ch,10);
fputs(ch,fiid);
fputs("\r\n",fiid);
fclose(fiid);
}
}
write(Head->right) ;
}
}
/*****************to delete a tree************************************/
BiSortTree::~BiSortTree()
{
destroyTree(Head);
}
void BiSortTree::destroyTree(treeNode* head )
{
if(head!=NULL)
{
destroyTree(head->left );
destroyTree(head->right );
delete head;
}
}
// above content is to construct a tree
string filter("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.1234567890'");// to seperate the word by tokens
string stopList[45];
//when the program is ruuning, it will run the Setup funtion to read data and construct 1a and 1b files.
void Setup(string s)
{
FILE *fid=fopen(s.c_str(),"r");
char buff[200000];
while(!feof(fid))
{
fgets(buff,200000,fid);
}
fclose(fid);
string strBuff(buff);// strBuff is the origin string from text
string::size_type pos=0;
string::size_type prev_pos=strBuff.find_first_of(filter);
string::size_type temp_pos=strBuff.find_first_of(filter);
bool onoff=false;
int fg=0;
string SeperateWord;
//use a map to count word frequency
map<string,int> wordCount;
while((pos=strBuff.find_first_of(filter,pos))!=string::npos)
{
fg=0;
if (onoff)
{
prev_pos=temp_pos-1;
onoff=false;
}
++pos;
if((pos -temp_pos)!=1)
{
onoff = true;
//output the seperated word to do more
SeperateWord=strBuff.substr(prev_pos,temp_pos-prev_pos);
// to check the word if it is in the stoplist
for(int j=0;j<=39;j++)
{
if(_stricmp(SeperateWord.c_str(),stopList[j].c_str())==0)
fg=1;
}
if(fg==0)
{
//cout<<SeperateWord<<endl;
Term newTerm;
newTerm.name=SeperateWord;
newTerm.DocId=s;
newTerm.Loc=prev_pos;
//using map
string str;
str=SeperateWord;
transform(str.begin(),str.end(),str.begin(),tolower);
++wordCount[str];
tree.insertTree(newTerm);
for(int i=1;i<=20;i++)
{
if(DocToInt[i]==s)
treeArray[i].insertTree(newTerm);
}
//calculate the frequency
}
}
temp_pos=pos;
} //end of while
fg=0;
//the last word
if(prev_pos!=string::npos)
{
//to do more.
SeperateWord=strBuff.substr(prev_pos,temp_pos-prev_pos);
for(int j=0;j<=39;j++)
{
if(_stricmp(SeperateWord.c_str(),stopList[j].c_str())==0)
fg=1;
}
if(fg==0)
{
//cout<<SeperateWord<<endl;
Term newTerm;
newTerm.name=SeperateWord;
newTerm.DocId=s;
newTerm.Loc=prev_pos;
string str;
str=SeperateWord;
transform(str.begin(),str.end(),str.begin(),tolower);
++wordCount[str];
tree.insertTree(newTerm);
for(int i=1;i<=20;i++)
{
if(DocToInt[i]==s)
treeArray[i].insertTree(newTerm);
}
}
}
for(map<string,int>::iterator it=wordCount.begin();it!=wordCount.end();it++)
{
if(DocuMax[s]<(*it).second)
DocuMax[s]=(*it).second;
}
//cout<<s<<" "<<DocuMax[s]<<endl;
}
void Initiate()
{
int posStop=0;
FILE *fid=fopen("stoplist.txt","r");
while(!feof(fid))
{
char s[20];
fgets(s,20,fid);
stopList[posStop].empty();
stopList[posStop]=string(s);
stopList[posStop]=stopList[posStop].substr(0,stopList[posStop].length()-1);
posStop++;
}
fclose(fid);
//Setup(string("test.txt"));
cout<<"constructing dababase, please wait..."<<endl;
Setup(string("file01.txt"));
Setup(string("file02.txt"));
Setup(string("file03.txt"));
Setup(string("file04.txt"));
Setup(string("file05.txt"));
Setup(string("file06.txt"));
Setup(string("file07.txt"));
Setup(string("file08.txt"));
Setup(string("file09.txt"));
Setup(string("file10.txt"));
Setup(string("file11.txt"));
Setup(string("file12.txt"));
Setup(string("file13.txt"));
Setup(string("file14.txt"));
Setup(string("file15.txt"));
Setup(string("file16.txt"));
Setup(string("file17.txt"));
Setup(string("file18.txt"));
Setup(string("file19.txt"));
Setup(string("file20.txt"));
}
//according to the tree
void WordList()
{
tree.writeTree();
}
void SearchTerm(string s,int docu)
{
fre=0;
//firstPos=0;
Info1="";
treeArray[docu].searchTree(s,1,docu);
OutputTF[docu]=fre/DocuMax[DocToInt[docu]];
OutputFre[docu]=fre;
if(fre!=0)
{
ni++;
}
}
int main()
{
OutputInfo1=new string[5000];
OutputInfo2=new string[5000];
//to construct a tree for these data
Initiate();
//to set up the wordlist and the posting file name "1a.txt" and "1b.txt"
WordList();
string input;
cout<<"Please input the term you want to search,use ZZZ to end up."<<endl;
cin>>input;
while(strcmp(input.c_str(),"ZZZ")!=0)
{
tf=0;
ni=0;
idf=0;
posting=0;
for(int i=1;i<=20;i++)
{
SearchTerm(input,i);
}
if(ni==0)
cout<<"No term is founded,it maybe in StopList or not exist in the database."<<endl;
else
{
double x=N/ni;
double y=2.0;
//the idf for all the documents is the same;
idf=log(x)/log(y)+1;
cout<<posting<<" postings are found."<<endl;
cout<<endl;
int pos=1;
for(int k=1;k<=20;k++)
{
if(OutputFre[k]>0)
{
for(int m=1;m<=OutputFre[k];m++)
{
cout<<"DocuID: "<<k<<endl;
cout<<"term weightings: "<<OutputFre[k]<<", term tf: "<<OutputTF[k]<<", term idf: "<<idf<<", term tf*idf: "<<OutputTF[k]*idf<<endl;
cout<<OutputInfo1[pos];
cout<<"relational test: "<<OutputInfo2[pos]<<endl;
cout<<endl;
pos++;
}
}
}
}
cout<<"Please input the term you want to search,use ZZZ to end up."<<endl;
cin>>input;
}
return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -