// index.cpp
// Font size:
#include <stdio.h>
#include <string>
#include <fstream>
#include <iostream> //for cout
#include <vector> //for v_tids[i]
#include <ctime>
using namespace std;
/**
 * Fixed-capacity, in-memory inverted index: term ID -> documents -> hit positions.
 *
 * One `first` node is allocated per possible term ID (0..kMaxTerms-1) when the
 * object is constructed and released when it is destroyed.  A node whose
 * nKeyWordID is -1 has not been indexed yet.
 *
 * Document slots (pDocListHead) and hit slots (pHitHead) are used 1-based;
 * slot 0 of both arrays is never touched.  Postings that would not fit into
 * these fixed arrays are dropped instead of being written out of bounds.
 *
 * NOTE(review): a document ID of -1 collides with the "no document yet"
 * sentinel stored in first::m_nLastDocID; callers should use IDs >= 0.
 */
class index
{
public:
    /// Capacities of the fixed-size tables used by the index.
    enum
    {
        kMaxTerms = 1000,     ///< slots in m_arry; valid term IDs are 0..999
        kMaxSeenTerms = 100,  ///< slots in the seen-term table `a`
        kMaxDocsPerTerm = 99, ///< usable slots 1..99 of first::pDocListHead
        kMaxHitsPerDoc = 99   ///< usable slots 1..99 of DocListNode::pHitHead
    };

    /// Hit node: one occurrence of a term inside a document.
    struct HitNode
    {
        int nPos;       ///< position of the occurrence (index into the term array)
        HitNode *pNext; ///< next-hit pointer (set to NULL by this class, never followed)
    };

    /// Second-level node: postings of one term in one document.
    struct DocListNode
    {
        int DocID;             ///< document ID
        int nHits;             ///< number of occurrences recorded for this document
        float fWeight;         ///< term weight in the document (set to 0 by this class)
        HitNode pHitHead[100]; ///< hit slots, used 1-based (1..nHits)
        DocListNode *pNext;    ///< next-document pointer (set to NULL, never followed)
    };

    /// First-level node: everything known about one term.
    struct first
    {
        int nKeyWordID;                ///< term ID, or -1 while the term is unseen
        int nDocs;                     ///< number of documents recorded for this term
        DocListNode pDocListHead[100]; ///< document slots, used 1-based (1..nDocs)
        first *pNext;                  ///< next-term pointer (set to NULL, never followed)
        int m_nLastDocID;              ///< ID of the last document added, -1 if none
    };

    /**
     * Allocates one node per term ID and marks every node as "unseen".
     * Only the scalar bookkeeping fields are written here; the large posting
     * arrays are filled in as documents are indexed.
     */
    index() : m_nLastDocID(0), i(0), lastpos(0), p(0)
    {
        for (int t = 0; t < kMaxTerms; ++t)
        {
            first *node = new first;
            node->nKeyWordID = -1;
            node->nDocs = 0;
            node->pNext = NULL;
            node->m_nLastDocID = -1;
            m_arry[t] = node;
        }
        for (int s = 0; s < kMaxSeenTerms; ++s)
        {
            a[s] = -1;
        }
    }

    /// Releases every node allocated by the constructor.
    ~index()
    {
        for (int t = 0; t < kMaxTerms; ++t)
        {
            delete m_arry[t];
        }
    }

    /**
     * Indexes one document and then prints the whole index to stdout.
     *
     * @param docid     ID of the document being indexed.
     * @param termid    Term IDs of the document in order of appearance; the
     *                  array index of a term is recorded as its position.
     * @param termcount Number of entries in @p termid.
     *
     * Term IDs outside 0..kMaxTerms-1 are reported on stderr and skipped; they
     * still consume a position so the remaining positions stay aligned with
     * the input array.  A NULL @p termid indexes nothing.
     */
    void wordleverindexing(int docid, int *termid, int termcount)
    {
        if (termid != NULL)
        {
            for (int pos = 0; pos < termcount; ++pos)
            {
                const int term = termid[pos];
                if (term < 0 || term >= kMaxTerms)
                {
                    std::cerr << "index: term id " << term
                              << " is out of range, skipped" << std::endl;
                    continue;
                }

                first *node = m_arry[term];
                if (node->nKeyWordID < 0) // first time this term is seen
                {
                    rememberterm(term);
                    node->nKeyWordID = term;
                    node->nDocs = 1;
                    node->pDocListHead[1].nHits = 0;
                    p = 1; // tells addlocation that document slot 1 is already reserved
                }
                addlocation(docid, pos, node);
            }
        }
        dumpindex();
    }

    /**
     * Records one occurrence of a term.
     *
     * @param did      Document ID the occurrence belongs to.
     * @param location Position of the occurrence inside the document.
     * @param onelevel Node of the term; returned unchanged (NULL is tolerated).
     * @return @p onelevel.
     *
     * If @p did differs from the node's last document a new document slot is
     * opened (or, when the flag `p` is 1, the slot reserved by
     * wordleverindexing is used); otherwise the hit is appended to the current
     * document.  Occurrences that exceed kMaxDocsPerTerm / kMaxHitsPerDoc are
     * dropped.
     */
    first *addlocation(int did, int location, first *onelevel)
    {
        i = 0; // legacy scratch member, reset on every call as before
        if (onelevel == NULL)
        {
            return onelevel;
        }

        if (did != onelevel->m_nLastDocID) // first occurrence in this document
        {
            int slot = onelevel->nDocs;
            if (p == 1)
            {
                p = 0; // slot was already counted when the term was created
            }
            else
            {
                slot = onelevel->nDocs + 1; // document frequency grows by one
            }
            if (slot < 1 || slot > kMaxDocsPerTerm)
            {
                return onelevel; // document table full: drop instead of overflowing
            }

            onelevel->nDocs = slot;
            DocListNode &doc = onelevel->pDocListHead[slot];
            doc.DocID = did;
            doc.nHits = 1;
            doc.fWeight = 0.0f;
            doc.pNext = NULL;
            doc.pHitHead[1].nPos = location;
            doc.pHitHead[1].pNext = NULL;
            onelevel->m_nLastDocID = did;
        }
        else // another occurrence in the current document
        {
            p = 0;
            const int slot = onelevel->nDocs;
            if (slot < 1 || slot > kMaxDocsPerTerm)
            {
                return onelevel;
            }

            DocListNode &doc = onelevel->pDocListHead[slot];
            if (doc.nHits < 0 || doc.nHits >= kMaxHitsPerDoc)
            {
                return onelevel; // hit table full: drop instead of overflowing
            }
            ++doc.nHits; // term frequency in the document grows by one
            doc.DocID = did;
            doc.pHitHead[doc.nHits].nPos = location;
            doc.pHitHead[doc.nHits].pNext = NULL;
        }
        return onelevel;
    }

public:
    first *m_arry[kMaxTerms]; ///< one node per term ID
    int m_nLastDocID, i, lastpos, p;
    int a[kMaxSeenTerms];     ///< term IDs in order of first appearance, -1 = free slot

private:
    // The object owns the nodes in m_arry, so copying is disabled
    // (declared but intentionally not defined).
    index(const index &);
    index &operator=(const index &);

    /// Appends a term ID to the seen-term table `a` if it is new and a slot is free.
    void rememberterm(int term)
    {
        for (int s = 0; s < kMaxSeenTerms; ++s)
        {
            if (a[s] == term)
            {
                return;
            }
            if (a[s] < 0)
            {
                a[s] = term;
                return;
            }
        }
    }

    /// Prints every indexed term together with the postings of its first document.
    void dumpindex() const
    {
        for (int t = 0; t < kMaxTerms; ++t)
        {
            const first *node = m_arry[t];
            if (node == NULL || node->nKeyWordID < 0)
            {
                continue; // term never indexed
            }
            const DocListNode &doc = node->pDocListHead[1];
            std::cout << "KeyWordID:" << node->nKeyWordID << '\n';
            std::cout << "nDocs:" << node->nDocs << '\n';
            std::cout << "first doc number:" << doc.DocID << '\n';
            std::cout << "nhits of first doc:" << doc.nHits << '\n';
            for (int z = 1; z <= doc.nHits; ++z)
            {
                std::cout << "position of the term:" << doc.pHitHead[z].nPos << '\n';
            }
        }
        std::cout.flush();
    }
}; // class index
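/*
Usage from main:
    index indexer;
    indexer.wordleverindexing(docId, termIds, termCount);
where docId is the document number, termIds is the array of term IDs produced
by segmenting that document, and termCount is the number of terms in it.
*/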
/*
int main()
{
/*
string filename;
cout << "input filename: " << endl;
cin>> filename;
segment(filename);
print(v_tids, v_terms); //test word id, english word has no tid
//new int array ,size = v_tids.size(), store element in v_tids
int t_count = v_tids.size();
cout << "array count: " << t_count << endl;
int *termid = new int[t_count];
for(int i = 0; i < t_count; ++i)
{
termid[i] = v_tids[i];
cout << termid[i] << " ";
}
*/
//call index.h, 20081015 add
/*
index indexer ;
int docid = 1;
int termid[5];
int t_count=5;
termid[0]=3;termid[1]=3;termid[2]=5;termid[3]=6;termid[4]=9;
indexer.wordleverindexing(docid,termid,t_count);
int docid1 = 2;
int termid1[5];
int t_count1=5;
termid1[0]=3;termid1[1]=3;termid1[2]=9;termid1[3]=6;termid1[4]=9;
indexer.wordleverindexing(docid1,termid1,t_count1);
cout << "index finish..." << endl;
return 0;
}
*/
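// Keyboard shortcuts of the code viewer (page residue, not part of the program):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -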