⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 index.cpp

📁 本系统实现了分词和倒排索引
💻 CPP
字号:

#include <stdio.h>
#include <string>
#include <fstream>
#include <iostream> //for cout 
#include <vector> //for v_tids[i]
#include <ctime> 
using namespace std;

/**
 * Fixed-capacity, in-memory inverted index keyed by integer term ID.
 *
 * Layout: m_arry[termId] points at a first-level entry (`first`) for that
 * term; each entry embeds an array of per-document records (`DocListNode`),
 * and each of those embeds an array of hit positions (`HitNode`).
 *
 * Both embedded arrays are filled starting at index 1; slot 0 is never used.
 * A term can therefore hold at most kMaxDocsPerTerm - 1 documents and a
 * document at most kMaxHitsPerDoc - 1 hits. Anything beyond that is dropped
 * instead of being written past the end of the array.
 *
 * All kMaxTerms entries are allocated eagerly in the constructor and freed
 * in the destructor. Because the object owns those allocations, copying is
 * disabled.
 */
class index
{
public:
    // Capacity limits of the fixed-size tables below.
    enum
    {
        kMaxTerms       = 1000, // valid term IDs are 0 .. kMaxTerms - 1
        kMaxDocsPerTerm = 100,  // size of first::pDocListHead (slot 0 unused)
        kMaxHitsPerDoc  = 100,  // size of DocListNode::pHitHead (slot 0 unused)
        kMaxSeenTerms   = 100   // size of the a[] "seen terms" list
    };

    // One occurrence of a term inside a document.
    struct HitNode
    {
        int      nPos;   // position of the term within the document
        HitNode* pNext;  // next-hit pointer; never set or read by this class
    };

    // Second-level node: one document that contains the term.
    struct DocListNode
    {
        int          DocID;                    // document ID
        int          nHits;                    // number of recorded occurrences
        float        fWeight;                  // term weight; never written by this class
        HitNode      pHitHead[kMaxHitsPerDoc]; // hit positions, stored at 1 .. nHits
        DocListNode* pNext;                    // never set or read by this class
    };

    // First-level node: one term.
    struct first
    {
        int         nKeyWordID;                    // term ID, or -1 while the term was never indexed
        int         nDocs;                         // number of documents recorded for the term
        DocListNode pDocListHead[kMaxDocsPerTerm]; // document records, stored at 1 .. nDocs
        first*      pNext;                         // never used by this class; initialised to NULL
        int         m_nLastDocID;                  // ID of the most recently added document, -1 initially
    };

    /**
     * Allocates one entry per possible term ID and marks each as unused.
     * Only the scalar header fields are initialised; the embedded arrays are
     * written when a term/document is first recorded.
     */
    index()
        : m_nLastDocID(0), i(0), lastpos(0), p(0)
    {
        for (int slot = 0; slot < kMaxSeenTerms; ++slot)
            a[slot] = -1;

        // Null everything first so release() is safe if an allocation throws.
        for (int term = 0; term < kMaxTerms; ++term)
            m_arry[term] = NULL;

        try
        {
            for (int term = 0; term < kMaxTerms; ++term)
            {
                first* entry = new first;
                entry->nKeyWordID   = -1;
                entry->nDocs        = 0;
                entry->pNext        = NULL;
                entry->m_nLastDocID = -1;
                m_arry[term] = entry;
            }
        }
        catch (...)
        {
            release();
            throw;
        }
    }

    ~index() { release(); }

    /**
     * Indexes one document and then prints a summary of the index to stdout.
     *
     * @param docid     ID of the document being indexed.
     * @param termid    Term IDs of the document in order of appearance; the
     *                  array index is used as the term's position. May be
     *                  NULL, in which case nothing is indexed.
     * @param termcount Number of elements in termid.
     *
     * Term IDs outside [0, kMaxTerms) are skipped; they still occupy their
     * position, so positions of later terms are unaffected.
     */
    void wordleverindexing(int docid, int *termid, int termcount)
    {
        if (termid != NULL)
        {
            for (int pos = 0; pos < termcount; ++pos)
            {
                const int term = termid[pos];
                if (term < 0 || term >= kMaxTerms)
                    continue;

                first* entry = m_arry[term];
                if (entry == NULL)
                    continue;

                // nKeyWordID stays -1 until the term is indexed for the first
                // time, so it doubles as the "seen before" flag. This removes
                // the dependency on the 100-slot a[] list for correctness.
                if (entry->nKeyWordID < 0)
                {
                    entry->nKeyWordID = term;
                    entry->nDocs = 1;
                    entry->pDocListHead[1].nHits = 0;
                    p = 1; // tells addlocation() that slot 1 is already reserved
                    rememberTerm(term);
                }

                m_arry[term] = addlocation(docid, pos, entry);
            }
        }

        dump();
    }

    /**
     * Records that the term described by onelevel occurs at `location` in
     * document `did`.
     *
     * A document different from the entry's m_nLastDocID starts a new
     * document record; otherwise the hit is appended to the current record.
     * When the member flag p is 1 the current nDocs slot is reused instead of
     * advancing (set by wordleverindexing() for a brand-new term). p is
     * always 0 on return.
     *
     * Hits that would not fit into the fixed-size arrays are dropped.
     *
     * @return onelevel, unchanged as a pointer.
     */
    first* addlocation(int did, int location, first *onelevel)
    {
        i = 0; // kept from the original: the member is reset on every call

        if (onelevel == NULL)
        {
            p = 0;
            return onelevel;
        }

        if (did != onelevel->m_nLastDocID)
        {
            // First hit of this document for the term.
            const bool slotReserved = (p == 1);
            p = 0;

            const int slot = slotReserved ? onelevel->nDocs : onelevel->nDocs + 1;
            if (slot < 1 || slot >= kMaxDocsPerTerm)
                return onelevel; // table full: m_nLastDocID is left alone so later hits of this document are dropped too

            onelevel->nDocs = slot;
            DocListNode& doc = onelevel->pDocListHead[slot];
            doc.DocID = did;
            doc.nHits = 1;
            doc.pHitHead[1].nPos = location;
            onelevel->m_nLastDocID = did;
        }
        else
        {
            // Another hit in the document that is already being recorded.
            p = 0;

            const int slot = onelevel->nDocs;
            if (slot < 1 || slot >= kMaxDocsPerTerm)
                return onelevel;

            DocListNode& doc = onelevel->pDocListHead[slot];
            doc.DocID = did;
            if (doc.nHits < 0 || doc.nHits >= kMaxHitsPerDoc - 1)
                return onelevel; // hit table full (or corrupt count): drop the hit

            ++doc.nHits;
            doc.pHitHead[doc.nHits].nPos = location;
        }

        return onelevel;
    }

private:
    // Copying would make two objects free the same entries; declared but not defined.
    index(const index&);
    index& operator=(const index&);

    // Frees every first-level entry and nulls the table.
    void release()
    {
        for (int term = 0; term < kMaxTerms; ++term)
        {
            delete m_arry[term];
            m_arry[term] = NULL;
        }
    }

    // Appends term to the a[] list of seen terms; silently does nothing once the list is full.
    void rememberTerm(int term)
    {
        for (int slot = 0; slot < kMaxSeenTerms; ++slot)
        {
            if (a[slot] == term)
                return;
            if (a[slot] < 0)
            {
                a[slot] = term;
                return;
            }
        }
    }

    // Prints, for every indexed term, its ID, document count and the first document's hits.
    void dump() const
    {
        for (int term = 0; term < kMaxTerms; ++term)
        {
            const first* entry = m_arry[term];
            if (entry == NULL || entry->nKeyWordID < 0)
                continue;

            const DocListNode& doc = entry->pDocListHead[1];
            std::cout << "KeyWordID:" << entry->nKeyWordID << '\n';
            std::cout << "nDocs:" << entry->nDocs << '\n';
            std::cout << "first doc number:" << doc.DocID << '\n';
            std::cout << "nhits of first doc:" << doc.nHits << '\n';
            for (int hit = 1; hit <= doc.nHits && hit < kMaxHitsPerDoc; ++hit)
                std::cout << "position of the term:" << doc.pHitHead[hit].nPos << '\n';
        }
        std::cout << std::flush;
    }

public:
    first* m_arry[kMaxTerms];   // first-level table, indexed by term ID
    int m_nLastDocID;           // initialised to 0; not otherwise used by this class
    int i;                      // reset to 0 by addlocation(); not otherwise used
    int lastpos;                // initialised to 0; not otherwise used by this class
    int p;                      // 1 while a brand-new term's first slot is reserved, else 0
    int a[kMaxSeenTerms];       // IDs of the first kMaxSeenTerms distinct terms seen, -1 = free slot
};//class definition
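/*
Usage from main:
index* indexer = new index();
indexer->wordleverindexing(docId, termIds, termCount); // document ID, the document's term-ID array, number of terms in the document
delete indexer;
*/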

/*
int main()
{
    /*
	string filename;
    cout << "input filename: " << endl;
    cin>> filename;
    segment(filename); 
    print(v_tids, v_terms);  //test word id, english word has no tid
    
    //new int array ,size = v_tids.size(), store element in v_tids
    int t_count = v_tids.size();
    cout << "array count: " << t_count << endl;
    int *termid = new int[t_count];
    for(int i = 0; i < t_count; ++i)
    {
        termid[i] = v_tids[i];
        cout << termid[i] << " ";
    }
	*/
    //call index.h, 20081015 add
	/*
    index indexer ;
    int docid = 1;
	int termid[5];
	int t_count=5;
	termid[0]=3;termid[1]=3;termid[2]=5;termid[3]=6;termid[4]=9;
	indexer.wordleverindexing(docid,termid,t_count); 
	int docid1 = 2;
	int termid1[5];
	int t_count1=5;
	termid1[0]=3;termid1[1]=3;termid1[2]=9;termid1[3]=6;termid1[4]=9;
    indexer.wordleverindexing(docid1,termid1,t_count1); 
    cout << "index finish..." << endl;
    return 0;
}
*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -