⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 倒排文档.cpp

📁 首先对中文文档建立倒排文档
💻 CPP
字号:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>
using namespace std;

// One dictionary entry together with its posting list.
struct cell
{
	// Dictionary word, stored as a NUL-padded C string.
	char lemma[20];

	// Posting list, NULL until the word is first seen in the document.
	// Byte 0 holds the number of postings; bytes 1..p[0] hold the chunk
	// numbers in which the word occurred.
	unsigned char *p;
};

// The dictionary table; only the first `wordleng` slots are in use.
cell inver[200000];

// Number of dictionary entries currently loaded into `inver`.
int wordleng = 0;

// Segments the `lg` bytes at `text` against the dictionary held in `inver`
// and records chunk number `N` for each entry that matches. Defined later
// in this file.
int comminute (char *text,long lg,int N);
// Hand-written stand-in for realloc, defined later in this file: returns a
// malloc'ed buffer of oldn+1 bytes carrying over the first `oldn` bytes of `oldp`.
unsigned char *Reallmoc(unsigned char *oldp,int oldn);


// Builds an inverted index for a document.
//
// 1. Loads the dictionary (one word per line) into `inver` / `wordleng`.
// 2. Reads the document in fixed-size chunks, numbering them from 1, and
//    hands each chunk to comminute() for dictionary matching.
// 3. Writes every dictionary word that occurred, followed by the numbers of
//    the chunks it occurred in, to the result file.
//
// Returns 0 on success and 1 when a file cannot be opened or memory cannot
// be allocated.
int main ()
{
	const char *dictfile = "词库.txt";			// dictionary location
	const char *file = "text.txt";				// document to index
	const char *savefile = "倒排文档.txt";		// where the result is written

	const int capacity = (int)(sizeof(inver) / sizeof(inver[0]));
	const int maxlemma = (int)sizeof(inver[0].lemma) - 1;	// keep room for the NUL
	const long minpiece = 64;					// chunks shorter than a word cannot match anything

	const clock_t start0 = clock();				// start of the timed section

	// ---- load the dictionary ------------------------------------------
	FILE *cp = fopen(dictfile,"r");
	if (cp == NULL)
	{
		printf("无法打开文件%s!!",dictfile);
		return 1;
	}
	int c = fgetc(cp);
	while (c != EOF && wordleng < capacity)		// one line per iteration
	{
		int len = 0;
		bool toolong = false;
		while (c != EOF && c != 13 && c != 10)
		{
			if (len < maxlemma)
				inver[wordleng].lemma[len++] = (char)c;
			else
				toolong = true;					// would overflow lemma[]; drop the entry
			c = fgetc(cp);
		}
		// Only entries longer than 3 bytes are kept, as before; lines that
		// do not fit the lemma buffer are skipped instead of overflowing it.
		if (len > 3 && !toolong)
		{
			for (; len <= maxlemma; len++)
				inver[wordleng].lemma[len] = '\0';
			inver[wordleng].p = NULL;
			wordleng++;
		}
		if (c != EOF)
			c = fgetc(cp);						// step over the line terminator
	}
	fclose(cp);

	// ---- open and measure the document --------------------------------
	FILE *fp = fopen(file,"r");
	if (fp == NULL)
	{
		printf("无法打开文件%s!!",file);
		return 1;								// nothing to index
	}
	long length = 0;
	if (fseek(fp,-1L,SEEK_END) == 0)			// fails on an empty file
		length = ftell(fp);
	if (length < 0)
		length = 0;
	rewind(fp);
	std::cout <<"已打开"<<file<<",长度为:"<<length*1.0/1024<<"K字节."<<std::endl;

	// Chunk size in bytes (always even). The "+ 1" keeps the number of
	// chunks at or below 255 so that every chunk number fits in the single
	// byte stored per posting, and keeps the size non-zero for small files.
	long piece = 2*(length/508 + 1);
	if (piece < minpiece)
		piece = minpiece;
	char *text = (char *)malloc((piece+20)* sizeof(char));
	if (text == NULL)
	{
		printf("内存分配失败!!");
		fclose(fp);
		return 1;
	}
	memset(text,0,(piece+20)*sizeof(char));

	// ---- read and index the document chunk by chunk -------------------
	int N = 0;									// number of the current chunk
	bool done = false;
	while (!done)
	{
		long filled = 0;
		while (filled < piece)
		{
			int b = fgetc(fp);
			// At even offsets skip bytes below 0x80, so each stored pair
			// starts on a high-bit byte (presumably the lead byte of a
			// double-byte character); the byte after it is taken as is.
			if (filled%2 == 0)
				while (b != EOF && b < 0x80)
					b = fgetc(fp);
			if (b == EOF)
			{
				done = true;					// the EOF marker itself is not stored
				break;
			}
			text[filled++] = (char)b;
		}
		if (filled > 0)
		{
			N++;
			comminute (text,filled,N);			// match this chunk against the dictionary
			memset(text,0,filled*sizeof(char));
		}
	}
	fclose(fp);

	// ---- write the result ---------------------------------------------
	FILE *wp = fopen(savefile,"w");
	if (wp == NULL)
	{
		printf("无法打开文件%s!!",savefile);
		free(text);
		return 1;
	}
	for (int i = 0; i < wordleng; i++)
	{
		if (inver[i].p == NULL)
			continue;							// word never occurred
		fprintf(wp,"%-12s",inver[i].lemma);
		const int number = (int)inver[i].p[0];	// postings stored for this word
		for (int j = 1; j <= number; j++)
			fprintf(wp,"\t%d",inver[i].p[j]);
		fprintf(wp,"\n");
	}
	fclose(wp);									// flush before reporting success
	free(text);

	std::cout<<"结果成功输出到文件:" <<savefile<<std::endl;
	const clock_t finish0 = clock();
	const double sftime0 = (double)(finish0 - start0) / CLOCKS_PER_SEC;
	std::cout<<"建立倒排文档共用时:"<<sftime0<<"秒."<<std::endl;
	getchar();									// keep the console open
	return 0;
}

// Looks `key` up in the dictionary inver[0 .. wordleng).
//
// Assumes the entries are sorted in strcmp() order. The search keeps the
// invariant "the match, if any, lies in [lo, hi)", with hi starting at
// wordleng so that the last entry is reachable too. When several entries
// compare equal the highest index is returned.
//
// Returns the index of the matching entry, or -1 when there is none.
static int find_lemma(const char *key)
{
	if (wordleng <= 0)
		return -1;
	int lo = 0;
	int hi = wordleng;
	while (hi - lo > 1)
	{
		const int mid = lo + (hi - lo) / 2;
		if (strcmp(key, inver[mid].lemma) < 0)
			hi = mid;
		else
			lo = mid;
	}
	return (strcmp(key, inver[lo].lemma) == 0) ? lo : -1;
}

// Appends chunk number N to the posting list of inver[idx] unless it is
// already the most recent posting.
//
// The list stores its length in byte 0 and one byte per chunk number after
// it, so it can hold at most 255 postings and N is stored modulo 256.
// Allocation failures leave the existing list untouched.
static void record_chunk(int idx, int N)
{
	const unsigned char id = (unsigned char)N;
	unsigned char *list = inver[idx].p;

	if (list == NULL)							// first occurrence of this word
	{
		list = (unsigned char *)malloc(2 * sizeof(unsigned char));
		if (list == NULL)
			return;
		list[0] = 1;
		list[1] = id;
		inver[idx].p = list;
		return;
	}

	const int count = list[0];
	if (count > 0 && list[count] == id)
		return;									// this chunk is already recorded
	if (count >= 255)
		return;									// the one-byte counter is full

	// Reallmoc hands back a copy with room for one more byte; only the
	// returned pointer is used from here on.
	unsigned char *grown = Reallmoc(list, count + 1);
	if (grown == NULL)
		return;
	grown[0] = (unsigned char)(count + 1);
	grown[count + 1] = id;
	inver[idx].p = grown;
}

// Forward maximum-matching segmentation of one chunk of text.
//
// text  the chunk to segment
// lg    number of bytes of `text` to process
// N     number of this chunk, recorded for every word found
//
// At each position the longest candidate (up to 18 bytes) is tried first
// and shortened two bytes at a time until it matches a dictionary entry or
// drops to two bytes or fewer. On a match the chunk number is recorded and
// scanning resumes after the matched word; otherwise scanning resumes after
// the one or two bytes that were left.
//
// Always returns 0.
int comminute (char *text,long lg,int N)
{
	const int window = 18;						// longest candidate, in bytes
	char segment[22];
	long point = 0;								// first byte not yet consumed

	while (point < lg)
	{
		memset(segment, 0, sizeof(segment));
		int len = 0;
		while (len < window && point + len < lg)
		{
			segment[len] = text[point + len];
			len++;
		}

		while (len > 2)
		{
			const int hit = find_lemma(segment);
			if (hit >= 0)
			{
				record_chunk(hit, N);
				break;							// longest match wins
			}
			len -= 2;							// shorten the candidate
			segment[len] = '\0';
			segment[len + 1] = '\0';
		}
		point += len;							// len >= 1 here, so this always advances
	}
	return 0;
}

// Stand-in for realloc(): grows a byte buffer by exactly one element.
//
// oldp  buffer holding at least `oldn` bytes (NULL is treated as empty)
// oldn  number of bytes to carry over into the new buffer
//
// Returns a malloc'ed buffer of oldn+1 bytes whose first `oldn` bytes are
// copied from `oldp` and whose last byte is zero. Like realloc, the old
// buffer is released on success, so callers must continue with the returned
// pointer only. Returns NULL, leaving `oldp` untouched, when `oldn` is
// negative or the allocation fails.
unsigned char *Reallmoc(unsigned char *oldp,int oldn)
{
	if (oldn < 0)
		return NULL;

	unsigned char *newp = (unsigned char *)malloc(((size_t)oldn + 1) * sizeof(unsigned char));
	if (newp == NULL)
		return NULL;

	const int carried = (oldp != NULL) ? oldn : 0;
	for (int i = 0; i < carried; i++)
		newp[i] = oldp[i];
	// Zero whatever was not copied; index oldn is the last valid slot.
	for (int i = carried; i <= oldn; i++)
		newp[i] = '\0';

	free(oldp);									// free(NULL) is a no-op
	return newp;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -