// File: 倒排文档.cpp
// (Code-viewer residue: "字号:" font-size control; not part of the program.)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>
using namespace std;
// One lexicon entry together with the list of text pieces it was found in.
struct cell
{
char lemma[20]; // dictionary word as a NUL-padded byte string
unsigned char *p; // posting list: p[0] is the entry count, p[1..p[0]] are piece numbers; NULL until the word is first seen
};
// Global lexicon table; main() fills the first `wordleng` slots from the dictionary file.
cell inver[200000];
// Number of lexicon entries currently stored in `inver`.
int wordleng=0;
// Segments one piece of text and records piece number N for each lexicon word found in it.
int comminute (char *text,long lg,int N);
// Hand-written stand-in for realloc, used to grow a posting list.
unsigned char *Reallmoc(unsigned char *oldp,int oldn);
// Loads the dictionary at `path` into the global `inver` table, one word per
// line (CR and LF both end a line), and counts the stored words in `wordleng`.
// Only words of 4..19 bytes are kept: shorter ones were already ignored by the
// original loader, and longer ones do not fit in `lemma` with its terminator.
// Loading stops early if the table is full.
// Returns false when the file cannot be opened.
static bool load_lexicon(const char *path)
{
    FILE *cp = fopen(path, "r");
    if (cp == NULL)
        return false;

    const int capacity = static_cast<int>(sizeof(inver) / sizeof(inver[0]));
    const int lemmaSize = static_cast<int>(sizeof(inver[0].lemma));

    // fgetc returns int so that EOF can be told apart from a 0xFF data byte.
    int c = fgetc(cp);
    while (c != EOF && wordleng < capacity)
    {
        int len = 0;
        while (c != EOF && c != 13 && c != 10)
        {
            if (len < lemmaSize - 1)            // never write past the array
                inver[wordleng].lemma[len] = static_cast<char>(c);
            len++;
            c = fgetc(cp);
        }
        if (len > 3 && len < lemmaSize)
        {
            // Wipe whatever a previously rejected line left behind in this slot.
            for (int k = len; k < lemmaSize; k++)
                inver[wordleng].lemma[k] = '\0';
            inver[wordleng].p = NULL;
            wordleng++;
        }
        if (c != EOF)
            c = fgetc(cp);                      // step past the line terminator
    }
    fclose(cp);
    return true;
}

// Reads up to `count` bytes from `fp` into `buf` and returns how many were
// stored. Even positions only accept bytes with the high bit set: any byte
// below 0x80 met there is skipped. Odd positions take the next byte as it is.
// Reading stops at end of file, so no EOF marker is ever stored in `buf`.
static int read_chunk(FILE *fp, char *buf, int count)
{
    int stored = 0;
    while (stored < count)
    {
        int c = fgetc(fp);
        if (stored % 2 == 0)
        {
            while (c != EOF && c < 0x80)
                c = fgetc(fp);
        }
        if (c == EOF)
            break;
        buf[stored++] = static_cast<char>(c);
    }
    return stored;
}

// Builds the inverted file: loads the lexicon, cuts text.txt into numbered
// pieces, runs comminute() on each piece, and writes one line per word that
// occurred (the word followed by the piece numbers) to the result file.
// Returns 0 on success and 1 when a file cannot be opened or memory runs out.
int main()
{
    const char *lexiconfile = "词库.txt";       // dictionary location
    const char *file = "text.txt";              // document to index
    const char *savefile = "倒排文档.txt";      // result file
    const int kMaxPiece = 255;                  // piece numbers are stored in one byte

    const clock_t start0 = clock();             // start of the timed run

    if (!load_lexicon(lexiconfile))
    {
        printf("无法打开文件%s!!", lexiconfile);
        return 1;
    }

    FILE *fp = fopen(file, "r");
    if (fp == NULL)
    {
        printf("无法打开文件%s!!", file);
        return 1;
    }

    // The length is measured from one byte before the end, as the original
    // did, so that piece boundaries stay where they were.
    long length = 0;
    if (fseek(fp, -1L, SEEK_END) == 0)
        length = ftell(fp);
    if (length < 0)
        length = 0;
    rewind(fp);
    std::cout << "已打开" << file << ",长度为:" << length * 1.0 / 1024 << "K字节." << std::endl;

    const int piece = static_cast<int>(2 * (length / 508));   // bytes per piece
    char *text = static_cast<char *>(malloc((piece + 20) * sizeof(char)));
    if (text == NULL)
    {
        fclose(fp);
        return 1;
    }
    memset(text, 0, (piece + 20) * sizeof(char));

    int N = 0;                                  // number of the current piece
    while (!feof(fp))
    {
        long want;
        if (length < piece)
            want = length - 10;                 // tail of the document
        else
        {
            want = piece;
            length = length - piece;
        }
        // A request of zero or fewer bytes would never reach end of file
        // (tiny documents, short tails), so always ask for at least two bytes.
        if (want < 2)
            want = 2;

        const int got = read_chunk(fp, text, static_cast<int>(want));
        if (got == 0)
            break;                              // end of file or read error

        // Later pieces share the last representable number instead of
        // wrapping around and colliding with earlier pieces.
        if (N < kMaxPiece)
            N++;

        comminute(text, got, N);                // segment this piece
        memset(text, 0, got * sizeof(char));
    }
    fclose(fp);
    free(text);

    FILE *wp = fopen(savefile, "w");
    if (wp == NULL)
    {
        printf("无法打开文件%s!!", savefile);
        return 1;
    }
    for (int i = 0; i < wordleng; i++)
    {
        if (inver[i].p != NULL)
        {
            fprintf(wp, "%-12s", inver[i].lemma);       // the word itself
            const int number = static_cast<int>(inver[i].p[0]);
            for (int j = 1; j <= number; j++)
                fprintf(wp, "\t%d", inver[i].p[j]);     // pieces it occurred in
            fprintf(wp, "\n");
        }
    }
    fclose(wp);

    std::cout << "结果成功输出到文件:" << savefile << std::endl;
    const clock_t finish0 = clock();
    const double sftime0 = static_cast<double>(finish0 - start0) / CLOCKS_PER_SEC;
    std::cout << "建立倒排文档共用时:" << sftime0 << "秒." << std::endl;
    getchar();                                  // keep the console open until a key is pressed
    return 0;
}
// Records piece number N in the posting list of lexicon entry `index`.
// The list is created on first use; afterwards N is appended unless it is
// already the last entry or the one-byte counter p[0] is full (255 entries).
// Returns false only when memory for the list could not be obtained.
static bool add_posting(int index, int N)
{
    const int kMaxCount = 255;
    // Postings are single bytes, so compare and store the byte that is kept.
    const unsigned char piece = static_cast<unsigned char>(N);
    unsigned char *list = inver[index].p;

    if (list == NULL)
    {
        list = static_cast<unsigned char *>(malloc(2 * sizeof(unsigned char)));
        if (list == NULL)
            return false;
        list[0] = 1;
        list[1] = piece;
        inver[index].p = list;
        return true;
    }

    const int number = list[0];
    if (list[number] == piece || number >= kMaxCount)
        return true;

    // Reallmoc hands back a copy with room for one more entry; what happens
    // to the old block is left to Reallmoc.
    unsigned char *grown = Reallmoc(list, number + 1);
    if (grown == NULL)
        return false;
    grown[0] = static_cast<unsigned char>(number + 1);
    grown[number + 1] = piece;
    inver[index].p = grown;
    return true;
}

// Forward maximum-matching segmentation of one piece of text.
//
// text : the bytes of the piece
// lg   : number of bytes in `text`
// N    : piece number to record for every lexicon word that is found
//
// At each position the next (at most) 18 bytes are taken as a candidate and
// looked up in the lexicon by binary search over all `wordleng` entries,
// including the last one. On a miss the candidate is shortened by two bytes
// and tried again; candidates of two bytes or fewer are not looked up. The
// position then advances by the length of the match, or by the one or two
// bytes left over when nothing matched. The lexicon is assumed to be sorted
// in strcmp order.
//
// Returns 0 normally and -1 if a posting list could not be allocated.
int comminute (char *text,long lg,int N)
{
    const int kWindow = 18;                     // longest candidate, in bytes
    char segment[22], temp[22];
    long point = 0;                             // first byte not yet segmented

    while (point < lg)
    {
        memset(segment, 0, sizeof(segment));
        int i = 0;
        for (; i < kWindow && point + i < lg; i++)
            segment[i] = text[point + i];

        while (i > 2)
        {
            memset(temp, 0, sizeof(temp));
            strncpy(temp, segment, i);

            // Find the first entry that is not less than the candidate.
            int lo = 0;
            int hi = wordleng;
            while (lo < hi)
            {
                const int mid = lo + (hi - lo) / 2;
                if (strcmp(inver[mid].lemma, temp) < 0)
                    lo = mid + 1;
                else
                    hi = mid;
            }

            if (lo < wordleng && strcmp(inver[lo].lemma, temp) == 0)
            {
                if (!add_posting(lo, N))
                    return -1;
                break;                          // longest match found
            }
            i = i - 2;                          // shorten the candidate
        }
        point = point + i;                      // i >= 1 here, so this always advances
    }
    return 0;
}
// Stand-in for realloc: returns a new block of oldn+1 bytes whose first `oldn`
// bytes are copied from `oldp` and whose last byte (index oldn) is zero.
//
// oldp : block to grow; may be NULL, in which case the new block is all zeros
// oldn : number of bytes in use in `oldp`; negative values are treated as 0
//
// On success the old block is released and must not be used again.
// On allocation failure NULL is returned and `oldp` is left untouched.
unsigned char *Reallmoc(unsigned char *oldp,int oldn)
{
    if (oldn < 0)
        oldn = 0;

    const size_t used = static_cast<size_t>(oldn);
    // calloc zero-fills, which also provides the terminating zero at [oldn].
    unsigned char *newp = static_cast<unsigned char *>(calloc(used + 1, sizeof(unsigned char)));
    if (newp == NULL)
        return NULL;

    if (oldp != NULL && used > 0)
        memcpy(newp, oldp, used);

    free(oldp);
    return newp;
}
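// --- Code-viewer residue below (keyboard shortcut help; not part of the program) ---
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full screen: F11
// Toggle theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -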