📄 idf.txt
字号:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>
using namespace std;
#define N 9 // number of documents processed
int comminute (char *text,long lg,int number);// word segmentation: counts dictionary words found in a text buffer
int fileopen(char *f,int n); // opens and reads one document, feeding its contents to comminute
char word[200000][22]={0}; // dictionary table: up to 200000 entries of 22 bytes each (an earlier note here said 160000 entries)
int frequency[200000][N]={0}; // frequency[i][j] = occurrences of dictionary entry i in document j
int wordleng=0; // number of dictionary entries actually loaded
void main(int n,char *arg[])
{
int i=0,j=0;
int ni;
int max[N]={0}; //存放文献使用频率最大词
char ch;
char *file[N]={0}; //需要检索的文献
char *savefile; //结果存放文档
clock_t start0, finish0; //程序运行时间
double sftime0;
start0 = clock();
savefile="D://12345.txt";
file[0] = "D://text1.txt";
file[1] = "D://text2.txt";
file[2] = "D://text3.txt";
file[3] = "D://text4.txt";
file[4] = "D://text5.txt";
file[5] = "D://text6.txt";
file[6] = "D://text7.txt";
file[7] = "D://text8.txt";
file[8] = "D://text9.txt";
FILE *cp = fopen("D:\\file2.txt","r");//词库位置
while(!feof(cp)) //读取词库
{
ch=fgetc(cp);
for(i=0;ch!=13&&i<22&&ch!=10;i++)
{
word[wordleng][i]=ch;
ch=fgetc(cp);
}
// std::cout<<(word[wordleng]); //屏幕输出。临时
wordleng++;
}
fclose(cp); //关闭词库
for(i=0;i<N;i++)
fileopen(file[i],i); //调用
FILE *p = fopen(savefile,"w"); //文本输出
fprintf(p," word "); //输出到文档结果
for(j=0;j<N;j++)
fprintf(p,"\t文%d词频\t文%d加权",j+1,j+1);
fprintf(p,"\n");
for(i=0;i<wordleng;i++) //计算文献j中初始频率最大词
{
for(j=0;j<N;j++)
if(frequency[i][j]>max[j])
max[j] = frequency[i][j];
}
for(i=0;i<wordleng;i++) //开始输出
{
ni=0;
for(j=0;j<N;j++) //计算包含词i的文件数目
{
if(frequency[i][j]!=0)
ni = ni+1;
}
if(ni!=0)
{
fprintf(p,"%-12s",word[i]); //出现的词语内容
for(j=0;j<N;j++)
if(frequency[i][j]!=0) //(frequency[i][j]*1.0/max[j])*log(N*1.0/ni)为词语加权
fprintf(p,"\t%d\t%5.4f",frequency[i][j], (frequency[i][j]*1.0/max[j])*log(N*1.0/ni));
else
fprintf(p,"\t0\t0.0000");
fprintf(p,"\n");
}
}
std::cout<<"结果成功输出到文件:" <<savefile<<endl;
finish0 = clock();
sftime0 = (double)(finish0 - start0) / CLOCKS_PER_SEC;//计算用时
std::cout<<"分词共用时间:"<<sftime0<<"秒."<<endl;
}
/**
 * Reads one document and counts the dictionary words it contains.
 *
 * Only double-byte characters are kept: a byte with the high bit set is taken
 * as the lead byte and the byte after it as the trail byte; all other bytes
 * are skipped. The kept bytes are passed to comminute() in chunks of at most
 * 1024 bytes.
 * NOTE(review): a word that straddles a chunk boundary is not matched; this
 * limitation is inherited from the chunked design.
 *
 * @param f  path of the document to read
 * @param n  document index, forwarded to comminute()
 * @return 0 on success, -1 if the file cannot be opened.
 */
int fileopen(char *f,int n)
{
    const long CHUNK = 1024;      // bytes handed to comminute() per call
    char text[1024 + 2];          // chunk buffer on the stack: nothing to free

    const clock_t start = clock();

    FILE *fp = fopen(f,"r");
    if (fp == NULL)
    {
        printf("无法打开文件%s!!",f);
        return -1;                // nothing to read; do not touch a NULL stream
    }

    // File length, used for the progress message only.
    long length = 0;
    if (fseek(fp, 0L, SEEK_END) == 0)
    {
        length = ftell(fp);
        if (length < 0)
            length = 0;
    }
    rewind(fp);
    std::cout <<"已打开"<<f<<",长度为:"<<length*1.0/1024<<"K字节.";

    long used = 0;                // bytes currently stored in text
    int lead;                     // int so that EOF differs from byte 0xFF
    while ((lead = fgetc(fp)) != EOF)
    {
        if (lead < 0x80)
            continue;             // single-byte character: not part of any word
        const int trail = fgetc(fp);
        if (trail == EOF)
            break;                // truncated pair at end of file
        text[used++] = static_cast<char>(lead);
        text[used++] = static_cast<char>(trail);
        if (used >= CHUNK)
        {
            comminute(text, used, n);
            used = 0;
        }
    }
    if (used > 0)                 // final partial chunk
        comminute(text, used, n);

    fclose(fp);

    const clock_t finish = clock();
    const double sftime = (double)(finish - start) / CLOCKS_PER_SEC; // elapsed seconds
    std::cout<<" 分词用时:"<<sftime<<"秒."<<std::endl;
    return 0;
}
int comminute (char *text,long lg,int number)
{
char segment[22],temp[22];
int begin=0,end=wordleng-1,middle;//定位词条标记
int point=0; //已分词处标记
int i,k;
int count=0;
while(point<lg)
{
memset((char *)segment,0,22*sizeof(char));
begin=0;
end=wordleng-1;
for (i=0; i<18 && point+i<lg; i++)//读取18个字符
{
segment[i] = text[point+i];
}
// std::cout<<segment<<endl; //临时输出,调试用
if(i>0)
{
while(end-begin>10) //二分法查找,大范围定位
{
middle=(int)((begin+end)/2);
k =(int) strcmp(segment,word[middle]);
if(k<0)
end=middle;
else
begin=middle;
}
while(i>2)
{
if(begin<415) //为准确适当范围
begin=0;
else begin=begin-415;
memset((char *)temp,0,22*sizeof(char));
strncpy(temp,segment,i);
while(end-begin>1) //二分法重新定位
{
middle=(int)((begin+end)/2);
k =(int) strcmp(temp,word[middle]);
if(k<0)
end=middle;
else
begin=middle;
}
if(strcmp(temp,word[begin])==0)//与词库匹配
{
frequency[begin][number]+=1;
break;
}
else
i=i-2; //缩短字符串
}
point=point+i;i=0; //最大匹配
}
}
return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -