⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idf.txt

📁 关于TF-IDF排序问题,对于中文文档都能用得很方便...需要的可以看一下
💻 TXT
字号:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>
using namespace std;
#define N 9							 // number of documents processed

int comminute (char *text,long lg,int number);// word segmentation routine
int fileopen(char *f,int n);		// reads the specified document file

char word[200000][22]={0};			// dictionary table: 200000 rows of 22 bytes (the original note said 160000 entries)
int frequency[200000][N]={0};		// frequency[w][d]: count of dictionary entry w in document d (N documents)
int wordleng=0;						// number of dictionary entries actually loaded

void main(int n,char *arg[])
{
   int i=0,j=0;
   int ni;
   int max[N]={0};					//存放文献使用频率最大词
   char ch;
   char *file[N]={0};				//需要检索的文献
   char *savefile;					//结果存放文档
   clock_t start0, finish0;			//程序运行时间
   double sftime0;
   start0 = clock();

   savefile="D://12345.txt";
   file[0] = "D://text1.txt";
   file[1] = "D://text2.txt";   
   file[2] = "D://text3.txt";
   file[3] = "D://text4.txt";
   file[4] = "D://text5.txt";
   file[5] = "D://text6.txt";
   file[6] = "D://text7.txt";
   file[7] = "D://text8.txt";
   file[8] = "D://text9.txt";

   FILE *cp = fopen("D:\\file2.txt","r");//词库位置

   while(!feof(cp))						 //读取词库
   {
	   ch=fgetc(cp);
		 for(i=0;ch!=13&&i<22&&ch!=10;i++)
		 {
			word[wordleng][i]=ch;
				ch=fgetc(cp);
		 } 
//     std::cout<<(word[wordleng]);		 //屏幕输出。临时
	   wordleng++;
   }
   fclose(cp);							 //关闭词库
    
   for(i=0;i<N;i++)
	   fileopen(file[i],i);				 //调用

   FILE *p = fopen(savefile,"w");		//文本输出	
	fprintf(p,"  word      ");			//输出到文档结果
	for(j=0;j<N;j++)
		 fprintf(p,"\t文%d词频\t文%d加权",j+1,j+1);
    fprintf(p,"\n");
    for(i=0;i<wordleng;i++)				//计算文献j中初始频率最大词
    {
	   for(j=0;j<N;j++)
		   if(frequency[i][j]>max[j])
			   max[j] = frequency[i][j]; 
    }
	
   for(i=0;i<wordleng;i++)				//开始输出
   {
	   ni=0;
	   for(j=0;j<N;j++)					//计算包含词i的文件数目
	   {
		   if(frequency[i][j]!=0)
			   ni = ni+1;
	   }
	   if(ni!=0)
	   {			      
		  fprintf(p,"%-12s",word[i]);		//出现的词语内容
		   for(j=0;j<N;j++)
			if(frequency[i][j]!=0)		//(frequency[i][j]*1.0/max[j])*log(N*1.0/ni)为词语加权
				fprintf(p,"\t%d\t%5.4f",frequency[i][j], (frequency[i][j]*1.0/max[j])*log(N*1.0/ni));
			else 
				fprintf(p,"\t0\t0.0000");
			fprintf(p,"\n");
	   }
	
   }
   std::cout<<"结果成功输出到文件:" <<savefile<<endl;
   finish0 = clock();
   sftime0 = (double)(finish0 - start0) / CLOCKS_PER_SEC;//计算用时
   std::cout<<"分词共用时间:"<<sftime0<<"秒."<<endl;
}

/**
 * Reads one document, keeps only its double-byte characters and hands them to
 * comminute() for dictionary matching.
 *
 * A byte with the high bit set is treated as the first byte of a two-byte
 * character and is stored together with the byte that follows it; bytes below
 * 0x80 in the leading position are skipped. The filtered text is passed to
 * comminute() in chunks of at most 1024 bytes, so a word that straddles a
 * chunk boundary is segmented as two separate pieces.
 *
 * @param f  path of the document to read
 * @param n  document index forwarded to comminute()
 * @return 0 on success, -1 if the file cannot be opened
 */
int fileopen(char *f,int n)
{
	const clock_t start = clock();

	FILE *fp = fopen(f,"r");
	if(fp==NULL)
	{
		printf("无法打开文件%s!!",f);
		return -1;						// nothing to read; do not touch the null stream
	}

	// File size, used only for the progress message.
	long length = 0;
	if(fseek(fp,0L,SEEK_END)==0)
		length = ftell(fp);
	if(length<0)
		length = 0;
	rewind(fp);
	std::cout <<"已打开"<<f<<",长度为:"<<length*1.0/1024<<"K字节.";

	const int kChunk = 1024;			// filtered bytes handed to comminute() per call
	char text[kChunk+1];				// +1 for the terminating NUL

	// fgetc() results stay in an int so EOF is never confused with a data byte
	// and the high-bit test does not depend on whether char is signed.
	int c = fgetc(fp);
	while(c!=EOF)
	{
		long used = 0;
		while(used<kChunk && c!=EOF)
		{
			if(c<0x80)					// single-byte character: skip
			{
				c = fgetc(fp);
				continue;
			}
			const int trail = fgetc(fp);
			if(trail==EOF)				// truncated character at end of file: drop it
			{
				c = EOF;
				break;
			}
			text[used++] = static_cast<char>(c);
			text[used++] = static_cast<char>(trail);
			c = fgetc(fp);
		}
		text[used] = '\0';
		if(used>0)
			comminute(text,used,n);		// only real data is passed on, never padding
	}
	fclose(fp);

	const clock_t finish = clock();
	const double sftime = static_cast<double>(finish - start) / CLOCKS_PER_SEC;
	std::cout<<"  分词用时:"<<sftime<<"秒."<<std::endl;
	return 0;
}

/**
 * Looks up `key` in the global dictionary table `word`.
 *
 * The table is assumed to be sorted in strcmp() order (the same assumption the
 * segmenter has always made); the search covers every loaded entry, including
 * the first and the last.
 *
 * @return index of a matching entry, or -1 when there is none
 */
static int find_word(const char *key)
{
	int lo = 0;
	int hi = wordleng;					// search the half-open range [lo, hi)
	while(lo<hi)
	{
		const int mid = lo+(hi-lo)/2;
		const int cmp = strcmp(key,word[mid]);
		if(cmp==0)
			return mid;
		if(cmp<0)
			hi = mid;
		else
			lo = mid+1;
	}
	return -1;
}

/**
 * Forward maximum-matching segmentation of `text` against the dictionary.
 *
 * At each position the longest candidate (up to 18 bytes) is tried first and
 * shortened by one two-byte character at a time until a dictionary entry
 * matches or only one character is left. A match increments
 * frequency[entry][number]; an unmatched single character is skipped.
 *
 * @param text    bytes to segment
 * @param lg      number of bytes in `text`
 * @param number  document index (column of `frequency`)
 * @return 0 on success, -1 if `text` is null or `number` is not a valid column
 */
int comminute (char *text,long lg,int number)
{
	const int kMaxWordBytes = 18;		// longest candidate considered
	const int columns = static_cast<int>(sizeof(frequency[0])/sizeof(frequency[0][0]));
	if(text==NULL || number<0 || number>=columns)
		return -1;

	char candidate[kMaxWordBytes+1];
	long point = 0;						// first byte not yet segmented
	while(point<lg)
	{
		long len = lg-point;
		if(len>kMaxWordBytes)
			len = kMaxWordBytes;
		memcpy(candidate,text+point,static_cast<size_t>(len));

		// Longest match first; stop shortening once a single character remains.
		for(;len>2;len-=2)
		{
			candidate[len] = '\0';
			const int idx = find_word(candidate);
			if(idx>=0)
			{
				frequency[idx][number] += 1;
				break;
			}
		}
		point += len;					// len >= 1 here, so the scan always advances
	}
	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -