// 📄 gjc.cpp
//
// 📁 给一篇文章
// 💻 CPP
// 字号:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <process.h>

#include <fstream>
#include <set>
#include <string>
#include <vector>
using  namespace std;
// Size of the fixed character buffer that holds one word (see wordNode::word).
// A typed constant is used here in preference to a #define.
const int WORD_LENGTH = 256;
// Presumably the number of documents in the corpus, intended for an IDF
// computation - TODO confirm; the file does not explain the value.
const int DOCUMENT_LENGTH = 475;




// One entry of the singly linked word-frequency list.
struct wordNode
{
    char      word[WORD_LENGTH];  // text of the word
    int       iWordCount;         // number of occurrences counted so far
    wordNode *pNext;              // next entry in the list
};

// Head of the global word list; NULL while the list is empty.
wordNode *pHeader = NULL;



// Forward declarations of the helpers defined after main().

// Increments the occurrence count of `current` in the word list.
void CountWord(char *current);

// Returns the list node holding `current`, appending a new node when absent.
wordNode *SearchWord(char *current);

// Prints every word with its count and its frequency relative to `sum`.
void PrintResult(int sum);

// Deletes every node of the word list.
void Release();

// Reads the stop-word list from `fp` (one word per line) into `stopWords`.
static void LoadStopWords(FILE *fp, std::set<std::string> &stopWords)
{
    char line[100];

    // Drive the loop with fgets() itself. Testing feof() before reading would
    // run the body once more after the final read failed and reuse a stale
    // (or, for an empty file, uninitialised) buffer.
    while (fgets(line, sizeof(line), fp) != NULL)
    {
        std::string word(line);

        // fgets() keeps the line terminator; strip "\n" as well as "\r\n".
        while (!word.empty() &&
               (word[word.size() - 1] == '\n' || word[word.size() - 1] == '\r'))
        {
            word.erase(word.size() - 1);
        }

        // A blank line can never equal a whitespace-delimited token.
        if (!word.empty())
        {
            stopWords.insert(word);
        }
    }
}

// Reads whitespace-separated tokens from `fp` into `tokens`, echoing each
// token to stdout as it is read.
static void ReadCorpus(FILE *fp, std::vector<std::string> &tokens)
{
    char token[100];

    // The field width keeps fscanf() from writing past the end of `token`.
    while (fscanf(fp, "%99s", token) == 1)
    {
        tokens.push_back(token);
        printf("%s  ", token);
    }
}

// Program entry point.
//
// 1. Loads the stop-word list from D:\tyc.txt.
// 2. Reads the corpus from D:\1_result.txt and removes every stop word.
// 3. Writes the remaining tokens to D:\result.txt.
// 4. Re-reads D:\result.txt, counts each word and prints the frequencies.
//
// Returns 0 on success and EXIT_FAILURE when one of the files cannot be
// opened.
int main()
{
    // --- 1. stop-word list ------------------------------------------------
    // Input files are only read, so "r" is enough ("r+" would also demand
    // write permission).
    FILE *fpStop = fopen("D:\\tyc.txt", "r");
    if (fpStop == NULL)
    {
        printf("can not open file!");
        return EXIT_FAILURE;
    }

    std::set<std::string> stopWords;
    LoadStopWords(fpStop, stopWords);
    fclose(fpStop);

    // --- 2. corpus ----------------------------------------------------------
    printf("输入原文件的语料：\n");
    printf("\n");

    FILE *fpCorpus = fopen("D:\\1_result.txt", "r");
    if (fpCorpus == NULL)
    {
        printf("can not open file!");
        return EXIT_FAILURE;
    }

    // Growable containers replace the former fixed 3000-element arrays, which
    // were indexed without any bounds check.
    std::vector<std::string> tokens;
    ReadCorpus(fpCorpus, tokens);
    fclose(fpCorpus);

    // Keep only the tokens that are not stop words.
    std::vector<std::string> kept;
    for (size_t i = 0; i < tokens.size(); ++i)
    {
        if (stopWords.count(tokens[i]) == 0)
        {
            kept.push_back(tokens[i]);
        }
    }

    // Show the filtered corpus.
    printf("\n");
    printf("\n");
    printf("\n去除停用词后的语料是： \n");
    for (size_t i = 0; i < kept.size(); ++i)
    {
        printf("%s  ", kept[i].c_str());
    }
    printf("\n");
    printf("\n");

    // --- 3. store the filtered corpus -----------------------------------------
    FILE *fpOut = fopen("D:\\result.txt", "w");
    if (fpOut == NULL)
    {
        printf("can not open file!");
        return EXIT_FAILURE;
    }

    printf("\n");
    printf("\n");
    printf("如果去除停用词后的语料成功存入文件中，则显示此文件的内容：\n");
    // Iterate over exactly kept.size() entries; the previous `m <= count`
    // condition touched one element past the last stored token.
    for (size_t i = 0; i < kept.size(); ++i)
    {
        fprintf(fpOut, "%s%s", kept[i].c_str(), "  ");
        printf("%s  ", kept[i].c_str());
    }
    fclose(fpOut);

    // --- 4. word statistics -----------------------------------------------------
    FILE *fp = fopen("D:\\result.txt", "r");
    if (fp == NULL)
    {
        printf("Open file failed!!\n");
        return EXIT_FAILURE;
    }

    // Scratch buffer for one word. The scan format is built from WORD_LENGTH
    // so that the field width always matches the buffer size.
    char temp[WORD_LENGTH];
    char wordFormat[16];
    sprintf(wordFormat, "%%%ds", WORD_LENGTH - 1);

    int count3 = 0;  // total number of words read back
    while (fscanf(fp, wordFormat, temp) == 1)
    {
        CountWord(temp);
        count3++;
    }
    fclose(fp);

    // Total number of words in the document.
    printf("\n该文档的单词总数是：%d\n", count3);

    // Per-word statistics.
    PrintResult(count3);

    // Free the word list.
    Release();

    return 0;
}

//单词统计 
void CountWord(char *current) 
{ 
   wordNode *pNode = NULL; 
   pNode = SearchWord(current); 
   if(NULL == pNode) 
   { 
     return; 
   } 
   else 
   { 
    pNode->iWordCount++; 
   } 
} 

// Finds the list node that stores `current`, creating it when necessary.
//
// current: NUL-terminated word to look up. Only the first WORD_LENGTH - 1
//          characters are significant; longer words are stored truncated and
//          matched on that prefix.
// Returns the matching node, or a newly appended node with iWordCount == 0
// when the word was not in the list yet. Returns NULL when `current` is NULL.
wordNode * SearchWord(char *current)
{
    if (NULL == current)
    {
        return NULL;
    }

    // Largest number of characters that fits in wordNode::word next to the
    // terminating NUL.
    const size_t maxLen = WORD_LENGTH - 1;

    // Walk the list through the address of each link. When the loop ends
    // without a match, ppLink designates the link (pHeader for an empty list,
    // otherwise the last node's pNext) where the new node must be attached.
    wordNode **ppLink = &pHeader;
    while (NULL != *ppLink)
    {
        // Compare at most maxLen characters so that an over-long word still
        // matches the truncated copy stored for it.
        if (0 == strncmp((*ppLink)->word, current, maxLen))
        {
            return *ppLink;
        }
        ppLink = &(*ppLink)->pNext;
    }

    // Word not present: append a fresh node at the tail.
    wordNode *pNew = new wordNode;
    // Bounded copy; strncpy() does not terminate on truncation, so do it here.
    strncpy(pNew->word, current, maxLen);
    pNew->word[maxLen] = '\0';
    pNew->iWordCount = 0;
    pNew->pNext = NULL;
    *ppLink = pNew;
    return pNew;
}

//输出结果 
void PrintResult(int sum) 
{ float tf;
  int div;
  if(NULL == pHeader) 
  printf("No Word!!\n"); 
  
  else 
  { 
    wordNode *pCurr = pHeader; 
    printf("\n");
    printf("\n");
    printf("******************统计词频*********************\n");
    printf("词语名       该词出现的次数             词频（TF）");
    printf("\n");
    while(NULL != pCurr) 
	{  
		div=pCurr->iWordCount;
		tf=(float)div/sum;//计算词频
		printf("%s\t\t%d\t\t\t%f\n", pCurr->word, pCurr->iWordCount,tf); 
        pCurr = pCurr->pNext; 
	} 
    //统计idf值
    for(int i=1;i<=DOCUMENT_LENGTH;i++)
	{
	
	
	
	
	}

  } 

} 

void Release() //释放每个节点
{ 
	if(NULL == pHeader) 
	{return; }

    wordNode *pCurr = pHeader; 
    while( pCurr!= NULL) 
	{ 
      pHeader = pCurr->pNext; 
      delete pCurr; 
      pCurr = pHeader; 
	} 
}
// ⌨️ 快捷键说明
//
// 复制代码 Ctrl + C
// 搜索代码 Ctrl + F
// 全屏模式 F11
// 切换主题 Ctrl + Shift + D
// 显示快捷键 ?
// 增大字号 Ctrl + =
// 减小字号 Ctrl + -