📄 gjc.cpp
字号:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include<string>
#include <fstream >
#include<process.h>
using namespace std;
const int WORD_LENGTH = 256; // Size in chars of the word buffers (wordNode::word, and the token buffer in main); a const int is preferred over #define
const int DOCUMENT_LENGTH=475; // Presumably the number of documents in the corpus, intended for an IDF computation -- TODO confirm
// One entry of the singly linked word-frequency list.
struct wordNode
{
    char word[WORD_LENGTH];   // text of the word, stored as a C string
    int iWordCount;           // number of times the word has been counted
    struct wordNode *pNext;   // next entry in the list
};
// Head pointer of the global word list; NULL while the list is empty.
wordNode *pHeader = NULL;
// Forward declarations of the helpers defined after main().
// CountWord:   bumps the occurrence count of the node for `current`.
// SearchWord:  returns the list node for `current`, appending a new node if none exists.
// PrintResult: prints every word with its count and count/sum (term frequency).
// Release:     deletes every node of the list.
void CountWord(char *current);
wordNode * SearchWord(char *current);
void PrintResult(int sum);
void Release();
// Program entry point.
//
// 1. Loads the stop-word list from D:\tyc.txt (one word per line).
// 2. Reads the whitespace-separated corpus from D:\1_result.txt and echoes it.
// 3. Drops every token that exactly matches a stop word, echoes the remaining
//    tokens and writes them, space-separated, to D:\result.txt.
// 4. Re-reads D:\result.txt, counts each word via CountWord(), prints the
//    term-frequency table via PrintResult() and frees the list via Release().
//
// Returns 0 on success and EXIT_FAILURE when any of the files cannot be
// opened (every file opened so far is closed before returning).
int main()
{
    // Capacity of each of the three word tables below; input beyond this
    // limit is ignored instead of being written past the end of the arrays.
    const int MAX_WORDS = 3000;

    std::string stop1[MAX_WORDS];   // stop words
    std::string stop2[MAX_WORDS];   // corpus tokens as read
    std::string stop3[MAX_WORDS];   // corpus tokens that survived filtering
    int count = 0;                  // used entries of stop3
    int count1 = 0;                 // used entries of stop1
    int count2 = 0;                 // used entries of stop2
    char rdBuff[100];
    int i, j;

    // ---- Load the stop-word list ------------------------------------------
    // Opened read-only: the file is never written, so "r+" only added a
    // needless requirement that the file be writable.
    FILE *fp1 = fopen("D:\\tyc.txt", "r");
    if (fp1 == NULL)
    {
        printf("can not open file!");
        return EXIT_FAILURE;
    }
    // Loop on the result of fgets() rather than on feof(): feof() only turns
    // true after a read has already failed, which used to store a stale copy
    // of the last line (or uninitialised data for an empty file).
    while (count1 < MAX_WORDS && fgets(rdBuff, sizeof(rdBuff), fp1) != NULL)
    {
        stop1[count1] = rdBuff;
        // Strip the line terminator kept by fgets() (and a stray '\r').
        const std::string::size_type eol = stop1[count1].find_first_of("\r\n");
        if (eol != std::string::npos)
            stop1[count1].erase(eol);
        count1++;
    }
    fclose(fp1);

    // ---- Read and echo the corpus -----------------------------------------
    printf("输入原文件的语料:\n");
    printf("\n");
    FILE *fp2 = fopen("D:\\1_result.txt", "r");
    if (fp2 == NULL)
    {
        printf("can not open file!");
        return EXIT_FAILURE;
    }
    // "%99s" bounds each token to rdBuff; a longer token is split into
    // several pieces instead of overflowing the buffer.
    while (count2 < MAX_WORDS && fscanf(fp2, "%99s", rdBuff) == 1)
    {
        stop2[count2] = rdBuff;
        printf("%s ", stop2[count2].c_str());
        count2++;
    }
    fclose(fp2);

    // ---- Remove stop words ------------------------------------------------
    for (i = 0; i < count2; i++)
    {
        bool isStopWord = false;
        for (j = 0; j < count1; j++)
        {
            if (stop2[i].compare(stop1[j]) == 0)
            {
                isStopWord = true;
                break;
            }
        }
        if (!isStopWord)
        {
            // count <= i < MAX_WORDS here, so this store is always in range.
            stop3[count] = stop2[i];
            count++;
        }
    }

    // ---- Echo the filtered corpus -----------------------------------------
    printf("\n");
    printf("\n");
    printf("\n去除停用词后的语料是: \n");
    for (i = 0; i < count; i++)
    {
        printf("%s ", stop3[i].c_str());
    }
    printf("\n");
    printf("\n");

    // ---- Save the filtered corpus -----------------------------------------
    FILE *fp3 = fopen("D:\\result.txt", "w+");
    if (fp3 == NULL)
    {
        printf("can not open file!");
        return EXIT_FAILURE;
    }
    printf("\n");
    printf("\n");
    printf("如果去除停用词后的语料成功存入文件中,则显示此文件的内容:\n");
    // Strictly i < count: the previous "<=" bound also emitted stop3[count],
    // one element past the filtered words (out of bounds when the table is full).
    for (i = 0; i < count; i++)
    {
        fprintf(fp3, "%s%s", stop3[i].c_str(), " ");
        printf("%s ", stop3[i].c_str());
    }
    // Must be closed (flushed) before the file is reopened for reading below.
    fclose(fp3);

    // ---- Count word frequencies in the saved file -------------------------
    char temp[WORD_LENGTH];     // scratch buffer for one word
    char wordFmt[16];           // "%<WORD_LENGTH-1>s": bounded scanf format for temp
    sprintf(wordFmt, "%%%ds", WORD_LENGTH - 1);

    FILE *fp = fopen("D:\\result.txt", "r");
    if (fp == NULL)
    {
        printf("Open file failed!!\n");
        return EXIT_FAILURE;
    }
    int count3 = 0;             // total number of words in the document
    while (fscanf(fp, wordFmt, temp) == 1)
    {
        CountWord(temp);
        count3++;
    }
    fclose(fp);

    // Total number of words in the document.
    printf("\n该文档的单词总数是:%d\n", count3);
    // Per-word statistics.
    PrintResult(count3);
    // Free the word list.
    Release();
    return 0;
}
//单词统计
void CountWord(char *current)
{
wordNode *pNode = NULL;
pNode = SearchWord(current);
if(NULL == pNode)
{
return;
}
else
{
pNode->iWordCount++;
}
}
// Finds the list node that holds `current`, appending a fresh node (with a
// count of 0) at the tail of the list when the word has not been seen yet.
//
// current: NUL-terminated word to look up. Only its first WORD_LENGTH - 1
//          characters are significant: longer words are stored truncated and
//          compared on that same prefix, so they can never overflow
//          wordNode::word nor create one node per occurrence.
// Returns the matching or newly created node, or NULL when `current` is NULL.
wordNode * SearchWord(char *current)
{
    if (current == NULL)
    {
        return NULL;
    }

    // `link` always points at the pointer that would have to be updated to
    // append a node: first the list head, then each node's pNext. This covers
    // the empty-list case without a separate branch.
    wordNode **link = &pHeader;
    while (*link != NULL)
    {
        if (strncmp((*link)->word, current, WORD_LENGTH - 1) == 0)
        {
            return *link;
        }
        link = &(*link)->pNext;
    }

    // Word not present yet: create its node and hook it onto the tail.
    wordNode *node = new wordNode;
    strncpy(node->word, current, WORD_LENGTH - 1);
    node->word[WORD_LENGTH - 1] = '\0';   // strncpy does not terminate on truncation
    node->iWordCount = 0;
    node->pNext = NULL;
    *link = node;
    return node;
}
//输出结果
void PrintResult(int sum)
{ float tf;
int div;
if(NULL == pHeader)
printf("No Word!!\n");
else
{
wordNode *pCurr = pHeader;
printf("\n");
printf("\n");
printf("******************统计词频*********************\n");
printf("词语名 该词出现的次数 词频(TF)");
printf("\n");
while(NULL != pCurr)
{
div=pCurr->iWordCount;
tf=(float)div/sum;//计算词频
printf("%s\t\t%d\t\t\t%f\n", pCurr->word, pCurr->iWordCount,tf);
pCurr = pCurr->pNext;
}
//统计idf值
for(int i=1;i<=DOCUMENT_LENGTH;i++)
{
}
}
}
void Release() //释放每个节点
{
if(NULL == pHeader)
{return; }
wordNode *pCurr = pHeader;
while( pCurr!= NULL)
{
pHeader = pCurr->pNext;
delete pCurr;
pCurr = pHeader;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -