📄 tf.cpp

📁 我用容器写的文本词条tfidf权值计算程序

💻 CPP

字号:

#include <fstream>
#include <sstream>
#include <math.h>
#include <iostream>
#include <map>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
// Nominal corpus dimensions: M documents, N distinct words. main() sizes its
// tables from the actual input, so these are not hard limits.
const int M=20,N=200;

// Computes TF-IDF weights for the corpus stored in "input.txt".
//
// Input: one document per line, words separated by whitespace (the original
// note says the text is expected to be word-segmented with stop words already
// removed).
//
// Output on stdout, in this order:
//   1. the number of documents and the number of distinct words,
//   2. every distinct word, one per line, in std::map (sorted) order,
//   3. a docCount x wordCount matrix, one row per document, where each entry
//      is tf(word, doc) * log(docCount / df(word)), followed by ", ".
//
// Returns 0 on success, 1 if "input.txt" cannot be opened.
int main()
{
	std::ifstream file("input.txt");
	if (!file)
	{
		std::cerr << "cannot open input.txt" << std::endl;
		return 1;
	}

	// Pass 1: keep every line and collect the vocabulary.
	std::vector<std::string> docs;        // input lines, in file order
	std::map<std::string,int> column;     // word -> matrix column (filled in below)
	std::string line, token;
	while (std::getline(file, line))
	{
		docs.push_back(line);
		for (std::istringstream in(line); in >> token;)
			column.insert(std::make_pair(token, 0));   // no-op for words already seen
	}
	file.close();

	const int docCount = static_cast<int>(docs.size());
	const int wordCount = static_cast<int>(column.size());

	std::cout<<"文本总数："<<docCount<<std::endl;
	std::cout<<"词的总数："<<wordCount<<std::endl;

	// Number the words in sorted order and list them.
	int next = 0;
	for (std::map<std::string,int>::iterator it = column.begin(); it != column.end(); ++it)
	{
		it->second = next++;
		std::cout<<it->first<<std::endl;
	}

	// Pass 2: term frequency per document and document frequency per word.
	// Tables are sized from the data, so any corpus size is handled without
	// writing outside fixed-size arrays.
	std::vector<std::vector<float> > tf(docCount, std::vector<float>(wordCount, 0.0f));
	std::vector<int> docFreq(wordCount, 0);   // number of documents containing each word
	std::map<std::string,int> counts;         // per-document word counts, reused across documents
	for (int d = 0; d < docCount; ++d)
	{
		counts.clear();
		int total = 0;                        // number of words in this document
		for (std::istringstream in(docs[d]); in >> token;)
		{
			++total;
			++counts[token];
		}

		// Only words that occur in this document have a non-zero TF; every
		// such word is in the vocabulary because both passes read the same lines.
		for (std::map<std::string,int>::const_iterator c = counts.begin(); c != counts.end(); ++c)
		{
			const int col = column.find(c->first)->second;
			++docFreq[col];
			tf[d][col] = static_cast<float>(c->second) / total;
		}
	}

	// Emit the TF-IDF matrix. docFreq[col] >= 1 for every column because each
	// vocabulary word came from at least one document, so the division is safe.
	for (int d = 0; d < docCount; ++d)
	{
		for (int col = 0; col < wordCount; ++col)
		{
			const float tfidf = tf[d][col]*log(static_cast<float>(docCount)/docFreq[col]);
			std::cout<<tfidf<<", ";
		}
		std::cout<<std::endl;
	}
	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C

搜索代码 Ctrl + F

全屏模式 F11

切换主题 Ctrl + Shift + D

显示快捷键 ?

增大字号 Ctrl + =

减小字号 Ctrl + -