📄 kmeans.cpp
字号:
// kmeans.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "math.h"
#include "kmeans.h"
#include <fstream>
using namespace std;
// Debug-build boilerplate: in _DEBUG builds `new` is redefined to DEBUG_NEW
// and THIS_FILE holds this source file's name.
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
/////////////////////////////////////////////////////////////////////////////
// The one and only application object
CWinApp theApp;
//
// Exclusive upper bounds that _tmain applies to the user and item ids read
// from the test file; records with ids at or above them are not scored.
// NOTE(review): the "+1" presumably accounts for 1-based ids -- confirm.
const int usersum = 900+1;// total number of users
const int itemsum =1600+1;// total number of recommended items
// Input paths for the current data set; _tmain fills them in with
// CString::Format on every pass.
CString basefile;// = "D:\\experiment\\data\\u3.base";
CString testfile;// = "D:\\experiment\\data\\u3.test";
// Output files opened by _tmain: staticPath receives results and statistics,
// resultsPath receives the timing log.
CString staticPath = "D:\\experiment\\static.txt";
CString resultsPath = "D:\\experiment\\time.txt";
// Cluster file paths.
// NOTE(review): not referenced in the visible part of this file.
CString clusterPath_5 = "D:\\experiment\\cluster_5.base";
CString clusterPath_1 = "D:\\experiment\\cluster_1.base";
CString clusterPath_2 = "D:\\experiment\\cluster_2.base";
CString clusterPath_3 = "D:\\experiment\\cluster_3.base";
CString clusterPath_4 = "D:\\experiment\\cluster_4.base";
// FILE * fp;
// One similarity result: a similarity value paired with the user it refers to.
struct _simi
{
float value;// similarity value
int num; // number (id) of the similar user
};
// Per-user record with one slot per user cluster (CLUSTER_NUM slots).
// The global Puser[] holds one of these for every user index.
struct detailUserInfo
{
// PU[k] is the weight evaluateALL() applies to user cluster k for this user;
// presumably the probability that the user belongs to cluster k -- confirm.
float PU[CLUSTER_NUM];
// NOTE(review): presumably a per-cluster count; only zeroed in the visible code.
int NUM[CLUSTER_NUM];
};
// Per-item record with one slot per movie cluster (MOVIE_CLUSTER_NUM slots).
// The global Pmovie[] holds one of these for every item index.
struct detailMovieInfo
{
// PI[l] is the weight evaluateALL() and getUserMovieRelation() apply to
// movie cluster l for this item; presumably the probability that the item
// belongs to cluster l -- confirm.
float PI[MOVIE_CLUSTER_NUM];
// NOTE(review): presumably a per-cluster count; only zeroed in the visible code.
int NUM[MOVIE_CLUSTER_NUM];
};
// Earlier fixed-size declarations, kept for reference; they show the
// dimensions the heap-allocated replacements below are expected to have.
// long trainuser[TRAIN_USER_NUM][ITEM_NUM]={0}; // training-set user/item rating matrix
// long testUser[TEST_USER_NUM][ITEM_NUM]={0};
// long movie[ITEM_NUM][TEST_USER_NUM]={0};
//
//
// float cluster[CLUSTER_NUM][ITEM_NUM]={0};
// float movieCluster[MOVIE_CLUSTER_NUM][TEST_USER_NUM]={0};
// Heap-allocated matrices.
// NOTE(review): presumably allocated by loadMemory() with the dimensions
// listed above -- confirm against its definition.
long ** trainuser;
long ** movie;
float **cluster;
float ** movieCluster;
// RUI[k][l]: value relating user cluster k to movie cluster l; filled by
// getUserMovieRelation() and read by evaluateALL().
float RUI[CLUSTER_NUM][MOVIE_CLUSTER_NUM];
float evaluate[TEST_USER_NUM][ITEM_NUM];// stores user i's rating of movie j, r(i,j); filled by evaluateALL()
float PU[CLUSTER_NUM];// proportion taken by category i
float PI[MOVIE_CLUSTER_NUM];// proportion taken by category i
int INUM[MOVIE_CLUSTER_NUM];// member count of each category, used to compute the probabilities
int UNUM[CLUSTER_NUM];
detailMovieInfo Pmovie[ITEM_NUM];// probability of belonging to each category
detailUserInfo Puser[TEST_USER_NUM];
// NOTE(review): the original comment says "total count of the test set",
// which does not match the variable name -- confirm which is meant.
int totalTrainNum=0;// total count of the test set
float recomend[ITEM_NUM];
// Mean absolute error and number of scored test records; both are reset and
// recomputed by _tmain for every data set.
float MAE = 0;
int recomItemCount = 0;
CString clusterFilePath;
float tempCluster[ITEM_NUM][ITEM_NUM];// stores the running sums for the current clustering
float tempData[MAX_CLUSTER_NUM]={0};
int tempNum[MAX_CLUSTER_NUM]={0};// stores the member count of each current cluster
int tempNum1[MAX_CLUSTER_NUM][ITEM_NUM];
// Forward declarations. Only evaluateALL, initial and getUserMovieRelation
// are defined in the visible part of this file; calculate is cut off mid-body
// and the others do not appear at all.
// _tmain passes Buf_UIR one text line of the test file plus three output
// pointers (user id, item id, rating); the meaning of its return value is
// not visible here.
int Buf_UIR(char* buf,int *user, int *item, int *rate);
float Simility(long* Ua, float*Ub);
float SimilityPerson(long* Ua, float*Ub);
void loadMemory();// allocate the dynamic memory
void freeMemory();
int maxValue(float data[],int len);
BOOL calculate(float** center,long **test,int flag,int op);
BOOL UpdateClusterCenter(int flag,int trainTimes);// train the cluster centres for users or movies (flag) over trainTimes iterations
// NOTE(review): the original comment below mentions `test` and `flag`
// parameters that this signature does not have -- it looks stale.
BOOL readFile(CString path);// read data from the file at path into test; if flag==1 read user information, otherwise movie information
void initialCluster();
void initial();
void evaluateALL();
void getUserMovieRelation();
void evaluateALL()
{
float temp;
int i,j,k,l;
for (i=1;i<TEST_USER_NUM;i++)
{
temp =0;
for (j=1;j<ITEM_NUM;j++)
{
temp =0;
for (k =0;k<CLUSTER_NUM;k++)
{
for (l=0;l<MOVIE_CLUSTER_NUM;l++)
{
temp+=Puser[i].PU[k]*Pmovie[j].PI[l]*RUI[k][l];//*PU[k]*PI[l];
}
}
evaluate[i][j]=temp;
// cout<<temp<<" ";
}
// cout<<endl;
}
return;
}
void initial()
{
int i,j;
for (i=0;i<TEST_USER_NUM;i++)
{
for (j=0;j<CLUSTER_NUM;j++)
{
Puser[i].PU[j]=0;
Puser[i].NUM[j]=0;
}
}
for (i=0;i<ITEM_NUM;i++)
{
for (j=0;j<MOVIE_CLUSTER_NUM;j++)
{
Pmovie[i].PI[j]=0;
Pmovie[i].NUM[j]=0;
}
}
for (i=0;i<CLUSTER_NUM;i++)
{
for (j=0;j<ITEM_NUM;j++)
{
cluster[i][j]=0;
}
}
for (i=0;i<MOVIE_CLUSTER_NUM;i++)
{
for (j=0;j<TEST_USER_NUM;j++)
{
movie[i][j]=0;
}
}
for (i=0;i<TEST_USER_NUM;i++)
{
for (j=0;j<ITEM_NUM;j++)
{
evaluate[i][j]=0;
}
}
}
// Fills RUI[i][j] for every user cluster i and movie cluster j with a
// weighted average of the user-cluster centre's per-item values:
//
//     RUI[i][j] =        sum_k  cluster[i][k] * Pmovie[k].PI[j]
//                 -------------------------------------------------
//                 sum over k with |cluster[i][k]| > 0.1 of Pmovie[k].PI[j]
//
// Items are indexed from 1. When the denominator is zero RUI[i][j] is set
// to 0. Every non-zero-denominator result is also echoed to stdout.
void getUserMovieRelation()
{
    int i, j, k;

    for (i = 0; i < CLUSTER_NUM; i++) // handle each user cluster in turn
    {
        for (j = 0; j < MOVIE_CLUSTER_NUM; j++)
        {
            // The accumulators belong to one (i, j) pair. Resetting them only
            // once per user cluster would fold the totals of movie clusters
            // 0..j-1 into RUI[i][j].
            float rate = 0;
            float sum = 0;

            for (k = 1; k < ITEM_NUM; k++)
            {
                rate += cluster[i][k] * Pmovie[k].PI[j];
                // Only items for which the centre carries a value count
                // towards the normaliser. The centre is indexed by item (k),
                // not by movie cluster (j).
                if (fabs(cluster[i][k]) > 0.1)
                {
                    sum += Pmovie[k].PI[j];
                }
            }

            if (sum)
            {
                RUI[i][j] = rate / sum;
                cout<<RUI[i][j]<<endl;
            }
            else
            {
                RUI[i][j] = 0;
            }
        }
    }
}
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
ofstream fouts;
ofstream foutT;
fouts.open(staticPath,ios::out);
foutT.open(resultsPath,ios::out);
int nRetCode = 0;
int cont=1;
DWORD times =10;
DWORD proStartTime=0;
DWORD seconds=0;
DWORD currentTime =0;
DWORD startTime=0;
int usernum;
int itemnum;
int rate;
int i;
int trainTimes;
DWORD anzhsoft=GetCurrentTime()/1000;
// initialize MFC and print and error on failure
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: change error code to suit your needs
cerr << _T("Fatal Error: MFC initialization failed") << endl;
nRetCode = 1;
return nRetCode;
}
int testDataNum =0;
while (testDataNum<5)
{
loadMemory();
proStartTime = GetCurrentTime()/1000;
testDataNum++;
basefile.Format("D:\\experiment\\data\\u%d.base",testDataNum);
testfile.Format("D:\\experiment\\data\\u%d.test",testDataNum);
fouts<<"修正余弦相关相似性的算数平均"<<endl;
fouts<<"第"<<testDataNum<<"个测试集"<<endl;
startTime =GetCurrentTime()/1000;
trainTimes=10;
// ZeroMemory(trainuser,sizeof(trainuser));
// ZeroMemory(movie,sizeof(movie));
// ZeroMemory(cluster,sizeof(cluster));
// ZeroMemory(movieCluster,sizeof(movieCluster));
initial();
// 首先对用户进行聚类
readFile(basefile);
initialCluster();
//测试中心节点是否赋值
/*
float ff=0;
for (j =1;j<CLUSTER_NUM;j++)
{
for (i=1;i<ITEM_NUM;i++)
{
ff+=cluster[j][i];
if (cluster[j][i]>0)
{
cout<<cluster[j][i]<<endl;
}
// if (trainuser[j][i]!=0)
{
// cout<<"1"<<endl;
}
}
}
fouts<<ff<<endl;
cout<<ff<<endl;
break;*/
currentTime = GetCurrentTime()/1000;
trainTimes =50;
foutT<<"训练user用时";
currentTime = GetCurrentTime()/1000;
UpdateClusterCenter(FLAG_USER,trainTimes);
int i,j;
// for (i=0;i<CLUSTER_NUM;i++)
// {
// for(j=0;j<ITEM_NUM;j++)
// {
// fouts<<cluster[i][j]<<" ";
// }
// fouts<<endl;
// }
// fouts<<"user 每类所有的人数\n";
// for ( j =0;j<CLUSTER_NUM;j++)
// {
// fouts<<tempNum[j]<<endl;
// }
// break;
foutT<<GetCurrentTime()/1000-currentTime<<endl;
foutT<<"训练movie用时";
currentTime = GetCurrentTime()/1000;
calculate((float**)cluster,(long**)trainuser,FLAG_USER,COS);
UpdateClusterCenter(FLAG_MOVIE,trainTimes);
foutT<<GetCurrentTime()/1000-currentTime<<endl;
// fouts<<"Moive"<<endl;
// for (i=0;i<MOVIE_CLUSTER_NUM;i++)
// {
// for(j=0;j<TEST_USER_NUM;j++)
// {
// fouts<<movieCluster[i][j]<<" ";
// }
// fouts<<endl;
// }
// fouts<<"movie 每类所有的总数\n";
// for ( j =0;j<MOVIE_CLUSTER_NUM;j++)
// {
// fouts<<tempNum[j]<<endl;
// }
//
foutT<<"计算用户概率用时";
currentTime=GetCurrentTime()/1000;
foutT<<GetCurrentTime()/1000-currentTime<<endl;
foutT<<"计算movie概率用时";
currentTime = GetCurrentTime()/1000;
calculate(movieCluster,movie,FLAG_MOVIE,COS);
/* foutT<<GetCurrentTime()/1000-currentTime<<endl;
fouts<<"******************************************user 各类的比率"<<endl;
for (int i=0;i<CLUSTER_NUM;i++)
{
fouts<<PU[i]<<endl;
}
fouts<<"******************************************movie 各类的比率"<<endl;
for (i=0;i<MOVIE_CLUSTER_NUM;i++)
{
fouts<<PI[i]<<endl;
}
fouts<<"******************************************用户与各类的比率"<<endl;
for (i=0;i<CLUSTER_NUM;i++)
{
fouts<<Puser[190].PU[i]<<endl;
}
for (i=0;i<TEST_USER_NUM;i++)
{
for (j=0;j<CLUSTER_NUM;j++)
{
fouts<<Puser[i].PU[j]<<" ";
}
fouts<<endl;
}
*/
getUserMovieRelation();
fouts<<"RUI"<<endl;
float temp;
// for (i=0;i<CLUSTER_NUM;i++)
// {
// temp =0;
// for (j=0;j<MOVIE_CLUSTER_NUM;j++)
// {
// fouts<<RUI[i][j]<<" ";
// temp+=RUI[i][j];
// }
// fouts<<endl;
// fouts<<temp<<endl;
// }
evaluateALL();
FILE *fp=fopen(testfile,"r");
if (!fp)
{
cout<<"open testfile failed";
return nRetCode;
}
char tmpbuf[101];
MAE =0;
recomItemCount =0;
while (!feof(fp))
{
fgets(tmpbuf,100,fp);
Buf_UIR(tmpbuf,&usernum,&itemnum,&rate);
if (usernum<usersum&&itemnum<itemsum)
{
MAE += fabs(evaluate[usernum][itemnum]-rate);
recomItemCount++;
}
}
fclose(fp);
MAE = MAE/recomItemCount;
cout<<endl<<"MAE:"<<MAE<<" 推介项"<<recomItemCount<<endl;
fouts<<endl<<"MAE:"<<MAE<<" 推介项"<<recomItemCount<<endl;
// break;
fouts<<"将数据聚类历时"<<currentTime-startTime<<"秒"<<endl;
startTime = GetCurrentTime()/1000;
fouts<<"获得各个类别的相似性历时"<<GetCurrentTime()/1000-startTime<<endl;
startTime = GetCurrentTime()/1000;
//用户相关度由高到低排序
fouts<<"将用户的相似性排序总计耗时"<<GetCurrentTime()/1000-startTime<<endl;
testfile.Format("D:\\experiment\\data\\u%d.test",testDataNum);
startTime = GetCurrentTime()/1000;
fouts<<"用户推荐总计耗时"<<GetCurrentTime()/1000-proStartTime<<endl;
// freeMemory();
}
fouts<<"程序运行总的时间"<<GetCurrentTime()/1000-anzhsoft<<endl;
cout<<"程序运行总的时间"<<GetCurrentTime()/1000-anzhsoft<<endl;
foutT.close();
fouts.close();
// cin>>i;
//
return nRetCode;
}
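// Runs one verification pass, used to update the cluster centre nodes.
// Computes the probability of each user belonging to each category, and the share taken by each group.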
BOOL calculate(float** center,long **test,int flag,int op)
{
int clusterNum,itemNum,totalNum,i,j;
if (FLAG_USER==flag)
{
clusterNum = CLUSTER_NUM;
itemNum = ITEM_NUM;
totalNum = TRAIN_USER_NUM;
}
else if (FLAG_MOVIE == flag)
{
clusterNum = MOVIE_CLUSTER_NUM;
itemNum = TRAIN_USER_NUM;
totalNum = ITEM_NUM;
}
float*tempData= new float[clusterNum+1];//存储不同类别所占的概率
float*totalData = new float[clusterNum+1];
float temp;
ZeroMemory(totalData,sizeof(totalData));
// 首先获得每类所占的比率,顺序不能变。
temp=0;
for (i=0;i<clusterNum;i++)
{
temp+=tempNum[i];
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -