📄 c_kmeans.h
字号:
// my kmeans Class
#ifndef _C_KMEANS_H__
#define _C_KMEANS_H__
#define DELTA 1
struct cluster
{
int size; //类成员个数.
void * center;
void * newcenter;
//double radius; //
int * members; //子成员在buf中的 id 队列。
};
template <class T>
class k_means
{
private:
int m_GroupNum; //分组个数
int m_VectorSum; //向量总数
int m_VectorSize; //向量维数
int m_CaculateTimes; //重新计算聚类中心次数
struct cluster* m_clusters; //各类信息
void* m_src; //初始向量资源
public:
k_means();
~k_means();
int GetCaculateTimes()
{
return m_CaculateTimes;
}
int GetLargestClusterId();
void ExportSeeds(T* seeds);
void SplitCluster(int SrcCluster, int newCluster);
void CombineClusters(int Cluster1, int Cluster2);
void MakeFirstRoundCenters();
bool MakeClusterCenter(int cluster_id);
int init(T* data, int dim , int g_num, int P_num); //初始化分组。
int refresh_clusters(); //重新聚类,找出新的中心.
bool check_steady(); //检查是否已达稳定状态.
int show_result(); //show 结果出来.
};
//初始化码本.
template <class T>
int k_means<T>::init(T* data, int dim , int g_num, int P_num)
{
//int id1 , id2; //记录两个相距最远的点
m_GroupNum = g_num;
m_VectorSum = P_num;
m_VectorSize = dim;
m_CaculateTimes = 0;
m_clusters = new struct cluster[m_GroupNum];
m_src = data;
for (int i = 0; i < m_GroupNum; i++)
{
m_clusters[i].center = new T[m_VectorSize];
m_clusters[i].newcenter = new T[m_VectorSize];
m_clusters[i].members = new int[P_num];
m_clusters[i].size = 0;
}
MakeFirstRoundCenters();
return 0;
};
template <class T>
k_means<T>::k_means()
{
m_GroupNum = 0;
m_VectorSum = 0;
m_VectorSize = 0;
m_CaculateTimes = 0;
m_src = NULL;
m_clusters = NULL;
}
//析构,释放申请的内存;
template <class T>
k_means<T>::~k_means()
{
for (int i = 0 ;i < m_GroupNum; i++)
{
if (m_clusters)
{
delete [] m_clusters[i].center;
delete [] m_clusters[i].members;
delete [] m_clusters[i].newcenter;
}
}
delete [] m_clusters;
}
//判断是否已达稳定状态.
//
//
template <class T>
bool k_means<T>::check_steady()
{
for (int i = 0 ;i < m_GroupNum;i ++)
{
for (int j = 0; j < m_VectorSize; j++)
{
if (((T*)(m_clusters[i].center))[j] != ((T*)(m_clusters[i].newcenter))[j])
{
return false;
}
}
/*
if (dis((T*)m_clusters[i].center, (T*)m_clusters[i].center ,m_VectorSize ) > 0.01)
{
return false;
}
*/
}
return true;
}
template <class T>
int k_means<T>::refresh_clusters()
{
T* data;
double distance, dis_min;
int groupId; //应该放到哪个组里面
data = (T*)m_src;
//各组元素个数置0;
for (int i = 0; i < m_GroupNum; i ++)
{
m_clusters[i].size = 0;
}
for (int pointid = 0; pointid < m_VectorSum; pointid++) //遍历点阵进行分组
{
//与各个组重心比较
for (int j = 0; j < m_GroupNum; j++)
{
distance = (double)dis( &(((T*)m_src)[pointid * m_VectorSize]),(T*)(m_clusters[j].newcenter),m_VectorSize);
if (distance < dis_min || j == 0)
{
dis_min = distance;
groupId = j;
}
}
m_clusters[groupId].members[m_clusters[groupId].size] = pointid;
m_clusters[groupId].size++; //加入一个成员。
}
//计算各个分组的中心.
//double d_sum; //保存各维的坐标总和.
for (int gid = 0; gid < m_GroupNum; gid++)
{
if (m_clusters[i].size != 0)
{
MakeClusterCenter(gid);
}
}
//如果出现空组,then分裂大组。
for (gid = 0; gid < m_GroupNum; gid++)
{
if (m_clusters[gid].size == 0)
{
SplitCluster(GetLargestClusterId(),gid);
}
}
m_CaculateTimes++;
return 0;
}
template <class T>
int k_means<T>::show_result()
{
for (int i = 0; i < m_GroupNum; i ++)
{
cout<<"group "<< i << ":" <<endl;
//打印出该分组的中心:
cout<< "center : ";
for (int dim = 0; dim < m_VectorSize; dim ++)
{
cout<<((T*)(m_clusters[i].newcenter))[dim] << ",";
}
cout<<endl;
cout<<"组内成员个数 : " << m_clusters[i].size<<endl;
//总点数过多,就不显示每一个点了.
if (m_VectorSum <= 200)
{
//打印分组的各点.
cout<<"members : "<<endl;
for (int j = 0;j < m_clusters[i].size; j++)
{
cout<<" ";
for (dim = 0;dim < m_VectorSize; dim ++)
{
cout<<((T*)m_src)[m_clusters[i].members[j] * m_VectorSize + dim]<<",";
}
cout<<endl;
}
}
cout<<endl;
}
return 0;
}
//分裂法初始化码本,形成所需个数的初始向量集.
//初始码本的选择算法,很重要!
template <class T>
void k_means<T>::MakeFirstRoundCenters()
{
T * tmp_center = new T[m_VectorSize];
double d_sum;
for (int j = 0; j < m_VectorSize; j++) //计算中心的各维数据.
{
d_sum = 0;
for (int k = 0 ;k < m_VectorSum; k++)
{
if (j == 0)
{
m_clusters[0].members[k] = k;
}
d_sum += ((T*) m_src)[k * m_VectorSize + j];
}
((T*)(tmp_center))[j] = (T)(d_sum / m_VectorSum);
}
memcpy(m_clusters[0].newcenter, tmp_center, m_VectorSize * sizeof(T));
m_clusters[0].size = m_VectorSum;
int M = 1; //码本尺寸,即码本个数.
while (1)
{
for (int i = 0 ;i < M ;i++)
{
if (M + i >= m_GroupNum) //判断是否已经分离出了足够的码本.
{
return;
}
//SplitCluster(i, M + i); //分裂cluster[i] to cluster[i] + cluster[i+M]
//to be continued.
SplitCluster(GetLargestClusterId(),M + i);
}
M *= 2;
}
return;
}
//现在还没有用到. 关键要引入一个半径的概念
template <class T>
void k_means<T>::CombineClusters(int Cluster1, int Cluster2)
{
int Size = m_clusters[Cluster2].size;
memcpy(&(m_clusters[Cluster1].members[m_clusters[Cluster1].size]),m_clusters[Cluster1].members,Size * sizeof(int));
m_clusters[Cluster1].size += Size;
m_clusters[Cluster2].size = 0;
MakeClusterCenter(Cluster1);
}
//分裂出一个新的Cluster 至 newCluster处
template <class T>
void k_means<T>::SplitCluster(int SrcCluster, int newCluster)
{
int PointSum = m_clusters[SrcCluster].size;
memcpy(m_clusters[newCluster].newcenter,m_clusters[SrcCluster].newcenter, m_VectorSize * sizeof(T));
for (int i = 0 ; i < m_VectorSize ; i++)
{
((T*)(m_clusters[newCluster].newcenter))[i] += DELTA;
((T*)(m_clusters[SrcCluster].newcenter))[i] -= DELTA;
}
//分离一组中的数据.
int s_id = 0, d_id = 0;
double s_dis;
double d_dis;
for (int j = 0; j < PointSum; j++)
{
s_dis = dis( &(((T*)m_src)[m_clusters[SrcCluster].members[j] * m_VectorSize]),(T*)(m_clusters[SrcCluster].newcenter),m_VectorSize);
d_dis = dis( &(((T*)m_src)[m_clusters[SrcCluster].members[j] * m_VectorSize]),(T*)(m_clusters[newCluster].newcenter),m_VectorSize);
if (s_dis > d_dis)
{
m_clusters[newCluster].members[d_id++] = m_clusters[SrcCluster].members[j];
}
else
{
m_clusters[SrcCluster].members[s_id++] = m_clusters[SrcCluster].members[j];
}
}
m_clusters[newCluster].size = d_id;
m_clusters[SrcCluster].size = s_id;
MakeClusterCenter(SrcCluster);
MakeClusterCenter(newCluster);
return ;
}
//计算簇中心.
template <class T>
bool k_means<T>::MakeClusterCenter(int cluster_id)
{
if (cluster_id >= m_GroupNum)
{
return false;
}
memcpy(m_clusters[cluster_id].center,m_clusters[cluster_id].newcenter,m_VectorSize * sizeof(T));
double d_sum;
for (int j = 0; j < m_VectorSize; j++) //计算中心的各维数据.
{
d_sum = 0;
for (int k = 0 ;k < m_clusters[cluster_id].size; k++)
{
d_sum += ((T*) m_src)[ m_clusters[cluster_id].members[k] * m_VectorSize + j ];
}
((T*)(m_clusters[cluster_id].newcenter))[j] = (T)(d_sum / m_clusters[cluster_id].size);
}
return true;
}
template <class T>
void k_means<T>::ExportSeeds(T* seeds)
{
for (int i = 0; i < m_GroupNum ; i++)
{
memcpy (&seeds[i * m_VectorSize],m_clusters[i].newcenter, m_VectorSize * sizeof(T));
}
}
template <class T>
int k_means<T>::GetLargestClusterId()
{
int id = 0;
int size = 0;
for (int i = 0;i < m_GroupNum; i++)
{
if (m_clusters[i].size > size)
{
size = m_clusters[i].size;
id = i;
}
}
return id;
}
#endif
/* -------才开始用的最远两点分段取初始中心点的做法。
find_two_points(data, dim, g_num, P_num, &id1, &id2);
for (int i = 0; i < m_GroupNum; i++)
{
m_clusters[i].center = new T[m_VectorSize];
m_clusters[i].newcenter = new T[m_VectorSize];
m_clusters[i].members = new int[P_num];
m_clusters[i].size = 1;
//找两个相距最远的点,然后取等间距的groupnum个点为中心。
for (int j = 0 ;j < dim ; j++)
{
((T*)(m_clusters[i].center))[j] = data[id1 * dim + j] + (data[id2 * dim + j ] - data[id1 * dim + j]) * i / (m_GroupNum - 1);
}
memcpy(m_clusters[i].newcenter,m_clusters[i].center, m_VectorSize * sizeof(T));
}
*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -