⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mpi_kmeans.cxx

📁 改进的k均值算法
💻 CXX
字号:
#include <stdio.h>#include <stdlib.h>#include <float.h>#include <memory.h>#include <math.h>#include <assert.h>#include "mpi_kmeans.h"int comp_randperm (const void * a, const void * b){	return ((int)( *(double*)a - *(double*)b ));}void randperm(unsigned int *order, unsigned int npoints){	double *r;	unsigned int i;	r = (double*)malloc(2*npoints*sizeof(double));	for (i=0; i<2*npoints; i++,i++)	{		r[i] = rand();		r[i+1] = i/2;	}	qsort (r, npoints, 2*sizeof(double), comp_randperm);	for (i=1; i<2*npoints; i++,i++)		order[i/2] = (unsigned int)r[i];	free(r);}#if INPUT_TYPE>0double compute_distance(const PREC *vec1, const PREC *vec2, const unsigned int dim){	unsigned int k;	double d,df;	d = 0.0;	for ( k=0 ; k<dim ; k++ )	{		df = ((double)vec1[k]-(double)vec2[k]);		d += df*df;	}	d = sqrt(d);	return d;}double compute_distance(const PREC *vec1, const double *vec2, const unsigned int dim){	unsigned int k;	double d,df;	d = 0.0;	for ( k=0 ; k<dim ; k++ )	{		df = ((double)vec1[k]-vec2[k]);		d += df*df;	}	d = sqrt(d);	return d;}#endifdouble compute_distance(const double *vec1, const double *vec2, const unsigned int dim){	unsigned int k;	double d,df;	d = 0.0;	for ( k=0 ; k<dim ; k++ )	{		df = (vec1[k]-vec2[k]);		d += df*df;	}	d = sqrt(d);	return d;}double compute_sserror(const double *CX, const PREC *X, const unsigned int *c,unsigned int dim, unsigned int npts){	unsigned int i;	double sse, d;	const double *pcx;	const PREC *px;	sse = 0.0;	for ( i=0,px=X ; i<npts ; i++,px+=dim)	{		pcx = CX+c[i]*dim;		d = (double)(compute_distance(px,pcx,dim));		sse += d*d;	}	return(sse);}void compute_cluster_distances(BOUND_PREC *dist, BOUND_PREC *s, const double *CX, unsigned int dim,unsigned int nclus, const bool *cluster_changed){	unsigned int j,i,cnt;	const double *pcxp, *pcx;	for ( j=0 ; j<nclus ; j++ )		s[j] = BOUND_PREC_MAX;	for ( i=0,pcx=CX ; i<nclus-1 ; i++,pcx+=dim)	{		pcxp = CX + (i+1)*dim;		cnt=i*nclus+i+1;		for ( j=i+1 ; j<nclus; j++,cnt++,pcxp+=dim )		{			if (cluster_changed[i] || cluster_changed[j])			{				dist[cnt] = (BOUND_PREC)(0.5 * compute_distance(pcx,pcxp,dim));				dist[j*nclus+i] = dist[cnt];				if (dist[cnt] < s[i])					s[i] = dist[cnt];				if (dist[cnt] < s[j])					s[j] = dist[cnt];			}		}	}}void remove_point_from_cluster(unsigned int cluster_ind, double *CX, const PREC *px, unsigned int *nr_points, unsigned int dim){	double *pcx;	pcx = CX + cluster_ind*dim;	unsigned int k;	/* empty cluster after or before removal */	if (nr_points[cluster_ind]<2)	{		for ( k=0 ; k<dim ; k++ )			pcx[k] = 0.0;		nr_points[cluster_ind]=0;	}	else	{		double nr_old,nr_new; 		nr_old = (double)nr_points[cluster_ind];		(nr_points[cluster_ind])--;		nr_new = (double)nr_points[cluster_ind];		for ( k=0 ; k<dim ; k++ )			pcx[k] = (nr_old*pcx[k] - (double)px[k])/nr_new;	}}void add_point_to_cluster(unsigned int cluster_ind, double *CX, const PREC *px, unsigned int *nr_points, unsigned int dim){	unsigned int k;	double nr_old, nr_new;	double *pcx;	pcx = CX + cluster_ind*dim;	/* first point in cluster */	if (nr_points[cluster_ind]==0)	{				(nr_points[cluster_ind])++;		for ( k=0 ; k<dim ; k++ )			pcx[k] = (double)px[k];	}	else	{		nr_old = (double)(nr_points[cluster_ind]);		(nr_points[cluster_ind])++;		nr_new = (double)(nr_points[cluster_ind]);		for ( k=0 ; k<dim ; k++ )			pcx[k] = (nr_old*pcx[k]+(double)px[k])/nr_new;	}}unsigned int init_point_to_cluster(unsigned int point_ind, const PREC *px, const double *CX, unsigned int dim,unsigned int nclus, double *mindist, BOUND_PREC *low_b, const BOUND_PREC *cl_dist){	double mind, d;	unsigned int assignment, j,bias;	const double *pcx;	bool use_low_b = true;	if (low_b==NULL) use_low_b = false;	bias = point_ind*nclus;		pcx = CX;	mind = compute_distance(px,pcx,dim);	if (use_low_b) low_b[bias] = (BOUND_PREC)mind;	assignment = 0;	pcx+=dim;	for ( j=1 ; j<nclus ; j++,pcx+=dim )	{		if (mind + BOUND_EPS <= cl_dist[assignment*nclus+j])			continue;		d = compute_distance(px,pcx,dim);		if(use_low_b) low_b[j+bias] = (BOUND_PREC)d;		if (d<mind)		{			mind = d;			assignment = j;		}	}	mindist[point_ind] = mind;	return(assignment);}unsigned int assign_point_to_cluster_ordinary(const PREC *px, const double *CX, unsigned int dim,unsigned int nclus){	double mind, d;	unsigned int assignment=0, j;	const double *pcx;	mind = DBL_MAX;	for ( j=0,pcx=CX ; j<nclus ; j++,pcx+=dim )	{		d = compute_distance(px,pcx,dim);		if (d<mind)		{			mind = d;			assignment = j;		}	}	return(assignment);}unsigned int assign_point_to_cluster(unsigned int point_ind, const PREC *px, const double *CX, unsigned int dim,unsigned int nclus, unsigned int old_assignment, double *mindist, BOUND_PREC *s, BOUND_PREC *cl_dist, BOUND_PREC *low_b){	double mind, d;	unsigned int assignment, j, counter;	const double *pcx;	bool up_to_date = false,use_low_b=true;;	unsigned int bias;	bias  = point_ind*nclus;	if (low_b==NULL)use_low_b=false;	mind = mindist[point_ind];	if (mind+BOUND_EPS <= s[old_assignment])	{#ifdef KMEANS_DEBUG		saved_two++;#endif		return(old_assignment);	}	assignment = old_assignment;	counter = assignment*nclus;	for ( j=0,pcx=CX ; j<nclus ; j++,pcx+=dim )	{		if (j==old_assignment)		{#ifdef KMEANS_DEBUG			saved_three_one++;#endif			continue;		}				if (use_low_b && (mind+BOUND_EPS <= low_b[j+bias]))		{#ifdef KMEANS_DEBUG			saved_three_two++;#endif			continue;		}		if (mind+BOUND_EPS <= cl_dist[counter+j])		{#ifdef KMEANS_DEBUG			saved_three_three++;#endif			continue;		}		if (!up_to_date)		{			d = compute_distance(px,CX+assignment*dim,dim);			mind = d;			if(use_low_b) low_b[assignment+bias] = (BOUND_PREC)d;			up_to_date = true;		}		if (!use_low_b)			d = compute_distance(px,pcx,dim);		else if ((mind > BOUND_EPS+low_b[j+bias]) || (mind > BOUND_EPS+cl_dist[counter+j]))		{			d =compute_distance(px,pcx,dim);			low_b[j+bias] = (BOUND_PREC)d;		}		else		{#ifdef KMEANS_DEBUG			saved_three_b++;#endif			continue;		}		if (d<mind)		{			mind = d;			assignment = j;			counter = assignment*nclus;			up_to_date = true;		}	}	mindist[point_ind] = mind;	return(assignment);}double kmeans_run(double *CX,const PREC *X,unsigned int *c,unsigned int dim,unsigned int npts,unsigned int nclus,unsigned int maxiter){	double *tCX;	unsigned int *CN, *old_c, *rperm = NULL;	double sse = 0.0;	double *pcx, *tpcx, *mindist;	bool *cluster_changed,use_low_b=true;	const PREC *px;	unsigned int i,j,k,nchanged,iteration,cnt;	BOUND_PREC *s, *offset, *cl_dist, *low_b;	tCX = (double *)calloc(npts * dim, sizeof(double));	assert(tCX);	/* number of points per cluster */	CN = (unsigned int *) calloc(nclus, sizeof(unsigned int)); 	assert(CN);		/* old assignement of points to cluster */	old_c = (unsigned int *) malloc(npts* sizeof(unsigned int));	assert(old_c);	/* assign to value which is out of range */	for ( i=0 ; i<npts ; i++)		old_c[i] = nclus;#ifdef KMEANS_VERBOSE	printf("compile without setting the KMEANS_VERBOSE flag for no output\n");#endif	low_b = (BOUND_PREC *) calloc(npts*nclus,sizeof(BOUND_PREC));	if (low_b == NULL)	{#ifdef KMEANS_VERBOSE		printf("not enough memory for lower bound, will compute without\n");#endif		use_low_b = false;	}	else		assert(low_b);	cl_dist = (BOUND_PREC *)calloc(nclus*nclus, sizeof(double));	assert(cl_dist);	s = (BOUND_PREC *) malloc(nclus*sizeof(BOUND_PREC));	assert(s);	offset = (BOUND_PREC *) malloc(nclus * sizeof(BOUND_PREC)); /* change in distance of a cluster mean after a iteration */	assert(offset);	mindist = (double *)malloc(npts * sizeof(double));	assert(mindist);	for (i=0;i<npts;i++)		mindist[i] = DBL_MAX;	cluster_changed = (bool *) malloc(nclus * sizeof(bool)); /* did the cluster changed? */	assert(cluster_changed);	for ( j=0 ; j<nclus ; j++ )		cluster_changed[j] = true;	iteration = 0;	nchanged = 1;	while (iteration < maxiter || maxiter == 0)	{				/* compute cluster-cluster distances */		compute_cluster_distances(cl_dist, s, CX, dim,nclus, cluster_changed);				/* find nearest cluster center */		if (iteration == 0)		{			for (i=0, px=X; i<npts ; i++,px+=dim)			{				c[i] = init_point_to_cluster(i,px,CX,dim,nclus,mindist,low_b,cl_dist);				add_point_to_cluster(c[i],tCX,px,CN,dim);			}			nchanged = npts;		}		else		{			for ( j=0 ; j<nclus ; j++)				cluster_changed[j] = false;			nchanged = 0;			for (i=0, px=X; i<npts ; i++,px+=dim)			{				c[i] = assign_point_to_cluster(i,px,CX,dim,nclus,old_c[i],mindist,s,cl_dist,low_b);				if (old_c[i] == c[i]) continue;								nchanged++;				cluster_changed[c[i]] = true;				cluster_changed[old_c[i]] = true;				remove_point_from_cluster(old_c[i],tCX,px,CN,dim);				add_point_to_cluster(c[i],tCX,px,CN,dim);			}		}		/* fill up empty clusters */		for (j=0 ; j<nclus ; j++)		{			if (CN[j]>0) continue;			if (!rperm)			{				rperm = (unsigned int*)malloc(npts*sizeof(unsigned int));				assert(rperm);			}			randperm(rperm,npts);			i = 0; 			while (rperm[i]<npts && CN[c[rperm[i]]]<2) i++;			if (i==npts)continue;			i = rperm[i];#ifdef KMEANS_DEBUG			printf("empty cluster [%d], filling it with point [%d]\n",j,i);#endif			cluster_changed[c[rperm[i]]] = true;			cluster_changed[j] = true;			px = X + i*dim;			remove_point_from_cluster(c[i],tCX,px,CN,dim);			c[i] = j;			add_point_to_cluster(j,tCX,px,CN,dim);			/* void the bounds */			s[j] = (BOUND_PREC)0.0;			mindist[i] = 0.0;			if (use_low_b)				for ( k=0 ; k<npts ; k++ )					low_b[k*nclus+j] = (BOUND_PREC)0.0;						nchanged++;		}		/* no assignment changed: done */		if (nchanged==0) break; 		/* compute the offset */		for ( j=0,pcx=CX,tpcx=tCX; j<nclus ; j++,pcx+=dim,tpcx+=dim )		{			offset[j] = (BOUND_PREC)0.0;			if (cluster_changed[j])			{				offset[j] = (BOUND_PREC)compute_distance(pcx,tpcx,dim);				memcpy(pcx,tpcx,dim*sizeof(double));			}		}				/* update the lower bound */		if (use_low_b)		{			for ( i=0,cnt=0 ; i<npts ; i++ )				for ( j=0 ; j<nclus ; j++,cnt++ )				{					low_b[cnt] -= offset[j];					if (low_b[cnt]<(BOUND_PREC)0.0) low_b[cnt] = (BOUND_PREC)0.0;				}		}		for ( i=0; i<npts; i++)			mindist[i] += (double)offset[c[i]];		memcpy(old_c,c,npts*sizeof(unsigned int));#ifdef KMEANS_VERBOSE		sse = compute_sserror(CX,X,c,dim,npts);		printf("iteration %4d, #(changed points): %4d, sse: %4.2f\n",(int)iteration,(int)nchanged,sse);#endif#ifdef KMEANS_DEBUG		printf("saved at 2) %d\n",saved_two);		printf("saved at 3i) %d\n",saved_three_one);		printf("saved at 3ii) %d\n",saved_three_two);		printf("saved at 3iii) %d\n",saved_three_three);		printf("saved at 3b) %d\n",saved_three_b);		saved_two=0;		saved_three_one=0;		saved_three_two=0;		saved_three_three=0;		saved_three_b=0;#endif		iteration++;	}#ifdef KMEANS_DEBUG	for (j=0;j<nclus;j++)		if (CN[j]==0)			printf("BUG: EMPTY CLUSTER AFTERT ALL\n");#endif	/* find nearest cluster center if iteration reached maxiter */	if (nchanged>0)		for (i=0, px=X; i<npts ; i++,px+=dim)			c[i] = assign_point_to_cluster_ordinary(px,CX,dim,nclus);	sse = compute_sserror(CX,X,c,dim,npts);#ifdef KMEANS_VERBOSE	printf("iteration %4d, #(changed points): %4d, sse: %4.2f\n",(int)iteration,(int)nchanged,sse);#endif	if(low_b) free(low_b);	free(cluster_changed);	free(mindist);	free(s);	free(offset);	free(cl_dist);	free(tCX);	free(CN);	free(old_c);	if(rperm) free(rperm);	return(sse);}double kmeans(double *CX,const PREC *X,unsigned int *assignment,unsigned int dim,unsigned int npts,unsigned int nclus,unsigned int maxiter, unsigned int restarts){	double sse, minsse;	unsigned int res = restarts;	unsigned int k,i;	unsigned int *order, *bestassignment;	double *bestCX;	if (CX==NULL)	{		order = (unsigned int*)malloc(npts*sizeof(unsigned int));		CX = (double*)calloc(nclus*dim,sizeof(double));		/* generate new starting point */		randperm(order,npts);		for (i=0; i<nclus; i++)			for (k=0; k<dim; k++ )				CX[(i*dim)+k] = (double)X[order[i]*dim+k];		free(order);			}	sse = kmeans_run(CX,X,assignment,dim,npts,nclus,maxiter);		if (res>0)	{		minsse = sse;		order = (unsigned int*)malloc(npts*sizeof(unsigned int));		bestCX = (double*) malloc(dim*nclus*sizeof(double));		bestassignment = (unsigned int*)malloc(npts*sizeof(unsigned int));		memcpy(bestCX,CX,dim*nclus*sizeof(double));		memcpy(bestassignment,assignment,npts*sizeof(unsigned int));		while (res>0)		{			/* generate new starting point */			randperm(order,npts);			for (i=0; i<nclus; i++)				for (k=0; k<dim; k++ )					CX[(i*dim)+k] = (double)X[order[i]*dim+k];					sse = kmeans_run(CX,X,assignment,dim,npts,nclus,maxiter);			if (sse<minsse)			{#ifdef KMEANS_VERBOSE				printf("found a better clustering with sse = %g\n",sse);#endif				minsse = sse;				memcpy(bestCX,CX,dim*nclus*sizeof(double));				memcpy(bestassignment,assignment,npts*sizeof(unsigned int));			}			res--;		}		memcpy(CX,bestCX,dim*nclus*sizeof(double));		memcpy(assignment,bestassignment,npts*sizeof(unsigned int));		sse = minsse;		free(bestassignment);		free(bestCX);		free(order);	}	return(sse);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -