⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 kmrand.cpp

📁 高效的k-means算法实现
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//	defined to be min_{i != j} dist(cc[i],cc[j])/std, where cc[i] is//	the i-th cluster center and std is the global deviation.  It is//	defined to be the square root of the trace of the covariance//	matrix, which is equivalent to the std_dev for each coordinate//	times sqrt(d).//----------------------------------------------------------------------static KMpointArray cgClusters = NULL;	// cluster storagevoid kmClusGaussPts(		// clustered-Gaussian distribution	KMpointArray	pa,		// point array (modified)	int		n,		// number of points	int		dim,		// dimension	int		n_col,		// number of colors	bool		new_clust,	// generate new clusters.	double		std_dev,	// standard deviation within clusters	double*		clus_sep)	// cluster separation (returned){    if (cgClusters == NULL || new_clust) {// need new cluster centers	if (cgClusters != NULL)		// clusters already exist	    kmDeallocPts(cgClusters);	// get rid of them	cgClusters = kmAllocPts(n_col, dim);					// generate cluster center coords	for (int i = 0; i < n_col; i++) {	    for (int d = 0; d < dim; d++) {		cgClusters[i][d] = (KMcoord) kmRanUnif(-1,1);	    }	}    }    double minDist = double(dim);	// minimum inter-center sq'd distance    for (int i = 0; i < n_col; i++) {	// compute minimum separation	for (int j = i+1; j < n_col; j++) {	    double dist = kmDist(dim, cgClusters[i], cgClusters[j]);	    if (dist < minDist) minDist = dist;	}    }					// cluster separation    if (clus_sep != NULL)	*clus_sep = sqrt(minDist)/(sqrt(double(dim))*std_dev);    for (int i = 0; i < n; i++) {	int c = kmRanInt(n_col);	// generate cluster index	for (int d = 0; d < dim; d++) {          pa[i][d] = (KMcoord) (std_dev*kmRanGauss() + cgClusters[c][d]);	}    }}KMpointArray kmGetCGclusters()	// get clustered gauss cluster centers{    return cgClusters;}//----------------------------------------------------------------------//  kmClusOrthFlats - points clustered along orthogonal flats////	This distribution consists of a collection points clustered//	among a collection of low dimensional flats (hyperplanes) in//	the hypercube [-1,1]^d.  A set of n_col orthogonal flats are//	generated, each whose dimension is a random number between 1//	and max_dim.  The points are evenly distributed among the clusters.//	For each cluster, we generate points uniformly distributed along//	the flat within the hypercube.////	This is done as follows.  Each cluster is defined by a d-element//	control vector whose components are either://	//		CO_FLAG	indicating that this component is to be generated//			uniformly in [-1,1],//		x 	a value other than CO_FLAG in the range [-1,1],//			which indicating that this coordinate is to be//			generated as x plus a Gaussian random deviation//			with the given standard deviation.//			//	The number of zero components is the dimension of the flat, which//	is a random integer in the range from 1 to max_dim.  The points//	are disributed between clusters in nearly equal sized groups.////	Note: Once cluster centers have been set, they are not changed,//	unless new_clust = true.  This is so that subsequent calls generate//	points from the same distribution.  It follows, of course, that any//	attempt to change the dimension or number of clusters without//	generating new clusters is asking for trouble.////	To make this a bad scenario at query time, query points should be//	selected from a different distribution, e.g. uniform or Gaussian.////	We use a little programming trick to generate groups of roughly//	equal size.  If n is the total number of points, and n_col is//	the number of clusters, then the c-th cluster (0 <= c < n_col)//	is given floor((n+c)/n_col) points.  It can be shown that this//	will exactly consume all n points.////----------------------------------------------------------------------void kmClusOrthFlats(		// clustered along orthogonal flats	KMpointArray	pa,		// point array (modified)	int		n,		// number of points	int		dim,		// dimension	int		n_col,		// number of colors	bool		new_clust,	// generate new clusters.	double		std_dev,	// standard deviation within clusters	int		max_dim)	// maximum dimension of the flats{    const double CO_FLAG = 999;			// special flag value    static KMpointArray control = NULL;	// control vectors    if (control == NULL || new_clust) {		// need new cluster centers	if (control != NULL) {			// clusters already exist	    kmDeallocPts(control);		// get rid of them	}	control = kmAllocPts(n_col, dim);	for (int c = 0; c < n_col; c++) {	// generate clusters	    int n_dim = 1 + kmRanInt(max_dim);	// number of dimensions in flat	    for (int d = 0; d < dim; d++) {	// generate side locations						// prob. of picking next dim	    	double Prob = ((double) n_dim)/((double) (dim-d));		if (kmRan0() < Prob) {		// add this one to flat		    control[c][d] = CO_FLAG;	// flag this entry		    n_dim--;			// one fewer dim to fill		}		else {				// don't take this one		    control[c][d] = kmRanUnif(-1,1);// random value in [-1,1]		}	    }	}    }    int next = 0;				// next slot to fill    for (int c = 0; c < n_col; c++) {		// generate clusters	int pick = (n+c)/n_col;			// number of points to pick	for (int i = 0; i < pick; i++) {	    for (int d = 0; d < dim; d++) {		if (control[c][d] == CO_FLAG)	// dimension on flat        	    pa[next][d] = (KMcoord) kmRanUnif(-1,1);		else				// dimension off flat        	    pa[next][d] =			(KMcoord) (std_dev*kmRanGauss() + control[c][d]);	    }	    next++;	}    }}//----------------------------------------------------------------------//  kmClusEllipsoids - points clustered around ellipsoids////	This distribution consists of a collection points clustered//	among a collection of low dimensional ellipsoids in the//	hypercube [-1,1]^d.  The objective is to model distributions//	in which the points are distributed in lower dimensional//	subspaces, and within this lower dimensional space the points//	are distributed with a Gaussian distribution (with no//	correlation between the dimensions).////	The distribution is given the number of clusters or "colors"//	(n_col), maximum number of dimensions (max_dim) of the lower//	dimensional subspace, a "small" standard deviation (std_dev_small),//	and a "large" standard deviation range (std_dev_lo, std_dev_hi).////	The algorithm generates n_col cluster centers uniformly from the//	hypercube [-1,1]^d.  For each cluster, it selects the dimension//	of the subspace as a random number r between 1 and max_dim.//	These are the dimensions of the ellipsoid.  Then it generates//	a d-element std dev vector whose entries are the standard//	deviation for the coordinates of each cluster in the distribution.//	Among the d-element control vector, r randomly chosen values are//	chosen uniformly from the range [std_dev_lo, std_dev_hi].  The//	remaining values are set to std_dev_small.////	Note that kmClusGaussPts is a special case of this in which//	max_dim = 0, and std_dev = std_dev_small.////	If the flag new_clust is set, then new cluster centers are//	generated.////----------------------------------------------------------------------void kmClusEllipsoids(		// clustered around ellipsoids	KMpointArray	pa,		// point array (modified)	int		n,		// number of points	int		dim,		// dimension	int		n_col,		// number of colors	bool		new_clust,	// generate new clusters.	double		std_dev_small,	// small standard deviation	double		std_dev_lo,	// low standard deviation for ellipses	double		std_dev_hi,	// high standard deviation for ellipses	int		max_dim)	// maximum dimension of the flats{    static KMpointArray clusters = NULL;	// cluster centers    static KMpointArray stdDev = NULL;		// standard deviations    if (clusters == NULL || new_clust) {	// need new cluster centers	if (clusters != NULL)			// clusters already exist	    kmDeallocPts(clusters);		// get rid of them	if (stdDev != NULL)			// std deviations already exist	    kmDeallocPts(stdDev);		// get rid of them	clusters = kmAllocPts(n_col, dim);	// alloc new clusters and devs	stdDev   = kmAllocPts(n_col, dim);	for (int i = 0; i < n_col; i++) {	// gen cluster center coords	    for (int d = 0; d < dim; d++) {		clusters[i][d] = (KMcoord) kmRanUnif(-1,1);	    }	}	for (int c = 0; c < n_col; c++) {	// generate cluster std dev	    int n_dim = 1 + kmRanInt(max_dim);	// number of dimensions in flat	    for (int d = 0; d < dim; d++) {	// generate std dev's						// prob. of picking next dim	    	double Prob = ((double) n_dim)/((double) (dim-d));		if (kmRan0() < Prob) {		// add this one to ellipse						// generate random std dev		    stdDev[c][d] = kmRanUnif(std_dev_lo, std_dev_hi);		    n_dim--;			// one fewer dim to fill		}		else {				// don't take this one		    stdDev[c][d] = std_dev_small;// use small std dev		}	    }	}    }    int next = 0;				// next slot to fill    for (int c = 0; c < n_col; c++) {		// generate clusters	int pick = (n+c)/n_col;			// number of points to pick	for (int i = 0; i < pick; i++) {	    for (int d = 0; d < dim; d++) {        	pa[next][d] = (KMcoord)			(stdDev[c][d]*kmRanGauss() + clusters[c][d]);	    }	    next++;	}    }}//----------------------------------------------------------------------//  kmMultiClus - multi-sized clusters//	This distribution is designed to be a challenge for clustering//	algorithm.  It consists of a clusters of varying sizes, located//	within the cube [-1,1]^d.  The cluster centers are uniformly//	distributed.  We generate clusters one by one.  For each//	cluster, the size of the cluster m is chosen so that the//	probability of generating a cluster of size 2^i is 1/2^i.////	We want each cluster to have a similar total squared distortion//	(variance).  Thus a group of size m is generated from a gaussian//	distribution with a standard deviation of B*sqrt(1/m).  We call//	B the base standard deviation, which is given as an argument.////	The expected number of clusters is (lg n)-2, but there is a very//	high variance here.  The total distortion in each dimension then//	is B^2*((lg n)-2) (but don't trust me here, since this is just//	based on back-of- the-envelope computations).  Hence the expected//	average distortion, for n points in dim d would be:////		d*B^2*((log n)-2)/n////	The reason that this is challenging is that each cluster has an//	equal claim to a center (since distortions are similar) but many//	sampling based methods will favor placing clusters in the//	relatively few clusters with a large number of points.//----------------------------------------------------------------------void kmMultiClus(		// multi-sized clusters	KMpointArray	pa,		// point array (modified)	int		n,		// number of points	int		dim,		// dimension	int		&k,		// number of clusters (returned)	double		base_dev)	// base standard deviation{    int next = 0;			// next point in array    int nSamp = 0;			// number of points sampled    k = 0;				// number of clusters generated    KMpoint clusCenter = kmAllocPt(dim); // allocate center storage    while (nSamp < n) {			// until we have sampled enough	int remain = n - nSamp;		// number remaining to sample	int clusSize = 2;					// repeatedly double cluster size					// with prob 1/2	while ((clusSize < remain) && (kmRan0() < 0.5))	    clusSize *= 2;					// don't exceed upper limit	if (clusSize > remain) clusSize = remain;					// generate center uniformly	for (int d = 0; d < dim; d++) {	    clusCenter[d] = (KMcoord) kmRanUnif(-1,1);	}    					// desired std dev for cluster	double stdDev = base_dev*sqrt(1.0/clusSize);					// generate cluster points	for (int i = 0; i < clusSize; i++) {	    for (int d = 0; d < dim; d++) {		pa[next][d] = (KMcoord) (stdDev*kmRanGauss()+clusCenter[d]);	    }	    next++;	}	nSamp += clusSize;		// update number sampled	k++;				// one more cluster    }    kmDeallocPt(clusCenter);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -