📄 kmrand.cpp
字号:
// defined to be min_{i != j} dist(cc[i],cc[j])/std, where cc[i] is// the i-th cluster center and std is the global deviation. It is// defined to be the square root of the trace of the covariance// matrix, which is equivalent to the std_dev for each coordinate// times sqrt(d).//----------------------------------------------------------------------static KMpointArray cgClusters = NULL; // cluster storagevoid kmClusGaussPts( // clustered-Gaussian distribution KMpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int n_col, // number of colors bool new_clust, // generate new clusters. double std_dev, // standard deviation within clusters double* clus_sep) // cluster separation (returned){ if (cgClusters == NULL || new_clust) {// need new cluster centers if (cgClusters != NULL) // clusters already exist kmDeallocPts(cgClusters); // get rid of them cgClusters = kmAllocPts(n_col, dim); // generate cluster center coords for (int i = 0; i < n_col; i++) { for (int d = 0; d < dim; d++) { cgClusters[i][d] = (KMcoord) kmRanUnif(-1,1); } } } double minDist = double(dim); // minimum inter-center sq'd distance for (int i = 0; i < n_col; i++) { // compute minimum separation for (int j = i+1; j < n_col; j++) { double dist = kmDist(dim, cgClusters[i], cgClusters[j]); if (dist < minDist) minDist = dist; } } // cluster separation if (clus_sep != NULL) *clus_sep = sqrt(minDist)/(sqrt(double(dim))*std_dev); for (int i = 0; i < n; i++) { int c = kmRanInt(n_col); // generate cluster index for (int d = 0; d < dim; d++) { pa[i][d] = (KMcoord) (std_dev*kmRanGauss() + cgClusters[c][d]); } }}KMpointArray kmGetCGclusters() // get clustered gauss cluster centers{ return cgClusters;}//----------------------------------------------------------------------// kmClusOrthFlats - points clustered along orthogonal flats//// This distribution consists of a collection points clustered// among a collection of low dimensional flats (hyperplanes) in// the hypercube [-1,1]^d. A set of n_col orthogonal flats are// generated, each whose dimension is a random number between 1// and max_dim. The points are evenly distributed among the clusters.// For each cluster, we generate points uniformly distributed along// the flat within the hypercube.//// This is done as follows. Each cluster is defined by a d-element// control vector whose components are either:// // CO_FLAG indicating that this component is to be generated// uniformly in [-1,1],// x a value other than CO_FLAG in the range [-1,1],// which indicating that this coordinate is to be// generated as x plus a Gaussian random deviation// with the given standard deviation.// // The number of zero components is the dimension of the flat, which// is a random integer in the range from 1 to max_dim. The points// are disributed between clusters in nearly equal sized groups.//// Note: Once cluster centers have been set, they are not changed,// unless new_clust = true. This is so that subsequent calls generate// points from the same distribution. It follows, of course, that any// attempt to change the dimension or number of clusters without// generating new clusters is asking for trouble.//// To make this a bad scenario at query time, query points should be// selected from a different distribution, e.g. uniform or Gaussian.//// We use a little programming trick to generate groups of roughly// equal size. If n is the total number of points, and n_col is// the number of clusters, then the c-th cluster (0 <= c < n_col)// is given floor((n+c)/n_col) points. It can be shown that this// will exactly consume all n points.////----------------------------------------------------------------------void kmClusOrthFlats( // clustered along orthogonal flats KMpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int n_col, // number of colors bool new_clust, // generate new clusters. double std_dev, // standard deviation within clusters int max_dim) // maximum dimension of the flats{ const double CO_FLAG = 999; // special flag value static KMpointArray control = NULL; // control vectors if (control == NULL || new_clust) { // need new cluster centers if (control != NULL) { // clusters already exist kmDeallocPts(control); // get rid of them } control = kmAllocPts(n_col, dim); for (int c = 0; c < n_col; c++) { // generate clusters int n_dim = 1 + kmRanInt(max_dim); // number of dimensions in flat for (int d = 0; d < dim; d++) { // generate side locations // prob. of picking next dim double Prob = ((double) n_dim)/((double) (dim-d)); if (kmRan0() < Prob) { // add this one to flat control[c][d] = CO_FLAG; // flag this entry n_dim--; // one fewer dim to fill } else { // don't take this one control[c][d] = kmRanUnif(-1,1);// random value in [-1,1] } } } } int next = 0; // next slot to fill for (int c = 0; c < n_col; c++) { // generate clusters int pick = (n+c)/n_col; // number of points to pick for (int i = 0; i < pick; i++) { for (int d = 0; d < dim; d++) { if (control[c][d] == CO_FLAG) // dimension on flat pa[next][d] = (KMcoord) kmRanUnif(-1,1); else // dimension off flat pa[next][d] = (KMcoord) (std_dev*kmRanGauss() + control[c][d]); } next++; } }}//----------------------------------------------------------------------// kmClusEllipsoids - points clustered around ellipsoids//// This distribution consists of a collection points clustered// among a collection of low dimensional ellipsoids in the// hypercube [-1,1]^d. The objective is to model distributions// in which the points are distributed in lower dimensional// subspaces, and within this lower dimensional space the points// are distributed with a Gaussian distribution (with no// correlation between the dimensions).//// The distribution is given the number of clusters or "colors"// (n_col), maximum number of dimensions (max_dim) of the lower// dimensional subspace, a "small" standard deviation (std_dev_small),// and a "large" standard deviation range (std_dev_lo, std_dev_hi).//// The algorithm generates n_col cluster centers uniformly from the// hypercube [-1,1]^d. For each cluster, it selects the dimension// of the subspace as a random number r between 1 and max_dim.// These are the dimensions of the ellipsoid. Then it generates// a d-element std dev vector whose entries are the standard// deviation for the coordinates of each cluster in the distribution.// Among the d-element control vector, r randomly chosen values are// chosen uniformly from the range [std_dev_lo, std_dev_hi]. The// remaining values are set to std_dev_small.//// Note that kmClusGaussPts is a special case of this in which// max_dim = 0, and std_dev = std_dev_small.//// If the flag new_clust is set, then new cluster centers are// generated.////----------------------------------------------------------------------void kmClusEllipsoids( // clustered around ellipsoids KMpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int n_col, // number of colors bool new_clust, // generate new clusters. double std_dev_small, // small standard deviation double std_dev_lo, // low standard deviation for ellipses double std_dev_hi, // high standard deviation for ellipses int max_dim) // maximum dimension of the flats{ static KMpointArray clusters = NULL; // cluster centers static KMpointArray stdDev = NULL; // standard deviations if (clusters == NULL || new_clust) { // need new cluster centers if (clusters != NULL) // clusters already exist kmDeallocPts(clusters); // get rid of them if (stdDev != NULL) // std deviations already exist kmDeallocPts(stdDev); // get rid of them clusters = kmAllocPts(n_col, dim); // alloc new clusters and devs stdDev = kmAllocPts(n_col, dim); for (int i = 0; i < n_col; i++) { // gen cluster center coords for (int d = 0; d < dim; d++) { clusters[i][d] = (KMcoord) kmRanUnif(-1,1); } } for (int c = 0; c < n_col; c++) { // generate cluster std dev int n_dim = 1 + kmRanInt(max_dim); // number of dimensions in flat for (int d = 0; d < dim; d++) { // generate std dev's // prob. of picking next dim double Prob = ((double) n_dim)/((double) (dim-d)); if (kmRan0() < Prob) { // add this one to ellipse // generate random std dev stdDev[c][d] = kmRanUnif(std_dev_lo, std_dev_hi); n_dim--; // one fewer dim to fill } else { // don't take this one stdDev[c][d] = std_dev_small;// use small std dev } } } } int next = 0; // next slot to fill for (int c = 0; c < n_col; c++) { // generate clusters int pick = (n+c)/n_col; // number of points to pick for (int i = 0; i < pick; i++) { for (int d = 0; d < dim; d++) { pa[next][d] = (KMcoord) (stdDev[c][d]*kmRanGauss() + clusters[c][d]); } next++; } }}//----------------------------------------------------------------------// kmMultiClus - multi-sized clusters// This distribution is designed to be a challenge for clustering// algorithm. It consists of a clusters of varying sizes, located// within the cube [-1,1]^d. The cluster centers are uniformly// distributed. We generate clusters one by one. For each// cluster, the size of the cluster m is chosen so that the// probability of generating a cluster of size 2^i is 1/2^i.//// We want each cluster to have a similar total squared distortion// (variance). Thus a group of size m is generated from a gaussian// distribution with a standard deviation of B*sqrt(1/m). We call// B the base standard deviation, which is given as an argument.//// The expected number of clusters is (lg n)-2, but there is a very// high variance here. The total distortion in each dimension then// is B^2*((lg n)-2) (but don't trust me here, since this is just// based on back-of- the-envelope computations). Hence the expected// average distortion, for n points in dim d would be://// d*B^2*((log n)-2)/n//// The reason that this is challenging is that each cluster has an// equal claim to a center (since distortions are similar) but many// sampling based methods will favor placing clusters in the// relatively few clusters with a large number of points.//----------------------------------------------------------------------void kmMultiClus( // multi-sized clusters KMpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int &k, // number of clusters (returned) double base_dev) // base standard deviation{ int next = 0; // next point in array int nSamp = 0; // number of points sampled k = 0; // number of clusters generated KMpoint clusCenter = kmAllocPt(dim); // allocate center storage while (nSamp < n) { // until we have sampled enough int remain = n - nSamp; // number remaining to sample int clusSize = 2; // repeatedly double cluster size // with prob 1/2 while ((clusSize < remain) && (kmRan0() < 0.5)) clusSize *= 2; // don't exceed upper limit if (clusSize > remain) clusSize = remain; // generate center uniformly for (int d = 0; d < dim; d++) { clusCenter[d] = (KMcoord) kmRanUnif(-1,1); } // desired std dev for cluster double stdDev = base_dev*sqrt(1.0/clusSize); // generate cluster points for (int i = 0; i < clusSize; i++) { for (int d = 0; d < dim; d++) { pa[next][d] = (KMcoord) (stdDev*kmRanGauss()+clusCenter[d]); } next++; } nSamp += clusSize; // update number sampled k++; // one more cluster } kmDeallocPt(clusCenter);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -