📄 rand.cpp
字号:
// Cluster centers are uniformly distributed over [-1,1], and the// standard deviation within each cluster is fixed.//// Note: Once cluster centers have been set, they are not changed,// unless new_clust = true. This is so that subsequent calls generate// points from the same distribution. It follows, of course, that any// attempt to change the dimension or number of clusters without// generating new clusters is asking for trouble.//// Note: Cluster centers are not generated by a call to uniformPts().// Although this could be done, it has been omitted for// compatibility with annClusGaussPts() in the colored version,// rand_c.cc.//----------------------------------------------------------------------void annClusGaussPts( // clustered-Gaussian distribution ANNpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int n_clus, // number of colors ANNbool new_clust, // generate new clusters. double std_dev) // standard deviation within clusters{ static ANNpointArray clusters = NULL;// cluster storage if (clusters == NULL || new_clust) {// need new cluster centers if (clusters != NULL) // clusters already exist annDeallocPts(clusters); // get rid of them clusters = annAllocPts(n_clus, dim); // generate cluster center coords for (int i = 0; i < n_clus; i++) { for (int d = 0; d < dim; d++) { clusters[i][d] = (ANNcoord) annRanUnif(-1,1); } } } for (int i = 0; i < n; i++) { int c = annRanInt(n_clus); // generate cluster index for (int d = 0; d < dim; d++) { pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + clusters[c][d]); } }}//----------------------------------------------------------------------// annClusOrthFlats - points clustered along orthogonal flats//// This distribution consists of a collection points clustered// among a collection of axis-aligned low dimensional flats in// the hypercube [-1,1]^d. A set of n_clus orthogonal flats are// generated, each whose dimension is a random number between 1// and max_dim. The points are evenly distributed among the clusters.// For each cluster, we generate points uniformly distributed along// the flat within the hypercube.//// This is done as follows. Each cluster is defined by a d-element// control vector whose components are either:// // CO_FLAG indicating that this component is to be generated// uniformly in [-1,1],// x a value other than CO_FLAG in the range [-1,1],// which indicates that this coordinate is to be// generated as x plus a Gaussian random deviation// with the given standard deviation.// // The number of zero components is the dimension of the flat, which// is a random integer in the range from 1 to max_dim. The points// are disributed between clusters in nearly equal sized groups.//// Note: Once cluster centers have been set, they are not changed,// unless new_clust = true. This is so that subsequent calls generate// points from the same distribution. It follows, of course, that any// attempt to change the dimension or number of clusters without// generating new clusters is asking for trouble.//// To make this a bad scenario at query time, query points should be// selected from a different distribution, e.g. uniform or Gaussian.//// We use a little programming trick to generate groups of roughly// equal size. If n is the total number of points, and n_clus is// the number of clusters, then the c-th cluster (0 <= c < n_clus)// is given floor((n+c)/n_clus) points. It can be shown that this// will exactly consume all n points.//// This procedure makes use of the utility procedure, genOrthFlat// which generates points in one orthogonal flat, according to// the given control vector.////----------------------------------------------------------------------const double CO_FLAG = 999; // special flag valuestatic void genOrthFlat( // generate points on an orthog flat ANNpointArray pa, // point array int n, // number of points int dim, // dimension double *control, // control vector double std_dev) // standard deviation{ for (int i = 0; i < n; i++) { // generate each point for (int d = 0; d < dim; d++) { // generate each coord if (control[d] == CO_FLAG) // dimension on flat pa[i][d] = (ANNcoord) annRanUnif(-1,1); else // dimension off flat pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + control[d]); } }}void annClusOrthFlats( // clustered along orthogonal flats ANNpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int n_clus, // number of colors ANNbool new_clust, // generate new clusters. double std_dev, // standard deviation within clusters int max_dim) // maximum dimension of the flats{ static ANNpointArray control = NULL; // control vectors if (control == NULL || new_clust) { // need new cluster centers if (control != NULL) { // clusters already exist annDeallocPts(control); // get rid of them } control = annAllocPts(n_clus, dim); for (int c = 0; c < n_clus; c++) { // generate clusters int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat for (int d = 0; d < dim; d++) { // generate side locations // prob. of picking next dim double Prob = ((double) n_dim)/((double) (dim-d)); if (annRan0() < Prob) { // add this one to flat control[c][d] = CO_FLAG; // flag this entry n_dim--; // one fewer dim to fill } else { // don't take this one control[c][d] = annRanUnif(-1,1);// random value in [-1,1] } } } } int offset = 0; // offset in pa array for (int c = 0; c < n_clus; c++) { // generate clusters int pick = (n+c)/n_clus; // number of points to pick // generate the points genOrthFlat(pa+offset, pick, dim, control[c], std_dev); offset += pick; // increment offset }}//----------------------------------------------------------------------// annClusEllipsoids - points clustered around axis-aligned ellipsoids//// This distribution consists of a collection points clustered// among a collection of low dimensional ellipsoids whose axes// are alligned with the coordinate axes in the hypercube [-1,1]^d.// The objective is to model distributions in which the points are// distributed in lower dimensional subspaces, and within this// lower dimensional space the points are distributed with a// Gaussian distribution (with no correlation between the// dimensions).//// The distribution is given the number of clusters or "colors"// (n_clus), maximum number of dimensions (max_dim) of the lower// dimensional subspace, a "small" standard deviation// (std_dev_small), and a "large" standard deviation range// (std_dev_lo, std_dev_hi).//// The algorithm generates n_clus cluster centers uniformly from// the hypercube [-1,1]^d. For each cluster, it selects the// dimension of the subspace as a random number r between 1 and// max_dim. These are the dimensions of the ellipsoid. Then it// generates a d-element std dev vector whose entries are the// standard deviation for the coordinates of each cluster in the// distribution. Among the d-element control vector, r randomly// chosen values are chosen uniformly from the range [std_dev_lo,// std_dev_hi]. The remaining values are set to std_dev_small.//// Note that annClusGaussPts is a special case of this in which// max_dim = 0, and std_dev = std_dev_small.//// If the flag new_clust is set, then new cluster centers are// generated.//// This procedure makes use of the utility procedure genGauss// which generates points distributed according to a Gaussian// distribution.////----------------------------------------------------------------------static void genGauss( // generate points on a general Gaussian ANNpointArray pa, // point array int n, // number of points int dim, // dimension double *center, // center vector double *std_dev) // standard deviation vector{ for (int i = 0; i < n; i++) { for (int d = 0; d < dim; d++) { pa[i][d] = (ANNcoord) (std_dev[d]*annRanGauss() + center[d]); } }}void annClusEllipsoids( // clustered around ellipsoids ANNpointArray pa, // point array (modified) int n, // number of points int dim, // dimension int n_clus, // number of colors ANNbool new_clust, // generate new clusters. double std_dev_small, // small standard deviation double std_dev_lo, // low standard deviation for ellipses double std_dev_hi, // high standard deviation for ellipses int max_dim) // maximum dimension of the flats{ static ANNpointArray centers = NULL; // cluster centers static ANNpointArray std_dev = NULL; // standard deviations if (centers == NULL || new_clust) { // need new cluster centers if (centers != NULL) // clusters already exist annDeallocPts(centers); // get rid of them if (std_dev != NULL) // std deviations already exist annDeallocPts(std_dev); // get rid of them centers = annAllocPts(n_clus, dim); // alloc new clusters and devs std_dev = annAllocPts(n_clus, dim); for (int i = 0; i < n_clus; i++) { // gen cluster center coords for (int d = 0; d < dim; d++) { centers[i][d] = (ANNcoord) annRanUnif(-1,1); } } for (int c = 0; c < n_clus; c++) { // generate cluster std dev int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat for (int d = 0; d < dim; d++) { // generate std dev's // prob. of picking next dim double Prob = ((double) n_dim)/((double) (dim-d)); if (annRan0() < Prob) { // add this one to ellipse // generate random std dev std_dev[c][d] = annRanUnif(std_dev_lo, std_dev_hi); n_dim--; // one fewer dim to fill } else { // don't take this one std_dev[c][d] = std_dev_small;// use small std dev } } } } int offset = 0; // next slot to fill for (int c = 0; c < n_clus; c++) { // generate clusters int pick = (n+c)/n_clus; // number of points to pick // generate the points genGauss(pa+offset, pick, dim, centers[c], std_dev[c]); offset += pick; // increment offset in array }}//----------------------------------------------------------------------// annPlanted - Generates points from a "planted" distribution// In high dimensional spaces, interpoint distances tend to be// highly clustered around the mean value. Approximate nearest// neighbor searching makes little sense in this context, unless it// is the case that each query point is significantly closer to its// nearest neighbor than to other points. Thus, the query points// should be planted close to the data points. Given a source data// set, this procedure generates a set of query points having this// property.//// We are given a source data array and a standard deviation. We// generate points as follows. We select a random point from the// source data set, and we generate a Gaussian point centered about// this random point and perturbed by a normal distributed random// variable with mean zero and the given standard deviation along// each coordinate.//// Note that this essentially the same a clustered Gaussian// distribution, but where the cluster centers are given by the// source data set.//----------------------------------------------------------------------void annPlanted( // planted nearest neighbors ANNpointArray pa, // point array (modified) int n, // number of points int dim, // dimension ANNpointArray src, // source point array int n_src, // source size double std_dev) // standard deviation about source{ for (int i = 0; i < n; i++) { int c = annRanInt(n_src); // generate source index for (int d = 0; d < dim; d++) { pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + src[c][d]); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -