📄 kctree.cpp
字号:
int n_child = 0; // n_data of child KMpoint s_child = NULL; // sum of child double ssq_child = 0; // sum of squares for child n_data = 0; // initialize no. points // process each child for (int i = KM_LO; i <= KM_HI; i++) { // visit low child child[i]->makeSums(n_child, s_child, ssq_child); n_data += n_child; // increment no. points for (int d = 0; d < kcDim; d++) { // update sum and sumSq sum[d] += s_child[d]; } sumSq += ssq_child; } n = n_data; // return results theSum = sum; theSumSq = sumSq;}//----------------------------------------------------------------------void KCleaf::makeSums( int &n, // number of points (returned) KMpoint &theSum, // sum (returned) double &theSumSq) // sum of squares (returned){ assert(sum != NULL); // should already be allocated sumSq = 0; for (int i = 0; i < n_data; i++) { // compute sum for (int d = 0; d < kcDim; d++) { KMcoord theCoord = kcPoints[bkt[i]][d]; sum[d] += theCoord; sumSq += theCoord * theCoord; } } n = n_data; // return results theSum = sum; theSumSq = sumSq;}//----------------------------------------------------------------------// kc-tree destructors - deletes kc-tree // (The other elements of the underlying kd-tree are deleted along// with the base class.)//// Because of our "dirty trick" of recasting all child pointers// to type KCptr (from KMkd_ptr), we must be careful to not// allow the kd-tree constructors apply themselves recursively// to the child nodes. We do this by setting the child pointers// to NULL after deleting them, which keeps the KMkd_split// destructor from activating itself.//----------------------------------------------------------------------KCtree::~KCtree() // tree destructor{ if (root != NULL) delete root; if (pidx != NULL) delete [] pidx;}KCnode::~KCnode() // node destructor{ if (sum != NULL) kmDeallocPt(sum); // deallocate sum}//----------------------------------------------------------------------// Sample a center point// This implements an approach suggested by Matoushek for sampling// a center point. A node of the kd-tree is selected at random.// If this is an interior node, a point is sampled uniformly from a// 3x expansion of the cell about its center. If the node is a// leaf, then a data point is sampled at random from the associated// bucket.// // Here is how sampling is done from an interior node. Let m// denote the number of data points descended from this node. Then// with probability 1/(2m-1), this cell is chosen. Otherwise, let// mL and mR denote the number of points associated with the left// and right subtrees, respectively. We sample from the left// subtree with probability (2mL-1)/(2m-1) and sample from the// right subtree with probability (2mR-1)/(2m-1).//// The rationale is that, assuming there is exactly one point per// leaf node, a subtree with m points has exactly 2m-1 total nodes.// (This should be true for this implementation, but it depends in// general on the fact that there is exactly one point per leaf// node.) Hence the root should be sampled with probability// 1/(2m-1), and the subtrees should be sampled with the given// probabilities.//----------------------------------------------------------------------void KCtree::sampleCtr(KMpoint c) // sample a point{ initBasicGlobals(dim, n_pts, pts); // initialize globals // TODO: bb_save check is just for debugging. KMorthRect bb_save(dim, bnd_box); // save bounding box root->sampleCtr(c, bnd_box); // start at root for (int i = 0; i < dim; i++) { // check that bnd_box unchanged assert(bb_save.lo[i] == bnd_box.lo[i] && bb_save.hi[i] == bnd_box.hi[i]); }}void KCsplit::sampleCtr( // sample from splitting node KMpoint c, // the sampled point (returned) KMorthRect &bnd_box) // bounding box for current node{ int r = kmRanInt(n_nodes()); // random integer [0..n_nodes-1] if (r == 0) { // sample from this node KMorthRect expBox(kcDim); bnd_box.expand(kcDim, 3, expBox); // compute 3x expanded box expBox.sample(kcDim, c); // sample c from box } else if (r <= child[KM_LO]->n_nodes()) { // sample from left KMcoord save = bnd_box.hi[cut_dim]; // save old upper bound bnd_box.hi[cut_dim] = cut_val; // modify for left subtree child[KM_LO]->sampleCtr(c, bnd_box); bnd_box.hi[cut_dim] = save; // restore upper bound } else { // sample from right subtree KMcoord save = bnd_box.lo[cut_dim]; // save old lower bound bnd_box.lo[cut_dim] = cut_val; // modify for right subtree child[KM_HI]->sampleCtr(c, bnd_box); bnd_box.lo[cut_dim] = save; // restore lower bound }}void KCleaf::sampleCtr( // sample from leaf node KMpoint c, // the sampled point (returned) KMorthRect &bnd_box) // bounding box for current node{ int ri = kmRanInt(n_data); // generate random index kmCopyPt(kcDim, kcPoints[bkt[ri]], c); // copy to destination}//----------------------------------------------------------------------// Printing the kc-tree // These routines print a kc-tree in reverse inorder (high then// root then low). (This is so that if you look at the output// from the right side it appear from left to right in standard// inorder.) When outputting leaves we output only the point// indices rather than the point coordinates. There is an option// to print the point coordinates separately.//// The tree printing routine calls the printing routines on the// individual nodes of the tree, passing in the level or depth// in the tree. The level in the tree is used to print indentation// for readability.//----------------------------------------------------------------------void KCsplit::print( // print splitting node int level) // depth of node in tree{ // print high child child[KM_HI]->print(level+1); *kmOut << " "; // print indentation for (int i = 0; i < level; i++) *kmOut << "."; kmOut->precision(4); *kmOut << "Split" // print without address << " cd=" << cut_dim << " cv=" << setw(6) << cut_val << " nd=" << n_data << " sm="; kmPrintPt(sum, kcDim, true); *kmOut << " ss=" << sumSq << "\n"; // print low child child[KM_LO]->print(level+1);}//----------------------------------------------------------------------void KCleaf::print( // print leaf node int level) // depth of node in tree{ *kmOut << " "; for (int i = 0; i < level; i++) // print indentation *kmOut << "."; // *kmOut << "Leaf <" << (void*) this << ">"; *kmOut << "Leaf"; // print without address *kmOut << " n=" << n_data << " <"; for (int j = 0; j < n_data; j++) { *kmOut << bkt[j]; if (j < n_data-1) *kmOut << ","; } *kmOut << ">" << " sm="; kmPrintPt(sum, kcDim, true); *kmOut << " ss=" << sumSq << "\n";}//----------------------------------------------------------------------void KCtree::print( // print entire tree bool with_pts) // print points as well?{ if (with_pts) { // print point coordinates *kmOut << " Points:\n"; for (int i = 0; i < n_pts; i++) { *kmOut << "\t" << i << ": "; kmPrintPt(pts[i], kcDim, true); *kmOut << "\n"; } } if (root == NULL) // empty tree? *kmOut << " Null tree.\n"; else { root->print(0); // invoke printing at root }}//----------------------------------------------------------------------// DistGlobals - globals used in computing distortions// To prevent long argument lists in the computation of// distortions, we store a number of common global variables here.// These are initialized in KCtree::getNeighbors.//// Note: kcDim and kcPoints (from Basic Globals) are used as well.//----------------------------------------------------------------------int kcKCtrs; // number of centersint* kcWeights; // weights of each pointKMpointArray kcCenters; // the center pointsKMpointArray kcSums; // sumsdouble* kcSumSqs; // sum of squaresdouble* kcDists; // distortionsKMpoint kcBoxMidpt; // bounding-box midpoint//----------------------------------------------------------------------// initDistGlobals - initialize distortion globals//----------------------------------------------------------------------static void initDistGlobals( // initialize distortion globals KMfilterCenters& ctrs) // the centers{ initBasicGlobals(ctrs.getDim(), ctrs.getNPts(), ctrs.getDataPts()); kcKCtrs = ctrs.getK(); kcCenters = ctrs.getCtrPts(); // get ptrs to KMcenter arrays kcWeights = ctrs.getWeights(false); kcSums = ctrs.getSums(false); kcSumSqs = ctrs.getSumSqs(false); kcDists = ctrs.getDists(false); kcBoxMidpt = kmAllocPt(kcDim); for (int j = 0; j < kcKCtrs; j++) { // initialize sums kcWeights[j] = 0; kcSumSqs[j] = 0; for (int d = 0; d < kcDim; d++) { kcSums[j][d] = 0; } }}static void deleteDistGlobals() // delete distortion globals{ kmDeallocPt(kcBoxMidpt);}//----------------------------------------------------------------------// getNeighbors - get neighbors for each candidate// This is the heart of the filter-based k-means algorithm. It is// given an array of centers (ctrs) and an array of center sums// (sums), and an array of sums of squares (sumSqs). All three// arrays consist of k points. It computes the sum, sum of// squares, and weights of all the neighbors of each center, and// stores the results in these arrays. From these quantities, the// final centroid and distortion (mean squared error) can be// computed.//// This is done by determining the set of candidates for each node// in the kc-tree. When the number of candidates for a node is// equal to 1 (it cannot be 0) then all of the points in the// subtree rooted at this node are assigned as neighbors to this// center. This means that the centroid and weight for this cell// is added into the neighborhood centroid sum for this center. If// this node is a leaf, then we compute (by brute-force) the// distance from each candidate to each data point, and assign the// data point to the closest center.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -