📄 kctree.cpp

📁 高效的k-means算法实现
💻 CPP
📖 第 1 页 / 共 3 页
字号:
    int n_child = 0;				// n_data of child    KMpoint s_child = NULL;			// sum of child    double ssq_child = 0;			// sum of squares for child    n_data = 0;					// initialize no. points    						// process each child    for (int i = KM_LO; i <= KM_HI; i++) {    						// visit low child	child[i]->makeSums(n_child, s_child, ssq_child);	n_data += n_child;			// increment no. points	for (int d = 0; d < kcDim; d++) {	// update sum and sumSq	    sum[d] += s_child[d];	}	sumSq += ssq_child;    }    n = n_data;					// return results    theSum = sum;    theSumSq = sumSq;}//----------------------------------------------------------------------void KCleaf::makeSums(    int			&n,			// number of points (returned)    KMpoint		&theSum,		// sum (returned)    double		&theSumSq)		// sum of squares (returned){    assert(sum != NULL);			// should already be allocated    sumSq = 0;    for (int i = 0; i < n_data; i++) {		// compute sum	for (int d = 0; d < kcDim; d++) {	    KMcoord theCoord = kcPoints[bkt[i]][d];	    sum[d] += theCoord;	    sumSq += theCoord * theCoord;	}    }    n = n_data;					// return results    theSum = sum;    theSumSq = sumSq;}//----------------------------------------------------------------------//  kc-tree destructors - deletes kc-tree //	(The other elements of the underlying kd-tree are deleted along//	with the base class.)////	Because of our "dirty trick" of recasting all child pointers//	to type KCptr (from KMkd_ptr), we must be careful to not//	allow the kd-tree constructors apply themselves recursively//	to the child nodes.  We do this by setting the child pointers//	to NULL after deleting them, which keeps the KMkd_split//	destructor from activating itself.//----------------------------------------------------------------------KCtree::~KCtree()		// tree destructor{    if (root != NULL) delete root;    if (pidx != NULL) delete [] pidx;}KCnode::~KCnode()		// node destructor{    if (sum != NULL) kmDeallocPt(sum);	// deallocate sum}//----------------------------------------------------------------------//  Sample a center point//	This implements an approach suggested by Matoushek for sampling//	a center point.  A node of the kd-tree is selected at random.//	If this is an interior node, a point is sampled uniformly from a//	3x expansion of the cell about its center.  If the node is a//	leaf, then a data point is sampled at random from the associated//	bucket.//	//	Here is how sampling is done from an interior node.  Let m//	denote the number of data points descended from this node.  Then//	with probability 1/(2m-1), this cell is chosen.  Otherwise, let//	mL and mR denote the number of points associated with the left//	and right subtrees, respectively.  We sample from the left//	subtree with probability (2mL-1)/(2m-1) and sample from the//	right subtree with probability (2mR-1)/(2m-1).////	The rationale is that, assuming there is exactly one point per//	leaf node, a subtree with m points has exactly 2m-1 total nodes.//	(This should be true for this implementation, but it depends in//	general on the fact that there is exactly one point per leaf//	node.)  Hence the root should be sampled with probability//	1/(2m-1), and the subtrees should be sampled with the given//	probabilities.//----------------------------------------------------------------------void KCtree::sampleCtr(KMpoint c)		// sample a point{    initBasicGlobals(dim, n_pts, pts);		// initialize globals    // TODO: bb_save check is just for debugging.    KMorthRect bb_save(dim, bnd_box);		// save bounding box    root->sampleCtr(c, bnd_box);		// start at root    for (int i = 0; i < dim; i++) {		// check that bnd_box unchanged	assert(bb_save.lo[i] == bnd_box.lo[i] &&	       bb_save.hi[i] == bnd_box.hi[i]);    }}void KCsplit::sampleCtr(			// sample from splitting node    KMpoint		c,			// the sampled point (returned)    KMorthRect		&bnd_box)		// bounding box for current node{    int r = kmRanInt(n_nodes());		// random integer [0..n_nodes-1]    if (r == 0) {				// sample from this node	KMorthRect expBox(kcDim);	bnd_box.expand(kcDim, 3, expBox);	// compute 3x expanded box	expBox.sample(kcDim, c);		// sample c from box    }    else if (r <= child[KM_LO]->n_nodes()) {	// sample from left	KMcoord save = bnd_box.hi[cut_dim];	// save old upper bound	bnd_box.hi[cut_dim] = cut_val;		// modify for left subtree	child[KM_LO]->sampleCtr(c, bnd_box);	bnd_box.hi[cut_dim] = save;		// restore upper bound    }    else {					// sample from right subtree	KMcoord save = bnd_box.lo[cut_dim];	// save old lower bound	bnd_box.lo[cut_dim] = cut_val;		// modify for right subtree	child[KM_HI]->sampleCtr(c,  bnd_box);	bnd_box.lo[cut_dim] = save;		// restore lower bound    }}void KCleaf::sampleCtr(				// sample from leaf node    KMpoint		c,			// the sampled point (returned)    KMorthRect		&bnd_box)		// bounding box for current node{    int ri = kmRanInt(n_data);			// generate random index    kmCopyPt(kcDim, kcPoints[bkt[ri]], c);	// copy to destination}//----------------------------------------------------------------------//  Printing the kc-tree //	These routines print a kc-tree in reverse inorder (high then//	root then low).  (This is so that if you look at the output//	from the right side it appear from left to right in standard//	inorder.)  When outputting leaves we output only the point//	indices rather than the point coordinates.  There is an option//	to print the point coordinates separately.////	The tree printing routine calls the printing routines on the//	individual nodes of the tree, passing in the level or depth//	in the tree.  The level in the tree is used to print indentation//	for readability.//----------------------------------------------------------------------void KCsplit::print(		// print splitting node    int		level)			// depth of node in tree{    					// print high child    child[KM_HI]->print(level+1);    *kmOut << "    ";			// print indentation    for (int i = 0; i < level; i++)	*kmOut << ".";    kmOut->precision(4);    *kmOut << "Split"			// print without address        << " cd=" << cut_dim << " cv=" << setw(6) << cut_val       	<< " nd=" << n_data       	<< " sm=";  kmPrintPt(sum, kcDim, true);    *kmOut << " ss=" << sumSq << "\n";    					// print low child    child[KM_LO]->print(level+1);}//----------------------------------------------------------------------void KCleaf::print(			// print leaf node    int		level)			// depth of node in tree{    *kmOut << "    ";    for (int i = 0; i < level; i++)	// print indentation	*kmOut << ".";    // *kmOut << "Leaf <" << (void*) this << ">";    *kmOut << "Leaf";			// print without address    *kmOut << " n=" << n_data << " <";    for (int j = 0; j < n_data; j++) {	*kmOut << bkt[j];	if (j < n_data-1) *kmOut << ",";    }    *kmOut << ">"       	<< " sm=";  kmPrintPt(sum, kcDim, true);    *kmOut << " ss=" << sumSq << "\n";}//----------------------------------------------------------------------void KCtree::print(			// print entire tree    bool	with_pts)			// print points as well?{    if (with_pts) {			// print point coordinates	*kmOut << "    Points:\n";	for (int i = 0; i < n_pts; i++) {	    *kmOut << "\t" << i << ": ";	    kmPrintPt(pts[i], kcDim, true);            *kmOut << "\n";	}    }    if (root == NULL)			// empty tree?	*kmOut << "    Null tree.\n";    else {    	root->print(0);			// invoke printing at root    }}//----------------------------------------------------------------------// DistGlobals - globals used in computing distortions// 	To prevent long argument lists in the computation of// 	distortions, we store a number of common global variables here.// 	These are initialized in KCtree::getNeighbors.//// 	Note: kcDim and kcPoints (from Basic Globals) are used as well.//----------------------------------------------------------------------int		kcKCtrs;		// number of centersint*		kcWeights;		// weights of each pointKMpointArray	kcCenters;		// the center pointsKMpointArray	kcSums;			// sumsdouble*		kcSumSqs;		// sum of squaresdouble*		kcDists;		// distortionsKMpoint		kcBoxMidpt;		// bounding-box midpoint//----------------------------------------------------------------------//  initDistGlobals - initialize distortion globals//----------------------------------------------------------------------static void initDistGlobals(		// initialize distortion globals    KMfilterCenters& ctrs)			// the centers{    initBasicGlobals(ctrs.getDim(), ctrs.getNPts(), ctrs.getDataPts());    kcKCtrs	= ctrs.getK();    kcCenters	= ctrs.getCtrPts();		// get ptrs to KMcenter arrays    kcWeights	= ctrs.getWeights(false);    kcSums	= ctrs.getSums(false);    kcSumSqs	= ctrs.getSumSqs(false);    kcDists	= ctrs.getDists(false);    kcBoxMidpt  = kmAllocPt(kcDim);    for (int j = 0; j < kcKCtrs; j++) {		// initialize sums	kcWeights[j] = 0;	kcSumSqs[j] = 0;	for (int d = 0; d < kcDim; d++) {    	    kcSums[j][d] = 0;	}    }}static void deleteDistGlobals()		// delete distortion globals{    kmDeallocPt(kcBoxMidpt);}//----------------------------------------------------------------------// getNeighbors - get neighbors for each candidate//	This is the heart of the filter-based k-means algorithm.  It is//	given an array of centers (ctrs) and an array of center sums//	(sums), and an array of sums of squares (sumSqs).  All three//	arrays consist of k points.  It computes the sum, sum of//	squares, and weights of all the neighbors of each center, and//	stores the results in these arrays.  From these quantities, the//	final centroid and distortion (mean squared error) can be//	computed.////	This is done by determining the set of candidates for each node//	in the kc-tree.  When the number of candidates for a node is//	equal to 1 (it cannot be 0) then all of the points in the//	subtree rooted at this node are assigned as neighbors to this//	center.  This means that the centroid and weight for this cell//	is added into the neighborhood centroid sum for this center.  If//	this node is a leaf, then we compute (by brute-force) the//	distance from each candidate to each data point, and assign the//	data point to the closest center.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -