📄 bulkload.cpp

📁 M树源代码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*********************************************************************
*                                                                    *
* Copyright (c) 1997,1998, 1999                                      *
* Multimedia DB Group and DEIS - CSITE-CNR,                          *
* University of Bologna, Bologna, ITALY.                             *
*                                                                    *
* All Rights Reserved.                                               *
*                                                                    *
* Permission to use, copy, and distribute this software and its      *
* documentation for NON-COMMERCIAL purposes and without fee is       *
* hereby granted provided  that this copyright notice appears in     *
* all copies.                                                        *
*                                                                    *
* THE AUTHORS MAKE NO REPRESENTATIONS OR WARRANTIES ABOUT THE        *
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING  *
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,      *
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHOR  *
* SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A      *
* RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS    *
* DERIVATIVES.                                                       *
*                                                                    *
*********************************************************************/

#ifdef UNIX
#include <unistd.h>
#endif
#include "MT.h"

extern double MIN_UTIL;

// find the node having the minimum number of children
// this is used in the redistributing phase of the BulkLoad algorithm
int
FindMin(int *children, int max)
{
	int j, jmin=0;

	for(j=1; j<max; j++)
		if(children[j]<children[jmin]) jmin=j;
	return jmin;
}

// return root level+1 (the height of the tree)
// this is used in the "splitting" phase of the BulkLoad algorithm
int
MT::MaxLevel() const
{
	GiSTnode *root;
	GiSTpath path;
	int i;

	path.MakeRoot();
	root=ReadNode(path);
	i=root->Level();
	delete root;
	return i+1;
}

// split this M-tree into a list of trees having height level
// this is used in the "splitting" phase of the BulkLoad algorithm
GiSTlist<char *> *
MT::SplitTree(int *ncreated, int level, GiSTlist<MTentry *> *children, char *name)
// ncreated is the number of created sub-trees,
// level is the split level for the tree,
// children is the list of the parents of each sub-tree,
// name is the root for the sub-trees names
// the return value is the list of splitted sub-trees
{
	GiSTlist<char *> *trees=new GiSTlist<char *>;	// this is the results list
	GiSTlist<MTnode *> *oldList=new GiSTlist<MTnode *>;	// this is the nodes list
	GiSTpath path;
	MTnode *node=new MTnode;	// this is because the first operation on node is a delete
	char newname[50];

	path.MakeRoot();
	oldList->Append((MTnode *)ReadNode(path));
	do {	// build the roots list
		GiSTlist<MTnode *> *newList=new GiSTlist<MTnode *>;	// this is the list of the current level nodes

		while(!oldList->IsEmpty()) {
			delete node;	// delete the old node created by ReadNode
			node=oldList->RemoveFront();	// retrieve next node to be examined
			path=node->Path();
			for(int i=0; i<node->NumEntries(); i++) {	// append all its children to the new list
				MTentry *e=(MTentry *)(*node)[i].Ptr();

				path.MakeChild(e->Ptr());
				newList->Append((MTnode *)ReadNode(path));
				path.MakeParent();
			}
		}
		delete oldList;
		oldList=newList;
	} while(node->Level()>level);	// stop if we're at the split level
	delete node;
	while(!oldList->IsEmpty()) {	// now append each sub-tree to its root
		MT *subtree=new MT;
		MTnode *newnode;

		node=oldList->RemoveFront();
		sprintf(newname, "%s.%i", name, ++(*ncreated));
		unlink(newname);	// if this M-tree already exists, delete it
		subtree->Create(newname);	// create a new M-tree
		path.MakeRoot();
		newnode=(MTnode *)subtree->ReadNode(path);	// read the root of the tree
		subtree->Append(newnode, (MTnode *)node->Copy());	// append the sub-tree of current node to the root of this M-tree
		children->Append(node->Entry());	// insert the root entry into the list
		trees->Append(strdup(newname));	// insert the new M-tree name into the list
		delete node;
		delete newnode;
		delete subtree;
	}
	delete oldList;
	return trees;
}

// load this M-tree with n data using the BulkLoad algorithm [CP98]
void
MT::BulkLoad(MTentry **data, int n, double padFactor, char *name)
// data is an array of n entries
// padFactor is the maximum node utilization (use 1)
// name is the name of the tree
{
	int i, Size=0, totSize, addEntrySize=(sizeofEntry()? sizeof(GiSTpage): sizeof(GiSTlte)+sizeof(GiSTpage)), minSize=(int)(Store()->PageSize()*MIN_UTIL), NumEntries;	// this is the total size of entries

	if(sizeofEntry()) Size=n*(sizeof(GiSTpage)+sizeofEntry());	// (only valid if we've fixed size entries)
	else for(i=0; i<n; i++) Size+=sizeof(GiSTlte)+sizeof(GiSTpage)+data[i]->CompressedLength();
	totSize=Size+GIST_PAGE_HEADER_SIZE+sizeof(GiSTlte);
	NumEntries=(int)(Store()->PageSize()*padFactor*n)/totSize;
//	cout << "exp. size=" << totSize << endl;
	if(totSize>Store()->PageSize()) {	// we need to split the entries into several sub-trees
		GiSTlist<char *> nameslist, othernameslist;	// list of the sub-trees names
		GiSTlist<MTentry *> plist, parentslist;	// lists of the root entries of each sub-tree
		GiSTlist<int> *lists=NULL;	// list of entries for each sub-tree
		GiSTlist<double> *dists=NULL;	// list of distances for each sub-tree
		char **trees;	// array of the sub-trees names
		GiSTlist<MTnode *> *oldList=new GiSTlist<MTnode *>;
		GiSTpath path;
		MTentry ***arrays;	// array of the entries for each sub-tree
		MTentry **parents;	// array of the root entries for each sub-tree
		MTnode *node=NULL;
		GiSTlist<double *> *distm;	// distance matrix
		int s=(int)MAX(MIN(NumEntries, ceil(((float)n)/NumEntries)), NumEntries*MIN_UTIL);	// initial number of samples
		int j, nsamples, *samples=new int[s], *sizes=NULL, *ns=NULL, ncreated=0, minLevel=MAXINT, nInit, l, iters=0, MAXITER=s*s;
		char newname[50];
		BOOL *sampled=new BOOL[n];	// is this entry in the samples set?

//		cout << "NE*pF=" << NumEntries*padFactor << ", n/NE*pF=" << n/floor(NumEntries*padFactor) << ", s=" << s << endl;
		distm=(GiSTlist<double *> *)calloc(s,sizeof(GiSTlist<double *>));
		do {	// sampling phase
			iters++;
			if(iters>1) {	// this is a new sampling phase
//				cout << "Re-sampling: " << iters << "/" << MAXITER << endl;
				while(!lists[0].IsEmpty()) {
					lists[0].RemoveFront();
					dists[0].RemoveFront();
				}
				delete []lists;
				delete []dists;
				delete []sizes;
				delete []ns;
				while(!distm[0].IsEmpty()) delete []distm[0].RemoveFront();	// empty the distance list
				for(i=1; i<s; i++) distm[i].front=distm[i].rear=NULL;
			}
//			if(iters>=MAXITER) minSize=0;
			if(iters>=MAXITER) {
				cout << "Too many loops in BulkLoad!\nPlease select a lower minimum node utilization or a bigger node size.\n";
				exit(1);
			}
			nsamples=0;
			for(i=0; i<n; i++) sampled[i]=FALSE;
			// pick samples to create parents
			while(nsamples<s) {
				do i=PickRandom(0, n);
				while(sampled[i]);
				sampled[i]=TRUE;
				samples[nsamples++]=i;
			}
//			cout << "Samples:\n";
//			for(i=0; i<s; i++) cout << "\t" << i << ":\t" << data[samples[i]];
			lists=new GiSTlist<int>[s];
			dists=new GiSTlist<double>[s];
			sizes=new int[s];
			ns=new int[s];
			for(i=0; i<s; i++) {
				sizes[i]=GIST_PAGE_HEADER_SIZE+sizeof(GiSTlte);
				ns[i]=1;
			}
			for(i=0; i<s; i++) distm[i].Prepend(new double[s]);
			for(i=0; i<s; i++) {	// compute the relative distances between samples
				for(j=0; j<i; j++)
					distm[j].front->entry[i]=(distm[i].front->entry[j]=data[samples[j]]->object().distance(data[samples[i]]->object()));
				distm[i].front->entry[i]=0;
			}
			for(i=0; i<n; i++) {	// assign each entry to its nearest parent
//				cout << "Now assigning " << data[i];
				if(sampled[i]) {
					for(j=0; samples[j]!=i; j++);
					lists[j].Prepend(i);	// insert the entry in the right list
					dists[j].Prepend(0);
					sizes[j]+=addEntrySize+data[i]->CompressedLength();
//					cout << "\tAssigned (0) to " << j << ", " << data[samples[j]];
				}
				else {	// here we optimize the distance computations (like we do in the insert algorithm)
					double *dist=new double[s];
					int minindex=0;

					dist[0]=data[samples[0]]->object().distance(data[i]->object());
					for(j=1; j<s; j++) {
						BOOL cont=TRUE;

						dist[j]=-1;
						if(fabs(data[samples[j]]->Key()->distance-data[i]->Key()->distance)>dist[minindex]) continue;	// pruning for reference point (parent)
						for(int k=0; (k<j)&&cont; k++)	// pruning for reference points (other samples)
							if(dist[k]<0) continue;
							else cont=(fabs(dist[k]-distm[j].front->entry[k])<dist[minindex]);
						if(!cont) continue;
						dist[j]=data[samples[j]]->object().distance(data[i]->object());	// we have to compute this distance
						if(dist[j]<dist[minindex]) minindex=j;
					}
//					cout << "\tAssigned (" << dist[minindex] << ") to " << minindex << ", " << data[samples[minindex]];
					lists[minindex].Append(i);
					dists[minindex].Append(dist[minindex]);
					sizes[minindex]+=addEntrySize+data[i]->CompressedLength();
					ns[minindex]++;
					if(sizes[minindex]>=minSize) delete []dist;
					else distm[minindex].Append(dist);
				}
			}
			// redistribute underfilled parents
//			cout << "Underfilled parents redistribution...\n";
			while(sizes[i=FindMin(sizes, nsamples)]<minSize) {
				GiSTlist<int> list=lists[i];
				GiSTlist<double *> dlist=distm[i];

				while(!dists[i].IsEmpty()) dists[i].RemoveFront();
//				cout << "Redistributing " << i << "' set (" << sizes[i] << "/" << minSize << ")...\n";
				for(j=0; j<nsamples; j++)
					for(GiSTlistnode<double *> *lnode=distm[j].front; lnode; lnode=lnode->next) lnode->entry[i]=lnode->entry[nsamples-1];
				// substitute this set with last set 
				distm[i]=distm[nsamples-1];
				lists[i]=lists[nsamples-1];
				dists[i]=dists[nsamples-1];
				samples[i]=samples[nsamples-1];
				sizes[i]=sizes[nsamples-1];
				ns[i]=ns[nsamples-1];
				nsamples--;
				while(!list.IsEmpty()) {	// assign each entry to its nearest parent
					double *d=dlist.RemoveFront();
					int k=list.RemoveFront(), index, minindex=-1;
//					cout << "Now assigning " << data[k];

					for(index=0; (index<nsamples)&&(minindex<0); index++)	// search for a computed distance
						if(d[index]>0) minindex=index;
					if(minindex<0) {	// no distance was computed (i.e. all distances were pruned)
						d[0]=data[samples[0]]->object().distance(data[k]->object());
						minindex=0;
					}
					for(index=0; index<nsamples; index++) {
						BOOL cont=TRUE;

						if(index==minindex) continue;
						if(d[index]<0) {	// distance wasn't computed
							if(fabs(data[samples[index]]->Key()->distance-data[k]->Key()->distance)>d[minindex]) continue;	// pruning for reference point (parent)
							for(l=0; (l<index)&&cont; l++)	// pruning for reference points (other samples)
								if(d[l]<0) continue;
								else cont=(fabs(d[l]-distm[index].front->entry[l])<d[minindex]);
							if(!cont) continue;
							d[index]=data[samples[index]]->object().distance(data[k]->object());	// we have to compute this distance
						}
						if(d[index]<d[minindex]) minindex=index;
					}
//					cout << "\tAssigned (" << d[minindex] << ") to " << minindex << ", " << data[samples[minindex]];
					lists[minindex].Append(k);
					dists[minindex].Append(d[minindex]);
					sizes[minindex]+=addEntrySize+data[k]->CompressedLength();
					ns[minindex]++;
					if(sizes[minindex]>=minSize) delete []d;
					else distm[minindex].Append(d);
				}
				assert(dlist.IsEmpty());
			}
		} while(nsamples==1);	// if there's only one child, repeat the sampling phase
//		cout << "Samples:\n";
//		for(i=0; i<nsamples; i++) cout << "\t" << i << ": " << data[samples[i]];
		arrays=new MTentry **[nsamples];
		for(i=0; i<nsamples; i++) {	// convert the lists into arrays
			arrays[i]=new MTentry *[ns[i]];
			for(j=0; j<ns[i]; j++) {
				int k=lists[i].RemoveFront();

				arrays[i][j]=(MTentry *)data[k]->Copy();
				arrays[i][j]->Key()->distance=dists[i].RemoveFront();
			}
			assert(lists[i].IsEmpty());
			assert(dists[i].IsEmpty());
		}
		delete []dists;
		delete []lists;
		for(i=0; i<nsamples; i++)
			while(!distm[i].IsEmpty())
				delete [](distm[i].RemoveFront());
		free(distm);
		// build an M-tree under each parent
		nInit=nsamples;
//		cout << "Now building " << nsamples << " sub-trees...\n";
		MT *tree=new MT;

//		for(i=0; i<nsamples; i++) cout << i+1 << "' set: " << ns[i] << endl;
		for(i=0; i<nInit; i++) {
			MTnode *root;
			GiSTpath path;

			sprintf(newname, "%s.%i", name, ++ncreated);
			unlink(newname);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -