📄 vptree.c
字号:
/* * vptree.c * * VP-tree * * - 1) Dataset reading * - 2) Building VP-Tree * - 3) Read/Write VP-Tree from/to a file * - 4) K-NNSearch * - 5) Releasing Resource * * By Philip Fu * * Fri Jan 12 22:58:44 EST 2001 * */#include <stdio.h>#include <stdlib.h>#include <limits.h>#include <math.h>#include <assert.h>#include "vptree.h"#include "qsort.h"#include "standard.h"////////////////////////////////////////////////////////////////// Function Definitions//////////////////////////////////////////////////////////////////#define _DEBUG/////////////////////////////////////////////////////////// (1) Reading the dataset/////////////////////////////////////////////////////////DataSet *readDataSet(int n, int dim, char *dataFile){ FILE *fp; double *ptr,f; DataSet *dataSet; char buf[255]; int i; ///////////////////////////////////////////// // (1) Open the dataFile fp = fopen(dataFile,"r"); if (fp == NULL) { printf("Err : Cannot open file [%s] for reading!\n",dataFile); return NULL; } ///////////////////////////////////////////// // (2) Initialize dataSet dataSet = (DataSet *) malloc(sizeof(DataSet)); if (dataSet == NULL) { printf("Err : not enough memory for data allocation!\n\n"); return NULL; } ///////////////////////////////////////////// // (3) Read the header // read number of points // fgets(buf,255,fp); // sscanf(buf,"%d",&i); dataSet->nPoints = n; // read dimension // fgets(buf,255,fp); // sscanf(buf,"%d",&i); dataSet->dimension = dim; //printf("number of points : %d\n",dataSet->nPoints); //printf("dimension : %d\n",dataSet->dimension); ///////////////////////////////////////////// // (4) Read the points dataSet->points = (double *) malloc(sizeof(double)*(dataSet->nPoints)*(dataSet->dimension)); if (dataSet->points == NULL) { printf("Err : not enough memory for data allocation!\n\n"); return NULL; } ptr = dataSet->points; for (i=0; i<(dataSet->nPoints)*(dataSet->dimension); i++) { fscanf(fp,"%lf",&f); *ptr++ = f; } ///////////////////////////////////////////// // (5) Done fclose(fp); return dataSet;}/////////////////////////////////////////////////////////// (2) Building the VP Tree/////////////////////////////////////////////////////////doublesqrDist(double *p1, double *p2, int dim){ double dist; int i; dist = pow( ((*p1++) - (*p2++)), 2.0 ); for (i=1; i<dim; i++) dist += pow( ((*p1++) - (*p2++)), 2.0 ); return dist;}// Building a VP NodeVPNode *buildVPNode(double *points, int *dataID, int nPoints, int numBranch, int dimension){ VPNode *vpNode; int i,j; double *distances; int nSubset; int *subset; double stddev,bestStddev,dist; int vp,bestVP; double *ptrVP, *ptrPt, *ptrDist; int *sortList; double *tmpPt; int *tmpID, *ptrID; int lastIndex,childIndex; #ifdef _DEBUG printf("\nEnter buildVPNode (nPoints : %d) and (address %d)\n", nPoints,points); printf("dataID : %d\n",dataID[0]); #endif ///////////////////////////////////////////// // (1) Who am I? vpNode = (VPNode *) malloc(sizeof(VPNode)); if (vpNode == NULL) errexit("Err : Not enough memory for allocation!\n\n"); ///////////////////////////////////////////// // (2) Fill in information // avoid leakage when free() vpNode->child = NULL; vpNode->medians = NULL; vpNode->points = NULL; vpNode->dataID = NULL; // should we stop? if (nPoints <=_MAX_POINTS_NODE) { vpNode->isLeaf = _TRUE; // alloc. data points in leaf node vpNode->nPoints = nPoints; vpNode->points = (double *) malloc(sizeof(double)*dimension*nPoints); vpNode->dataID = (int *) malloc(sizeof(int)*nPoints); if (!vpNode->points || !vpNode->dataID) errexit("Err : Not enough memory for allocation!\n\n"); memcpy(vpNode->points, points, sizeof(double)*dimension*nPoints); memcpy(vpNode->dataID, dataID, sizeof(int)*nPoints); return vpNode; } else { vpNode->isLeaf = _FALSE; // alloc. vp vpNode->nPoints = 1; vpNode->points = (double *) malloc(sizeof(double)*dimension); vpNode->dataID = (int *) malloc(sizeof(int)); if (!vpNode->points || !vpNode->dataID) errexit("Err : Not enough memory for allocation!\n\n"); // * vp to be found later } // Initialize distances distances = (double *) malloc(sizeof(double)*(nPoints-1)); if (!distances) errexit("Err : Not enough memory for allocation!\n\n"); ///////////////////////////////////////////// // (3) Find the vantage point // first find a random "subset" in the dataset if (nPoints <= _NUM_PT_RANDOM_SUBSET) { // no need to find a random subset nSubset = nPoints; subset = (int *) malloc(sizeof(int)*nSubset); if (subset == NULL) errexit("Err : Not enough memory for allocation!\n\n"); for (i=0; i<nSubset; i++) subset[i] = i; } else { // find a random subset nSubset = _NUM_PT_RANDOM_SUBSET; subset = (int *) malloc(sizeof(int)*nSubset); if (subset == NULL) errexit("Err : Not enough memory for allocation!\n\n"); srand(time(NULL)); for (i=0; i<nSubset; i++) subset[i] = rand() % nPoints; } // Randomize some candidate vps and the one // with the largest standard dev. will be the VP srand(time(NULL)); // initialize bestStddev = 0.0; bestVP = -1; for (j=0; j<_NUM_CAND_VP; j++) { // randomize a candidate vp vp = rand() % nPoints; ptrVP = points + vp*dimension; // measure the standard dev. against this vp dist = 0.0; for (i=0; i<nSubset; i++) dist += sqrDist( ptrVP, points + subset[i]*dimension, dimension ); stddev = sqrt(dist/nSubset); // largest? if (bestStddev < stddev) { bestStddev = stddev; bestVP = vp; } } free(subset); #ifdef _DEBUG printf("bestVP : %d at (%f %f %f) with stdev %f\n", bestVP,ptrVP[0],ptrVP[1],ptrVP[2],bestStddev); #endif ///////////////////////////////////////////// // (4) Make the VP at the 1st // position by swapping // copy from (bestVP)th to vpNode memcpy(vpNode->points, points + bestVP*dimension, sizeof(double)*dimension); vpNode->dataID[0] = dataID[bestVP]; // copy from 0th element to (bestVP)th memcpy(points + bestVP*dimension, points, sizeof(double)*dimension); dataID[bestVP] = dataID[0]; // copy from vpNode to 0th element memcpy(points, vpNode->points, sizeof(double)*dimension); dataID[0] = vpNode->dataID[0]; ///////////////////////////////////////////// // (5) Find the distance of points to the vp ptrPt = points + dimension; // skip the 1st vp ptrDist = distances; for (i=0; i<nPoints-1; i++) { *ptrDist++ = sqrt( sqrDist(points, ptrPt, dimension) ); ptrPt += dimension; #ifdef _DEBUG printf("cal dist. %f (%f %f %f)\n",*(ptrDist-1), *(ptrPt-3),*(ptrPt-2),*(ptrPt-1)); #endif } ///////////////////////////////////////////// // (6) Sort the points in the array in the // ascending order of their corr. distances // initialize sortList = (int *) malloc(sizeof(int)*(nPoints-1)); if (sortList == NULL) errexit("Err : Not enough memory for allocation!\n\n"); for (i=0; i<nPoints-1; i++) sortList[i] = i; // sorting (find the sort index) //bubbleSortByIndex(&sortList, distances, nPoints-1); q_sort(sortList,distances,0,nPoints-2,nPoints-1); // rearrange the points -> tmpPt and dataID -> tmpID tmpPt = (double *) malloc(sizeof(double)*(nPoints-1)*dimension); tmpID = (int *) malloc(sizeof(int)*(nPoints-1)); if (!tmpPt || !tmpID) errexit("Err : Not enough memory for allocation!\n\n"); ptrPt = tmpPt; ptrID = tmpID; for (i=0; i<nPoints-1; i++) { memcpy( ptrPt, points + (sortList[i]+1)*dimension, // +1 to skip 1st vp sizeof(double)*dimension ); *ptrID++ = dataID[sortList[i]+1]; ptrPt += dimension; } // update points and dataID memcpy(points+dimension, tmpPt, sizeof(double)*(nPoints-1)*dimension); memcpy(dataID+1, tmpID, sizeof(int)*(nPoints-1)); free(tmpPt); free(tmpID); #ifdef _DEBUG for (i=0; i<nPoints-1; i++) printf("dist[%02d] = %f\n",i,distances[i]); for (i=0; i<nPoints-1; i++) printf("sort[%02d] = %d\n",i,sortList[i]); #endif ///////////////////////////////////////////// // (7) Fill in medians // initialize medians vpNode->medians = (double *) malloc(sizeof(double)*(numBranch-1)); if (!vpNode->medians) errexit("Err : Not enough memory for allocation!\n\n"); childIndex = 1; for (i=0; i<numBranch-1; i++) { // last point in this branch lastIndex = 1 + (nPoints-1)*(i+1) / numBranch - 1; if (i == numBranch-1) lastIndex = nPoints - 1; // set median vpNode->medians[i] = distances[lastIndex-1]; // next branch childIndex = lastIndex+1; //printf("vpNode->medians[%d] = %f\n",i,vpNode->medians[i]); } free(distances); free(sortList); ///////////////////////////////////////////// // (8) Build VPNode of each child // initialize childs vpNode->child = (VPNode **) malloc(sizeof(VPNode *)*numBranch); if (! vpNode->child) errexit("Err : Not enough memory for allocation!\n\n"); childIndex = 1; for (i=0; i<numBranch; i++) { // last point in this branch lastIndex = 1 + (nPoints-1)*(i+1) / numBranch - 1; if (i == numBranch-1) lastIndex = nPoints - 1; #ifdef _DEBUG printf("from %d to %d\n",childIndex,lastIndex); #endif // set child vpNode->child[i] = buildVPNode(points+childIndex*dimension, dataID+childIndex, lastIndex-childIndex+1, numBranch, dimension); #ifdef _DEBUG if (i != numBranch-1) printf("median : %f\n",vpNode->medians[i]); #endif // next branch childIndex = lastIndex+1; } return vpNode;}// External Function ://// Building the VP Tree// - if numBranch <= 1, use _DEFAULT_BRANCH// - note that ordering of points in dataSet is changed after calling// (return the root node)VPTree *buildVPTree(DataSet *dataSet, int numBranch){ VPTree *vpTree; int *dataID, i; /////////////////////////////////////////// // Allocate memory vpTree = (VPTree *) malloc(sizeof(VPTree)); dataID = (int *) malloc(sizeof(int)*dataSet->nPoints); if (!vpTree || !dataID) errexit("Err : Not enough memory for allocation!\n\n"); /////////////////////////////////////////// // (2) Initialization if (numBranch <= 1) numBranch = _DEFAULT_BRANCH; vpTree->numBranch = numBranch; vpTree->dimension = dataSet->dimension; // dataIDs' keep track of where the points goes in the VP tree for (i=0; i<dataSet->nPoints; i++) dataID[i] = i;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -