⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfkm.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
#include "vfml.h"

#include "vfkm-engine.h"

#include <stdio.h>
#include <string.h>

#include <sys/times.h>
#include <time.h>

#include <math.h>

char *gFileStem        = "DF";
char *gSourceDirectory = ".";
int   gDoTests         = 0;
int   gMessageLevel    = 0;
int   gNumClusters     = -1;
float gDelta           = 0.05;
float gErrorTarget     = 0.05;
float gThisErrorTarget = 0.05;
int   gEstimatedNumIterations = 5;
int   gBatch                  = 0;
long  gMaxExamplesPerIteration = 1000000000;
int   gInitNumber      = 1;
int   gFancyStop              = 1;
float gConvergeDelta          = 0.001;
int   gSeed             = -1;
float gR = 0;
float gAssignErrorScale = 1.0;
int   gAllowBadConverge = 0;
int   gUseNormalApprox  = 0;
long         gDBSize                 = 0;

int   gStdin            = 0;
int   gIterativeResults = 0;
int   gTestOnTrain      = 0; /* default test total center-matching distance */

/* HERE add parameters for these */
int          gDoEarlyStop            = 0;
int          gLoadCenters            = 0;
int          gReassignCentersEachRound = 0;

/* HERE some hackish globals & stuff*/
VoidAListPtr gStatsList;
int          gD;
int          gIteration = 0;
int          gRound     = 0;
int          gTotalExamplesSeen = 0;
float        gNeededDelta;
float        gTargetEkd;
long         gN;
float        *gIterationNs = 0;
int          gNumIterationNs;

static void _printUsage(char  *name) {
   printf("%s : Very Fast k-means clustering\n", name);

   printf("-f <filestem>\t Set the name of the dataset (default DF)\n");
   printf("-source <dir>\t Set the source data directory (default '.')\n");
   printf("-clusters <numClusters>\t Set the num clusters to find (REQUIRED)\n");
   printf("-dbSize <examples>\t How many examples are in the datafile (REQUIRED)\n");
   printf("-assignErrorScale <scale>\t Loosen bound by scaling the assignment part\n\t\t of it by the scale factor (default 1.0)\n");
   printf("-delta <delta>\t Find final solution with confidence (1 - <delta>) (default 5%%)\n");
   printf("-epsilon <epsilon>\t The maximum distance between a learned centroid and\n\t\t the coresponding infinite data centroid (default .05)\n");
   printf("-converge <distance>\t Stop when distance centroids move between iterations\n\t\t squared is less than <distance>  (default .001)\n");
   printf("-batch\t Do a traditional kmeans run on the whole input file.  Doesn't make\n\t\t sense to combine with -stdin (default off)\n");
   printf("-init <num>\t use the <num>th valid assignment of initial centroids\n\t\t (default 1)\n");
   printf("-maxPerIteration <num> \t use no more than num examples per iteration\n\t\t (default 1,000,000,000)\n");
   printf("-l <number>\t The estimated # of iterations to converge if you guess\n\t\t wrong and aren't using batch VFKM will fix it for you\n\t\t and do an extra round (default 10)\n");
   printf("-seed <s>\t Seed to use picking initial centers (default random)\n");
   printf("-progressive\t Don't use our fancy Lagrange based optimization pick the\n\t\t # of samples at each iteration of the next \n\t\tround (default use the optimization)\n");
   printf("-allowBadConverge\t Allows VFKM to converge at a time when km would\n\t\tnot (default off).\n");
   printf("-normalApprox\t use a normal approximation instead of the hoeffding bound\n\t\t (default hoeffding).\n");
   printf("-r <range>\t The maximum range between pairs of examples (default\n\t\t assume the range of each dimension is 0 - 1.0\n\t\t and calculate it from that)\n");
   printf("-stdin \t\t Reads training examples as a stream from stdin instead of from \n\t\t <stem>.data (default off)\n");
   printf("-testOnTrain \t loss is the sum of the square of the distances \n\t\t from all training examples to their assigned centroid (default\n\t\t compare learned centroids to centroids in <stem>.test)\n");
   printf("-loadCenters \t load initial centroids from <stem>.centers (default use random based on -init and -seed)\n");
   printf("-u\t\t Output results after each iteration\n");
   printf("-v\t\t Can be used multiple times to increase the debugging output\n");
}


static void _processArgs(int argc, char *argv[]) {
   int i;

   /* HERE on the ones that use the next arg make sure it is there */
   for(i = 1 ; i < argc ; i++) {
      if(!strcmp(argv[i], "-f")) {
         gFileStem = argv[i+1];
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-source")) {
         gSourceDirectory = argv[i+1];
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-u")) {
         gDoTests = 1;
      } else if(!strcmp(argv[i], "-testOnTrain")) {
         gTestOnTrain = 1;
      } else if(!strcmp(argv[i], "-clusters")) {
         sscanf(argv[i+1], "%d", &gNumClusters);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-dbSize")) {
         sscanf(argv[i+1], "%ld", &gDBSize);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-delta")) {
         sscanf(argv[i+1], "%f", &gDelta);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-converge")) {
         sscanf(argv[i+1], "%f", &gConvergeDelta);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-r")) {
         sscanf(argv[i+1], "%f", &gR);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-assignErrorScale")) {
         sscanf(argv[i+1], "%f", &gAssignErrorScale);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-epsilon")) {
         sscanf(argv[i+1], "%f", &gErrorTarget);
         gThisErrorTarget = gErrorTarget;
         /* ignore the next argument */
         i++;
//      } else if(!strcmp(argv[i], "-n")) {
//         sscanf(argv[i+1], "%ld", &gN);
//         /* ignore the next argument */
//         i++;
      } else if(!strcmp(argv[i], "-maxPerIteration")) {
         sscanf(argv[i+1], "%ld", &gMaxExamplesPerIteration);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-loadCenters")) {
         gLoadCenters = 1;
      } else if(!strcmp(argv[i], "-batch")) {
         gBatch = 1;
      } else if(!strcmp(argv[i], "-allowBadConverge")) {
         gAllowBadConverge = 1;
      } else if(!strcmp(argv[i], "-normalApprox")) {
         gUseNormalApprox = 1;
      } else if(!strcmp(argv[i], "-progressive")) {
         gFancyStop = 0;
      } else if(!strcmp(argv[i], "-init")) {
         sscanf(argv[i+1], "%d", &gInitNumber);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-l")) {
         sscanf(argv[i+1], "%d", &gEstimatedNumIterations);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-seed")) {
         sscanf(argv[i+1], "%d", &gSeed);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-v")) {
         gMessageLevel++;
      } else if(!strcmp(argv[i], "-h")) {
         _printUsage(argv[0]);
         exit(0);
      } else if(!strcmp(argv[i], "-stdin")) {
         gStdin = 1;
      } else {
         printf("Unknown argument: %s.  use -h for help\n", argv[i]);
         exit(0);
      }
   }

   if(gNumClusters < 0) {
      printf("Didn't supply the required '-clusters' argument\n");
      exit(0);
   }

   if(gDBSize <= 0) {
      printf("Didn't supply the required '-dbSize' argument\n");
      exit(0);
   }

   if(gMessageLevel >= 1) {
      printf("Stem: %s\n", gFileStem);
      printf("Source: %s\n", gSourceDirectory);

      if(gStdin) {
         printf("Reading examples from standard in.\n");
      }
      if(gDoTests) {
         printf("Running tests\n");
      }
      printf("DB Size %ld\n", gDBSize);
   }
}

static float _MatchCentersGetDistanceSquare(VoidAListPtr learnedCenters, VoidAListPtr testCenters) {
   /* perfom a 1 to 1 matching between the learned and test centers.
        change the class lables of the learnedCenters to correspond to
        the closest testCenters.  Do a greedy assignment that tries to
        minimize the total distance between assigned pairs */
   int i, j, init;
   double bestDistance, thisDistance;
   int lBestIndex, tBestIndex;
   VoidAListPtr unLearned = VALNew();
   VoidAListPtr unTest = VALNew();
   float totalDistance = 0.0;

   for(i = 0 ; i < VALLength(learnedCenters) ; i++) {
      VALAppend(unLearned, VALIndex(learnedCenters, i));
   }

   for(i = 0 ; i < VALLength(testCenters) ; i++) {
      VALAppend(unTest, VALIndex(testCenters, i));
   }

   bestDistance = lBestIndex = tBestIndex = 0;
   while(VALLength(unLearned) > 0) {
      init = 0;
      for(i = 0 ; i < VALLength(unLearned) ; i++) {
         for(j = 0 ; j < VALLength(unTest) ; j++) {
            thisDistance = ExampleDistance(VALIndex(unLearned, i),
                                           VALIndex(unTest, j));
            if(thisDistance < bestDistance || !init) {
               init = 1;
               bestDistance = thisDistance;
               lBestIndex = i;
               tBestIndex = j;
            }
         }
      }
      if(gMessageLevel > 2) {
         printf("\tMatched cluster %d distance %lf\n", ExampleGetClass(VALIndex(unTest, tBestIndex)), ExampleDistance(VALIndex(unLearned, lBestIndex), VALIndex(unTest, tBestIndex)));
      }

      totalDistance += bestDistance * bestDistance;
      ExampleSetClass(VALRemove(unLearned, lBestIndex),
               ExampleGetClass(VALRemove(unTest, tBestIndex)));
   }

   VALFree(unLearned);
   VALFree(unTest);

   return totalDistance;
}

static int _FindClosestCenterIndex(ExamplePtr e, VoidAListPtr centers) {
   int i;
   double bestDistance, thisDistance;
   int bestIndex;

   bestIndex = 0;
   bestDistance = ExampleDistance(e, VLIndex(centers, 0));

   for(i = 1 ; i < VLLength(centers) ; i++) {
      thisDistance = ExampleDistance(e, VLIndex(centers, i));
      if(thisDistance < bestDistance) {
         bestDistance = thisDistance;
         bestIndex = i;
      }
   }

   return bestIndex;
}

static ExamplePtr _FindClosestCenter(ExamplePtr e, VoidAListPtr centers) {
   return VALIndex(centers, _FindClosestCenterIndex(e, centers));
}

static float _CalculateIDKMEarlyBound(void) {
   int i, j, k;
   float maxError, thisError, bound, cError;
   IterationStatsPtr isL, isI;
   ExamplePtr eL, eI;

   isL = VALIndex(gStatsList, VALLength(gStatsList) - 1);

   maxError = 0;
   for(i = 0 ; i < VALLength(gStatsList) ; i++) {
      isI = VALIndex(gStatsList, i);

      if(isI->possibleIDConverge) {
         thisError = 0;
         for(j = 0 ; j < VALLength(isI->centroids) ; j++) {
            eL = VALIndex(isL->centroids, j);
            eI = VALIndex(isI->centroids, j);
            cError = 0;
            for(k = 0 ; k < ExampleGetNumAttributes(eL) ; k++) {
               /* HERE fix for discrete */
               bound = ExampleGetContinuousAttributeValue(eL, k) -
                          ExampleGetContinuousAttributeValue(eI, k) +
                           isI->errorBound[j][k];
               cError += pow(bound, 2);
            }
            thisError += cError;
            //printf("i%d c%d cError %f totalError %f\n",
            //         i, j, cError, thisError);
         }

         if(thisError > maxError) {
            maxError = thisError;
         }
      }
   }

   return maxError;
}

static float _CalculateErrorBound(void) {
   int i;
   float maxError = 0;
   float earlyError;
   IterationStatsPtr isL;

   isL = VALIndex(gStatsList, VALLength(gStatsList) - 1);

   /* find the error from the final iteration */
   for(i = 0 ; i < VALLength(isL->centroids) ; i++) {
      if(gMessageLevel > 2) {
         printf("max ekd %.4f max error %.4f error%d %.4f error%d^2 %.4f\n",
            isL->maxEkd, maxError, i, isL->lastBound[i], i, 
              pow(isL->lastBound[i], 2));
      }
      maxError += pow(isL->lastBound[i], 2);
   }

   if(!gAllowBadConverge) {
      earlyError = _CalculateIDKMEarlyBound();

      if(earlyError > maxError) {
         if(gMessageLevel > 1) {
            printf("early stop error (%.5f) greater than El (%.5f)\n", 
                 earlyError, maxError);
         }

         maxError = earlyError;
      }
   }

   return maxError;
}

static int _WhenKMConverged() {
   int i;
   IterationStatsPtr is;

   for(i = 0 ; i < VLLength(gStatsList) ; i++) {
      is = VLIndex(gStatsList, i);
      if(is->wouldKMConverge) {
         return i;
      }
   }

   return VLLength(gStatsList) - 1;
}

static void _doTests(ExampleSpecPtr es, VoidAListPtr learnedCenters, long learnCount, long learnTime, int finalOutput) {
   char fileNames[255];
   FILE *exampleIn, *testCentersIn, *centersOut;
   ExamplePtr e, tc, lc;
   VoidAListPtr testCenters;
   long i;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -