⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfem.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
#include "vfml.h"#include "vfem-engine.h"#include <stdio.h>#include <string.h>#include <sys/times.h>#include <time.h>#include <math.h>char *gFileStem        = "DF";char *gSourceDirectory = ".";int   gDoTests         = 0;int   gMessageLevel    = 0;int   gNumClusters     = -1;float gDelta           = 0.05;float gErrorTarget     = 0.01;float gThisErrorTarget = 0.01;int   gEstimatedNumIterations = 5;int   gBatch                  = 0;long  gMaxExamplesPerIteration = 1000000000;int   gInitNumber      = 1;int   gFancyStop              = 1;float gConvergeDelta          = 0.001;int   gSeed             = -1;float gR = 0;float gAssignErrorScale = 1.0;int   gAllowBadConverge = 0;float gSigmaSquare      = .05;int   gStdin            = 0;int   gIterativeResults = 0;int   gTestOnTrain      = 0; /* default test total center-matching distance */int   gNormalApprox     = 0;long         gDBSize                 = 0;/* HERE add parameters for these */int          gDoEarlyStop            = 0;int          gLoadCenters            = 0;int          gReassignCentersEachRound = 0;int          gUseGeoffBound          = 1;/* HERE some hackish globals & stuff*/VoidAListPtr gStatsList;int          gD;int          gIteration = 0;int          gRound     = 0;int          gTotalExamplesSeen = 0;float        gNeededDelta;float        gTargetEkd;long         gN;float        *gIterationNs = 0;int          gNumIterationNs;static void _printUsage(char  *name) {   printf("%s : Very Fast Expectation Maximization\n", name);   printf("-f <filestem>\t Set the name of the dataset (default DF)\n");   printf("-source <dir>\t Set the source data directory (default '.')\n");   printf("-clusters <numClusters>\t Set the num clusters to find (REQUIRED)\n");   printf("-variance <value>\t The variance of the Gaussians, the current\n\t\t version of EM needs this as a parameter (default 0.05)\n");   printf("-assignErrorScale <scale>\t Loosen bound by scaling the assignment\n\t\t part of it by the scale factor (default 1.0)\n");   printf("-delta <delta>\t Find final solution with confidence (1 - <delta>)\n\t\t (default 5%%)\n");   printf("-epsilon <epsilon>\t The maximum distance between a learned\n\t\t centroid and the coresponding infinite\n\t\t data centroid (default .01)\n");   printf("-converge <distance>\t Stop when distance centroids move between\n\t\t iterations squared is less than <distance>  (default .001)\n");   printf("-batch\t Do a traditional kmeans run on the whole input file.  Doesn't\n\t\t make sense to combine with -stdin (default off)\n");   printf("-init <num>\t use the <num>th valid assignment of initial centroids\n\t\t (default 1)\n");   printf("-maxPerIteration <num> \t use no more than num examples per\n\t\t iteration (default 1,000,000,000)\n");   printf("-l <number>\t The estimated # of iterations to converge if you\n\t\t guess wrong and aren't using batch VFEM will fix it for you\n\t\t and do an extra round (default 10)\n");   printf("-seed <s>\t Seed to use picking initial centers (default random)\n");   printf("-progressive\t Don't use the Lagrange based optimization\n\t\t to pick the # of samples at each iteration of the\n\t\t next round (default use the optimization)\n");   printf("-tightBound\t Use a tighter assignment error bound (requires\n\t\t a second pass over the dataset) (default use a\n\t\t looser one-pass bound)\n");   printf("-allowBadConverge\t Allows VFEM to converge at a time when\n\t\t em wouldn't (default off).\n");   printf("-r <range>\t The maximum range between pairs of examples\n\t\t (default assume the range of each dimension is 0 - 1.0\n\t\t and calculate it from that)\n");   printf("-stdin \t\t Reads training examples as a stream from stdin\n\t\t instead of from <stem>.data (default off)\n");   printf("-testOnTrain \t loss is the sum of the square of the distances\n\t\t from allu training examples to their assigned\n\t\t centroid (default compare learned centroids to\n\t\t centroids in <stem>.test)\n");   printf("-normalApprox \t use a normal approximation of the Hoeffding\n\t\t bound (default use hoeffding bound)\n");   printf("-loadCenters \t load initial centroids from <stem>.centers\n\t\t (default use random based on -init and -seed)\n");   printf("-u\t\t Output results after each iteration\n");   printf("-v\t\t Can be used multiple times to increase the debugging output\n");}static void _processArgs(int argc, char *argv[]) {   int i;   /* HERE on the ones that use the next arg make sure it is there */   for(i = 1 ; i < argc ; i++) {      if(!strcmp(argv[i], "-f")) {         gFileStem = argv[i+1];         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-source")) {         gSourceDirectory = argv[i+1];         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-u")) {         gDoTests = 1;      } else if(!strcmp(argv[i], "-testOnTrain")) {         gTestOnTrain = 1;      } else if(!strcmp(argv[i], "-clusters")) {         sscanf(argv[i+1], "%d", &gNumClusters);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-dbSize")) {         sscanf(argv[i+1], "%ld", &gDBSize);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-delta")) {         sscanf(argv[i+1], "%f", &gDelta);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-converge")) {         sscanf(argv[i+1], "%f", &gConvergeDelta);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-r")) {         sscanf(argv[i+1], "%f", &gR);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-assignErrorScale")) {         sscanf(argv[i+1], "%f", &gAssignErrorScale);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-variance")) {         sscanf(argv[i+1], "%f", &gSigmaSquare);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-epsilon")) {         sscanf(argv[i+1], "%f", &gErrorTarget);         gThisErrorTarget = gErrorTarget;         /* ignore the next argument */         i++;//      } else if(!strcmp(argv[i], "-n")) {//         sscanf(argv[i+1], "%ld", &gN);//         /* ignore the next argument *///         i++;      } else if(!strcmp(argv[i], "-maxPerIteration")) {         sscanf(argv[i+1], "%ld", &gMaxExamplesPerIteration);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-batch")) {         gBatch = 1;      } else if(!strcmp(argv[i], "-allowBadConverge")) {         gAllowBadConverge = 1;      } else if(!strcmp(argv[i], "-progressive")) {         gFancyStop = 0;      } else if(!strcmp(argv[i], "-tightBound")) {         gUseGeoffBound = 0;      } else if(!strcmp(argv[i], "-init")) {         sscanf(argv[i+1], "%d", &gInitNumber);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-l")) {         sscanf(argv[i+1], "%d", &gEstimatedNumIterations);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-seed")) {         sscanf(argv[i+1], "%d", &gSeed);         /* ignore the next argument */         i++;      } else if(!strcmp(argv[i], "-normalApprox")) {         gNormalApprox = 1;      } else if(!strcmp(argv[i], "-loadCenters")) {         gLoadCenters = 1;      } else if(!strcmp(argv[i], "-v")) {         gMessageLevel++;      } else if(!strcmp(argv[i], "-h")) {         _printUsage(argv[0]);         exit(0);      } else if(!strcmp(argv[i], "-stdin")) {         gStdin = 1;      } else {         printf("Unknown argument: %s.  use -h for help\n", argv[i]);         exit(0);      }   }   if(gNumClusters < 0) {      printf("Didn't supply the required '-clusters' argument\n");      exit(0);   }   if(gDBSize <= 0) {      printf("Didn't supply the required '-dbSize' argument\n");      exit(0);   }   if(gMessageLevel >= 1) {      printf("Stem: %s\n", gFileStem);      printf("Source: %s\n", gSourceDirectory);      if(gStdin) {         printf("Reading examples from standard in.\n");      }      if(gDoTests) {         printf("Running tests\n");      }      printf("DB Size %ld\n", gDBSize);   }}static float _MatchCentersGetDistanceSquare(VoidAListPtr learnedCenters, VoidAListPtr testCenters) {   /* perfom a 1 to 1 matching between the learned and test centers.        change the class lables of the learnedCenters to correspond to        the closest testCenters.  Do a greedy assignment that tries to        minimize the total distance between assigned pairs */   int i, j, init;   double bestDistance, thisDistance;   int lBestIndex, tBestIndex;   VoidAListPtr unLearned = VALNew();   VoidAListPtr unTest = VALNew();   float totalDistance = 0.0;   for(i = 0 ; i < VALLength(learnedCenters) ; i++) {      VALAppend(unLearned, VALIndex(learnedCenters, i));   }   for(i = 0 ; i < VALLength(testCenters) ; i++) {      VALAppend(unTest, VALIndex(testCenters, i));   }   bestDistance = lBestIndex = tBestIndex = 0;   while(VALLength(unLearned) > 0) {      init = 0;      for(i = 0 ; i < VALLength(unLearned) ; i++) {         for(j = 0 ; j < VALLength(unTest) ; j++) {            thisDistance = ExampleDistance(VALIndex(unLearned, i),                                           VALIndex(unTest, j));            if(thisDistance < bestDistance || !init) {               init = 1;               bestDistance = thisDistance;               lBestIndex = i;               tBestIndex = j;            }         }      }      if(gMessageLevel > 0) {         printf("\tMatched cluster %d distance %lf\n", ExampleGetClass(VALIndex(unTest, tBestIndex)), ExampleDistance(VALIndex(unLearned, lBestIndex), VALIndex(unTest, tBestIndex)));      }      totalDistance += bestDistance * bestDistance;      ExampleSetClass(VALRemove(unLearned, lBestIndex),               ExampleGetClass(VALRemove(unTest, tBestIndex)));   }   VALFree(unLearned);   VALFree(unTest);   return totalDistance;}static int _FindClosestCenterIndex(ExamplePtr e, VoidAListPtr centers) {   int i;   double bestDistance, thisDistance;   int bestIndex;   bestIndex = 0;   bestDistance = ExampleDistance(e, VLIndex(centers, 0));   for(i = 1 ; i < VLLength(centers) ; i++) {      thisDistance = ExampleDistance(e, VLIndex(centers, i));      if(thisDistance < bestDistance) {         bestDistance = thisDistance;         bestIndex = i;      }   }   return bestIndex;}static ExamplePtr _FindClosestCenter(ExamplePtr e, VoidAListPtr centers) {   return VALIndex(centers, _FindClosestCenterIndex(e, centers));}static float _CalculateIDKMEarlyBound(void) {   int i, j, k;   float maxError, thisError, bound, cError;   IterationStatsPtr isL, isI;   ExamplePtr eL, eI;   isL = VALIndex(gStatsList, VALLength(gStatsList) - 1);   maxError = 0;   for(i = 0 ; i < VALLength(gStatsList) ; i++) {      isI = VALIndex(gStatsList, i);      if(isI->possibleIDConverge) {         thisError = 0;         for(j = 0 ; j < VALLength(isI->centroids) ; j++) {            eL = VALIndex(isL->centroids, j);            eI = VALIndex(isI->centroids, j);            cError = 0;            for(k = 0 ; k < ExampleGetNumAttributes(eL) ; k++) {               /* HERE fix for discrete */               bound = ExampleGetContinuousAttributeValue(eL, k) -                          ExampleGetContinuousAttributeValue(eI, k) +                          IterationStatsErrorBoundDimension(isI, j, k);               cError += pow(bound, 2);            }            thisError += cError;            //printf("i%d c%d cError %f totalError %f\n",            //         i, j, cError, thisError);         }         if(thisError > maxError) {            maxError = thisError;         }      }   }   return maxError;}static float _CalculateErrorBound(void) {   int i;   float maxError = 0;   float earlyError;   IterationStatsPtr isL;   isL = VALIndex(gStatsList, VALLength(gStatsList) - 1);   /* find the error from the final iteration */   for(i = 0 ; i < VALLength(isL->centroids) ; i++) {      maxError += pow(isL->lastBound[i], 2);      if(gMessageLevel > 2) {         printf("max ekd %.4f sum bnd^2 %.4f bnd%d %.4f bnd^2%d %.4f\n",            isL->maxEkd, maxError, i, isL->lastBound[i], i,               pow(isL->lastBound[i], 2));      }   }   if(!gAllowBadConverge) {      earlyError = _CalculateIDKMEarlyBound();      if(earlyError > maxError) {         if(gMessageLevel > 1) {            printf("early stop error (%.5f) greater than El (%.5f)\n",                  earlyError, maxError);         }         maxError = earlyError;      }   }   return maxError;}static int _WhenEMConverged(void) {   int i;   IterationStatsPtr is;   for(i = 0 ; i < VLLength(gStatsList) ; i++) {      is = VLIndex(gStatsList, i);      if(is->wouldEMConverge) {         return i;      }   }   return VLLength(gStatsList) - 1;}static void _doTests(ExampleSpecPtr es, VoidAListPtr learnedCenters, long learnCount, long learnTime, int finalOutput) {   char fileNames[255];   FILE *exampleIn, *testCentersIn, *centersOut;   ExamplePtr e, tc, lc;   VoidAListPtr testCenters;   long i;   float loss;   float bound;   int foundBound, guarenteeIDConverge, lastFoundBound;   long tested = 0;   foundBound = 0;   if(VALLength(gStatsList) > 0) {      lastFoundBound = ((IterationStatsPtr)VALIndex(gStatsList,                VALLength(gStatsList) - 1))->foundBound;      guarenteeIDConverge = ((IterationStatsPtr)VALIndex(gStatsList,                     VALLength(gStatsList) - 1))->guarenteeIDConverge;      if(lastFoundBound) {         if(guarenteeIDConverge) {            foundBound = 1;            if(gMessageLevel >= 1) {               printf("Have a bound and ID would converge\n");            }         } else if(((IterationStatsPtr)VALIndex(gStatsList,                        VALLength(gStatsList) - 1))->wouldEMConverge &&                    gAllowBadConverge) {            if(gMessageLevel >= 1) {               printf("Have a bound and ID may or may not converge, we converge.\n");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -