⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfkm.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
   while(!done) {
      e = ExampleRead(data, es);
      DebugError(e == 0, "Unable to get enough unique initial centroids");

      /* make sure this isn't too close to one in the list */
      used = 0;
      for(i = 0 ; i < VALLength(centroids) && !used ; i++) {
         if(ExampleDistance(e, VALIndex(centroids, i)) <= minDistance) {
            used = 1;
         }
      }

      /* if it's ok then use it */
     if(!used) {
         done = 1;
      } else {
         ExampleFree(e);
      }
   }
   return e;
}

static void _PickInitialCentroids(ExampleSpecPtr es, VoidAListPtr centroids,
                                       FILE *data) {
   /* pick the first unique 'gNumClusters' points from the dataset to
            be centroids  HERE should I add some randomness to this? */
   int j;
   ExamplePtr e;
   char fileNames[255];
   FILE *centersIn;

   /* burn some examples to make the selection be more random */
   for(j = RandomRange(0, 10 * gNumClusters) ; j > 0 ; j--) {
      ExampleFree(ExampleRead(data, es));
   }

   /* if instructed, read in the initial centroids */
   if(gLoadCenters) {
      sprintf(fileNames, "%s.centers", gFileStem);
      centersIn = fopen(fileNames, "r");

      if(centersIn) {
         if(gMessageLevel > 0) {
            printf("Loading inital centers from %s\n", fileNames);
         }

         e = ExampleRead(centersIn, es);
         while(e != 0) {
            VALAppend(centroids, e);
            e = ExampleRead(centersIn, es);

         }
         fclose(centersIn);
      }
   }

   while(VALLength(centroids) < gNumClusters) {
      VALAppend(centroids, _PickInitalCentroid(es, centroids, data));
   }
}

//static void _OutputGoodCentroids(float bound) {
//   int i;
//   char fileNames[255];
//   FILE *centersOut;
//   IterationStatsPtr thisIs, firstIs;
//   float totalError, errorThreshold;

   /* write out the centroids, if we didn't get a bound only write out the
        good ones, where lastBound < (100 / gNumClusers)% of the total
         lastBound error */

//   thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);
//   firstIs = VALIndex(gStatsList, 0);

//   sprintf(fileNames, "%s.centers", gFileStem);
//   centersOut = fopen(fileNames, "w");

//   totalError = 0;
//   for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {
//      totalError += thisIs->lastBound[i];
//   }

//   errorThreshold = totalError / (float)gNumClusters;
   //IterationStatsWrite(thisIs, stdout);

//   if(gMessageLevel > 0) {
//      printf("output good centers, thresh: %f\n", errorThreshold);
//   }

//   for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {
      /* if we finished with a bound or if the centroid's error was ok */
//      if((thisIs->guarenteeIDConverge && thisIs->foundBound &&
//                                  bound <= gThisErrorTarget) || 
//           (thisIs->lastBound[i] <  errorThreshold)) {
//         ExampleWrite(VALIndex(firstIs->centroids, i), centersOut);
//      }
//   }
//   fclose(centersOut);
//}

static void _OutputAllCentroids(float bound) {
   int i;
   char fileNames[255];
   FILE *centersOut;
   IterationStatsPtr thisIs;

   thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);

   sprintf(fileNames, "%s.centers", gFileStem);
   centersOut = fopen(fileNames, "w");

   for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {
      ExampleWrite(VALIndex(thisIs->centroids, i), centersOut);
   }
   fclose(centersOut);
}

static IterationStatsPtr _FindLastIsWithBound(void) {
   int i;
   IterationStatsPtr is;

   for(i = VALLength(gStatsList) - 1 ; i >= 0 ; i--) {
      is = VALIndex(gStatsList, i);
      if(is->foundBound) {
         return is;
      }
   }

   DebugWarn(1, "_FindLastIsWithBound didn't find a bound\n");
   return 0;
}

static float _FindMedian(float *array, int len) {
   float *errorArray = MNewPtr(sizeof(float) * len);
   float tmp, median;
   int i, j;

   /* create the sorted error array which we'll need to find the median */
   for(i = 0 ; i < len ; i++) {
      errorArray[i] = array[i];
   }
   for(i = 0 ; i < len ; i++) {
      for(j = 0 ; j < len - (i + 1) ; j++) {
         if(errorArray[j] > errorArray[j + 1]) {
            tmp = errorArray[j + 1];
            errorArray[j + 1] = errorArray[j];
            errorArray[j] = tmp;
         }
      }
   }

   if(len % 2 == 0) {
      i = (len / 2) - 1;
      median = (errorArray[i] + errorArray[i + 1]) / 2.0;
   } else {
      i = ((len + 1) / 2) - 1;
      median = errorArray[i];
   }
   MFreePtr(errorArray);

   return median;
}

VoidAListPtr _GetCentroidsForNextRound(FILE *data, ExampleSpecPtr es) {
   IterationStatsPtr is = _FindLastIsWithBound();
//   IterationStatsPtr is = VALIndex(gStatsList, VALLength(gStatsList) - 1);
   float median = _FindMedian(is->lastBound, VALLength(is->centroids));
   IterationStatsPtr iIs = VALIndex(gStatsList, 0);
   VoidAListPtr newCentroids = VALNew();
   int i;


   //IterationStatsWrite(is, stdout);

   for(i = 0 ; i < VALLength(is->centroids) ; i++) {
      if(is->lastBound[i] <= median * 5) {
         VALAppend(newCentroids, ExampleClone(VALIndex(iIs->centroids, i)));
         //VALAppend(newCentroids, ExampleClone(VALIndex(is->centroids, i)));
      } else if(gMessageLevel > 1) {
         printf("   Reassigning centroid %d bound %f median %f\n", 
                       i, is->lastBound[i], median);
         fflush(stdout);
      }
   }

   while(VALLength(newCentroids) < VALLength(is->centroids)) {
      VALAppend(newCentroids, _PickInitalCentroid(es, newCentroids, data));
   }

   return newCentroids;
}

/* this should be static */
void CalculateExamplesPerIteration(VoidAListPtr last, float **nextNiOut, int *num);

int main(int argc, char *argv[]) {
   char fileNames[255];

   FILE *exampleIn;
   ExampleSpecPtr es;
   ExamplePtr e;
   VoidListPtr centers, newCenters = 0;
   float iterationNSum;

   long learnTime;
   int i;
   int fileDone;
   long nIncrement;
   float lastDelta, bound;

   struct tms starttime;
   struct tms endtime;

   IterationStatsPtr thisIs, lastIs;
   int breakOut;

   _processArgs(argc, argv);


   if(gStdin) {
      /* This is a hack because when I pipe clusterdata to vfkm
           vfkm tries to read the spec before clusterdata can write it */
      sleep(5);
   }

   sprintf(fileNames, "%s/%s.names", gSourceDirectory, gFileStem);
   es = ExampleSpecRead(fileNames);
   DebugError(es == 0, "Unable to open the .names file");

   
   RandomInit();
   /* seed for the concept */
   if(gSeed != -1) {
      RandomSeed(gSeed);
   } else {
      gSeed = RandomRange(1, 30000);
      RandomSeed(gSeed);
   }
   if(gMessageLevel > 0) {
      printf("running with seed %d\n", gSeed);
   }

   /* initialize some globals */
   gStatsList = VALNew();
   gD = ExampleSpecGetNumAttributes(es);

   if(gR == 0) {
      gR = sqrt(gD);
   }

   if(!gAllowBadConverge) {
      /* use a tighter bound so we get a better convergence behavior */
      gThisErrorTarget = min(gErrorTarget, gConvergeDelta / 3.0);
   } else {
      gThisErrorTarget = min(gErrorTarget, gConvergeDelta);
   }
   gNeededDelta = 1.0 - pow(1.0 - gDelta, 1.0 /
                 (float)(gD * gNumClusters * gEstimatedNumIterations));
   gTargetEkd = sqrt(gThisErrorTarget / ((float)gNumClusters * (float)gD));

   gN = (gNumClusters / 2.0) * pow(1.0/gTargetEkd, 2) * 
                          log(2.0/gNeededDelta) * 1.1;

   //if(gUseNormalApprox) {
   //   /* hueristic to correct for normal being tighter */
   //   gN /= 5;
   //}

   if(gMessageLevel > 1) {
      printf("Target Ekd: %.3lf\n", gTargetEkd);
   }

   if(gStdin) {
      exampleIn = stdin;
   } else {
      /* Pick the initial centroids from the dataset */

      /* in stream mode we never close & reopen the file */
      sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
      exampleIn = fopen(fileNames, "r");
      DebugError(exampleIn == 0, "Unable to open the .data file");
   }

   centers = VALNew();
   /* try get the gInitNumber-ith batch of centroids */
   for(i = 0 ; i < gInitNumber ; i++) {
      while(VALLength(centers) > 0) {
         ExampleFree(VALRemove(centers, VALLength(centers) - 1));
      }
      _PickInitialCentroids(es, centers, exampleIn);
   }

   if(!gStdin) {
      /* in stream mode we never close & reopen the file */
      fclose(exampleIn);
   }

   /* create the initial stats structure from the initial centroids */
   VALAppend(gStatsList, IterationStatsInitial(centers));

   if(gMessageLevel >= 1) {
      printf("allocation %ld\n", MGetTotalAllocation());
   }
   learnTime = 0;

   gIteration = 0;
   gRound = 0;

   times(&starttime);
   do {
      gRound++;

      /* if we aren't on the first round free the cruft from the prvious one */
      if(gRound > 1) {

         if(gReassignCentersEachRound) {
            if(!gStdin) {
               /* in stream mode we never close & reopen the file */
               sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
               exampleIn = fopen(fileNames, "r");
               DebugError(exampleIn == 0, "Unable to open the .data file");
            }

            /* burn some examples to make the selection be more random */
            for(i = RandomRange(0, 10 * gNumClusters) ; i > 0 ; i--) {
               ExampleFree(ExampleRead(exampleIn, es));
            }

            /* HERE MEM this newCenters may leak memory */
            newCenters = _GetCentroidsForNextRound(exampleIn, es);
         }

         for(i = VALLength(gStatsList) - 1 ; i >= 0 ; i--) {
            IterationStatsFree(VALRemove(gStatsList, i));
         }
         if(gReassignCentersEachRound) {
            VALAppend(gStatsList, IterationStatsInitial(newCenters));
         } else {
            /* use the inital centroids again */
            VALAppend(gStatsList, IterationStatsInitial(centers));
         }
      }

      gIteration = 1;

      if(!gStdin) {
         /* in stream mode we never close & reopen the file */
         sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the .data file");
      }

      if(gMessageLevel > 0) {
         printf("========================\n");
         printf("round %d n %ld\n", gRound, gN);
      }

      while(!_DoClusterIterationDidConverge(exampleIn, es)) {
         times(&endtime);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -