⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfem.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
            printf("  c#%d: ", i);            ExampleWrite(VALIndex(is->centroids, i), stdout);         }          fflush(stdout);      }      /* do the Ws */      denominator = 0;      for(i = 0 ; i < VALLength(is->centroids) ; i++) {         centroid = VLIndex(is->centroids, i);         denominator += exp( (-1.0 / (2.0 * gSigmaSquare)) *            pow(ExampleDistance(e, centroid), 2));      }      for(i = 0 ; i < VALLength(is->centroids) ; i++) {         centroid = VLIndex(is->centroids, i);         numerator = exp( (-1.0 / (2.0 * gSigmaSquare)) *            pow(ExampleDistance(e, centroid), 2));//if(i == 4) { printf("Denom: %.3f Numer: %.3f w: %.3f distance: %.3f, true: %d\n",//        denominator, numerator, (numerator / denominator),//           ExampleDistance(e, centroid), ExampleGetClass(e)); }                 is->w[i] += (numerator / denominator);         for(j = 0 ; j < ExampleSpecGetNumAttributes(es) ; j++) {            is->wx[i][j] += (numerator / denominator) *                  ExampleGetContinuousAttributeValue(e, j);         }      }      if(gUseGeoffBound) {         _RecordGeoffBoundInfo(e, is, es);      }      if(gMessageLevel > 3) {         IterationStatsWrite(is, es, stdout);         fflush(stdout);      }      ExampleFree(e);      /* check to see if we should move to the next iteration */      if(!gBatch && gFancyStop && gIteration <= gNumIterationNs) {         /* Test to see if this iteration is done */         if(seen >= gIterationNs[gIteration - 1]) {            done = 1;         }      } else if(!gBatch && seen >= gN) {         done = 1;      } else if(seen > gMaxExamplesPerIteration) {         done = 1;      }      if(!done) {         /* if we didn't get stopped by the termination check get another */         e = ExampleRead(data, es);      }   }      if(gMessageLevel > 1) {      printf("Finished an iteration, n is %ld.\n", is->n);      IterationStatsWrite(is, es, stdout);   }   newIs = IterationStatsNext(is, gNeededDelta, 1.0,                 gAssignErrorScale, es, !gUseGeoffBound, boundData);   VALAppend(gStatsList, newIs);   if(gMessageLevel > 1) {      printf("exit iteration %d seen %d\n", gIteration, gTotalExamplesSeen);      fflush(stdout);   }      if(newIs) {      return _CheckConverganceUpdateStats(is, newIs);   } else {      /* we didn't converge, but this round will be stoped by           the foundBound of 0 */      return 0;   }}static ExamplePtr _PickInitalCentroid(ExampleSpecPtr es, VoidAListPtr centroids, FILE *data) {   float minDistance;   int done  = 0;   ExamplePtr e;   int used;   int i;   minDistance = gR / ((float)gNumClusters * 4);   //minDistance = gR / ((float)gNumClusters * 2);   while(!done) {      e = ExampleRead(data, es);      DebugError(e == 0, "Unable to get enough unique initial centroids");      /* make sure this isn't too close to one in the list */      used = 0;      for(i = 0 ; i < VALLength(centroids) && !used ; i++) {         /* HERE make a parameter? */         if(ExampleDistance(e, VALIndex(centroids, i)) <= minDistance) {            used = 1;         }      }      /* if it's ok then use it */     if(!used) {         done = 1;      } else {         ExampleFree(e);      }   }   return e;}static void _PickInitialCentroids(ExampleSpecPtr es, VoidAListPtr centroids,                                       FILE *data) {   /* pick the first unique 'gNumClusters' points from the dataset to            be centroids  HERE should I add some randomness to this? */   int j;   ExamplePtr e;   char fileNames[255];   FILE *centersIn;   /* burn some examples to make the selection be more random */   for(j = RandomRange(0, 10 * gNumClusters) ; j > 0 ; j--) {      ExampleFree(ExampleRead(data, es));   }   /* if instructed, read in the initial centroids */   if(gLoadCenters) {      sprintf(fileNames, "%s.centers", gFileStem);      centersIn = fopen(fileNames, "r");      if(centersIn) {         if(gMessageLevel > 0) {            printf("Loading inital centers from %s\n", fileNames);         }         e = ExampleRead(centersIn, es);         while(e != 0) {            VALAppend(centroids, e);            e = ExampleRead(centersIn, es);         }         fclose(centersIn);      }   }   while(VALLength(centroids) < gNumClusters) {      VALAppend(centroids, _PickInitalCentroid(es, centroids, data));   }   if(gMessageLevel > 0) {      printf("loss for initial centroids:\n");      _doTests(es, centroids, 0, 0, 0);   }}//static void _OutputGoodCentroids(float bound) {//   int i;//   char fileNames[255];//   FILE *centersOut;//   IterationStatsPtr thisIs, firstIs;//   float totalError, errorThreshold;   /* write out the centroids, if we didn't get a bound only write out the        good ones, where lastBound < (100 / gNumClusers)% of the total         lastBound error *///   thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);//   firstIs = VALIndex(gStatsList, 0);//   sprintf(fileNames, "%s.centers", gFileStem);//   centersOut = fopen(fileNames, "w");//   totalError = 0;//   for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {//      totalError += thisIs->lastBound[i];//   }//   errorThreshold = totalError / (float)gNumClusters;   //IterationStatsWrite(thisIs, stdout);//   if(gMessageLevel > 0) {//      printf("output good centers, thresh: %f\n", errorThreshold);//   }//   for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {//      /* if we finished with a bound or if the centroid's error was ok *///      if((thisIs->guarenteeIDConverge && thisIs->foundBound &&//                                  bound <= gThisErrorTarget) || //           (thisIs->lastBound[i] <  errorThreshold)) {//         ExampleWrite(VALIndex(firstIs->centroids, i), centersOut);//      }//   }//   fclose(centersOut);//}static void _OutputAllCentroids(float bound) {   int i;   char fileNames[255];   FILE *centersOut;   IterationStatsPtr thisIs;   thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);   sprintf(fileNames, "%s.centers", gFileStem);   centersOut = fopen(fileNames, "w");   for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {      ExampleWrite(VALIndex(thisIs->centroids, i), centersOut);   }   fclose(centersOut);}static void _OutputCentroidMovement(void) {   IterationStatsPtr firstIs, lastIs;   int i;   float total, current, totalSquare;   firstIs = VALIndex(gStatsList, 0);   lastIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);   total = 0;   totalSquare = 0;   for(i = 0 ; i < VALLength(firstIs->centroids) ; i++) {      current = ExampleDistance(VALIndex(firstIs->centroids, i),                             VALIndex(lastIs->centroids, i));      total += current;      totalSquare += pow(current, 2);      printf(" centroid %d moved: %f move^2: %f\n", i, current, pow(current, 2));   }   printf("   total: %f total^2: %f\n", total, totalSquare);}static IterationStatsPtr _FindLastIsWithBound(void) {   int i;   IterationStatsPtr is;   for(i = VALLength(gStatsList) - 1 ; i >= 0 ; i--) {      is = VALIndex(gStatsList, i);      if(is->foundBound) {         return is;      }   }   DebugWarn(1, "_FindLastIsWithBound didn't find a bound\n");   return 0;}static float _FindMedian(float *array, int len) {   float *errorArray = MNewPtr(sizeof(float) * len);   float tmp, median;   int i, j;   /* create the sorted error array which we'll need to find the median */   for(i = 0 ; i < len ; i++) {      errorArray[i] = array[i];   }   for(i = 0 ; i < len ; i++) {      for(j = 0 ; j < len - (i + 1) ; j++) {         if(errorArray[j] > errorArray[j + 1]) {            tmp = errorArray[j + 1];            errorArray[j + 1] = errorArray[j];            errorArray[j] = tmp;         }      }   }   if(len % 2 == 0) {      i = (len / 2) - 1;      median = (errorArray[i] + errorArray[i + 1]) / 2.0;   } else {      i = ((len + 1) / 2) - 1;      median = errorArray[i];   }   MFreePtr(errorArray);   return median;}VoidAListPtr _GetCentroidsForNextRound(FILE *data, ExampleSpecPtr es) {   IterationStatsPtr is = _FindLastIsWithBound();//   IterationStatsPtr is = VALIndex(gStatsList, VALLength(gStatsList) - 1);   float median = _FindMedian(is->lastBound, VALLength(is->centroids));   IterationStatsPtr iIs = VALIndex(gStatsList, 0);   VoidAListPtr newCentroids = VALNew();   int i;   //IterationStatsWrite(is, stdout);   for(i = 0 ; i < VALLength(is->centroids) ; i++) {      if(is->lastBound[i] <= median * 5) {         VALAppend(newCentroids, ExampleClone(VALIndex(iIs->centroids, i)));         //VALAppend(newCentroids, ExampleClone(VALIndex(is->centroids, i)));      } else if(gMessageLevel > 1) {         printf("   Reassigning centroid %d bound %f median %f\n",                        i, is->lastBound[i], median);         fflush(stdout);      }   }   while(VALLength(newCentroids) < VALLength(is->centroids)) {      VALAppend(newCentroids, _PickInitalCentroid(es, newCentroids, data));   }   return newCentroids;}/* this should be static */void CalculateExamplesPerIteration(VoidAListPtr last, float **nextNiOut, int *num);int main(int argc, char *argv[]) {   char fileNames[255];   FILE *exampleIn = 0, *boundDataIn = 0;   ExampleSpecPtr es;   ExamplePtr e;   VoidListPtr centers, newCenters = 0;   float iterationNSum;   long learnTime;   int i;   int breakOut;   int fileDone;   long nIncrement;   float lastDelta, bound;   struct tms starttime;   struct tms endtime;   IterationStatsPtr thisIs, lastIs;   _processArgs(argc, argv);   if(gStdin) {      /* This is a hack because when I pipe clusterdata to vfem           vfem tries to read the spec before clusterdata can write it */      sleep(5);   }   sprintf(fileNames, "%s/%s.names", gSourceDirectory, gFileStem);   es = ExampleSpecRead(fileNames);   DebugError(es == 0, "Unable to open the .names file");      RandomInit();   /* seed for the concept */   if(gSeed != -1) {      RandomSeed(gSeed);   } else {      gSeed = RandomRange(1, 30000);      RandomSeed(gSeed);   }   if(gMessageLevel > 0) {      printf("running with seed %d\n", gSeed);   }   /* initialize some globals */   gStatsList = VALNew();   gD = ExampleSpecGetNumAttributes(es);   if(gR == 0) {      gR = sqrt(gD);   }   if(!gAllowBadConverge) {      /* use a tighter bound so we get a better convergence behavior */      gThisErrorTarget = min(gErrorTarget, gConvergeDelta / 3.0);   } else {      /* HERE this is a hack for now, take it out!! */      gThisErrorTarget = gErrorTarget;      //gThisErrorTarget = min(gErrorTarget, gConvergeDelta);   }   gNeededDelta = 1.0 - pow(1.0 - gDelta, 1.0 /                 (float)(gD * gNumClusters * gEstimatedNumIterations));   gTargetEkd = sqrt(gThisErrorTarget / ((float)gNumClusters * (float)gD));   /* HERE fix this for the new bound */   gN = (gNumClusters / 2.0) * pow(1.0/gTargetEkd, 2) *                           log(2.0/gNeededDelta) * 1.1;   if(gMessageLevel > 1) {      printf("Target Ekd: %.3lf\n", gTargetEkd);   }   if(gStdin) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -