⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfkm.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
   float loss;
   float bound;
   int foundBound;
   long tested = 0;


   foundBound = 0;
   if(((IterationStatsPtr)VALIndex(gStatsList,
                     VALLength(gStatsList) - 1))->foundBound) {
      if(((IterationStatsPtr)VALIndex(gStatsList,
                     VALLength(gStatsList) - 1))->guarenteeIDConverge) {
         foundBound = 1;
      } else if(((IterationStatsPtr)VALIndex(gStatsList,
                     VALLength(gStatsList) - 1))->wouldKMConverge &&
                 gAllowBadConverge) {

         if(gMessageLevel >= 1) {
            printf("found a bad converge bound.\n");
         }
         foundBound = 1;
      }
   }

   if(foundBound) {
      bound = _CalculateErrorBound();
   } else {
      bound = -1;
   }

   if(!gTestOnTrain) {
      /* just output the distance between matched centers */
      /* load the test centers */
      testCenters = VALNew();

      sprintf(fileNames, "%s/%s.test", gSourceDirectory, gFileStem);
      testCentersIn = fopen(fileNames, "r");
      DebugError(testCentersIn == 0, "Unable to open the .test file");
      
      if(gMessageLevel >= 1) {
         printf("reading the test centers file...\n");
      }

      tc = ExampleRead(testCentersIn, es);
      while(tc != 0) {
         VALAppend(testCenters, tc);
         tc = ExampleRead(testCentersIn, es);
      }
      fclose(testCentersIn);
   
      /* Match learned centers with the test centers */
      loss = _MatchCentersGetDistanceSquare(learnedCenters, testCenters);

      /* free the test centers */
      for(i = 0 ; i < VALLength(testCenters) ; i++) {
         ExampleFree(VALIndex(testCenters, i));
      }
      VALFree(testCenters);
   } else { /* Sum Square distance of example to assigned cluster */
      loss = 0;

      sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
      exampleIn = fopen(fileNames, "r");
      DebugError(exampleIn == 0, "Unable to open the .data file");
      
      if(gMessageLevel >= 1) {
         printf("opened test file, starting scan...\n");
      }

      e = ExampleRead(exampleIn, es);
      while(e != 0 && tested < gMaxExamplesPerIteration) {
         tested++;
         lc = _FindClosestCenter(e, learnedCenters);
         loss += pow(ExampleDistance(e, lc), 2);
         ExampleFree(e);
         e = ExampleRead(exampleIn, es);
      }
      if(e != 0) {
         ExampleFree(e);
      }
      fclose(exampleIn);
   }


   if(finalOutput) {
      printf("%.4f\t0\n", loss);
   } else {
      if(foundBound) {
         printf("%d\t%ld\t%d\t%.6f\t%.6f\t%.2lf\n",
                gRound, learnCount, gTotalExamplesSeen,
                bound, loss, ((double)learnTime) / 100);
      } else {
         if(gMessageLevel > 1) {
             printf("   No bound, Current bound estimate is %f guarenteed converge %d\n",
                       _CalculateErrorBound(),
                       ((IterationStatsPtr)VALIndex(gStatsList,
                            VALLength(gStatsList) - 1))->guarenteeIDConverge);
         }
         printf("%d\t%ld\t%d\t***\t%.6f\t%.2lf\n",
                gRound, learnCount, gTotalExamplesSeen,
                loss, ((double)learnTime) / 100);
      }
   }
   fflush(stdout); 


   if(0) {//gOutputCenters) {
      sprintf(fileNames, "%s-%lu.centers", gFileStem, learnCount);
      centersOut = fopen(fileNames, "w");

      for(i = 0 ; i < VALLength(learnedCenters) ; i++) {
         ExampleWrite(VALIndex(learnedCenters, i), centersOut);
//         ExampleWrite(VALIndex(learnedCenters, i), stdout);
      }
//      printf("------------------\n");
      fclose(centersOut);
   }

}

/*
 * Measure how far each centroid moved between two consecutive iterations
 * and record on `current` which convergence criteria were satisfied.
 *
 * For every centroid index of `last` and every attribute of the matching
 * centroid in `current`, three sums of squares are accumulated:
 *   - the observed movement |last - current|,
 *   - a lower bound using max(movement - error, 0),
 *   - an upper bound using (movement + error),
 * where error is the sum of the two iterations' errorBound entries for
 * that centroid/attribute.
 *
 * Flags written on `current` (each is only ever set to 1 here):
 *   convergeVFKM         observed sum <= gConvergeDelta / 3.0
 *   possibleIDConverge   lower bound  <= gConvergeDelta
 *   wouldKMConverge      lower bound and observed sum <= gConvergeDelta
 *   guarenteeIDConverge  lower bound and upper bound  <= gConvergeDelta
 *
 * Returns current->wouldKMConverge when gBatch or gAllowBadConverge is
 * set; otherwise returns non-zero when guarenteeIDConverge is set or when
 * both `current` and `last` have convergeVFKM set.
 *
 * Progress messages are printed to stdout depending on gMessageLevel.
 */
static int _CheckConverganceUpdateStats(IterationStatsPtr last, 
                                 IterationStatsPtr current) {
   ExamplePtr prevCenter, currCenter;
   float moved;
   float slack;
   float clusterMoved;
   float totalMoved = 0;
   float minMoved = 0;
   float maxMoved = 0;
   int c, a;

   for(c = 0 ; c < VALLength(last->centroids) ; c++) {
      prevCenter = VALIndex(last->centroids, c);
      currCenter = VALIndex(current->centroids, c);
      clusterMoved = 0;

      for(a = 0 ; a < ExampleGetNumAttributes(currCenter) ; a++) {
         /* Every attribute is read as continuous; discrete attributes
            are not handled yet (original note: "HERE fix for discrete"). */
         moved = ExampleGetContinuousAttributeValue(prevCenter, a) -
                    ExampleGetContinuousAttributeValue(currCenter, a);
         if(moved < 0) {
            moved = -moved;
         }

         /* combined uncertainty of both iterations for this coordinate */
         slack = last->errorBound[c][a] + current->errorBound[c][a];

         totalMoved += pow(moved, 2);
         clusterMoved += pow(moved, 2);
         minMoved += pow(max(moved - slack, 0), 2);
         maxMoved += pow(moved + slack, 2);

         if(gMessageLevel > 2) {
            printf("e: %f bnd %f cbnd %f lbnd %f ubnd %f\n", 
                slack, totalMoved, clusterMoved, minMoved, maxMoved);
         }
      }

      if(gMessageLevel > 0) {
         printf("   cluster %d moved by %f\n", c, clusterMoved);
      }
   }

   if(gMessageLevel > 0) {
      printf("   clusters moved [ %f - %f - %f ]\n",
                              minMoved, totalMoved, maxMoved);
   }

   /* stricter threshold: a third of the configured delta */
   if(totalMoved <= gConvergeDelta / 3.0) {
      current->convergeVFKM = 1;
   }

   /* the remaining flags are only considered once the lower bound fits */
   if(minMoved <= gConvergeDelta) {
      current->possibleIDConverge = 1;

      if(totalMoved <= gConvergeDelta) {
         current->wouldKMConverge = 1;
      }

      if(maxMoved <= gConvergeDelta) { 
         current->guarenteeIDConverge = 1;
      } else if(gMessageLevel > 0) {
         printf("      IDKM may have or may not have converged.\n");
      }
   }

   if(!(gBatch || gAllowBadConverge)) {
      return current->guarenteeIDConverge || 
          (current->convergeVFKM && last->convergeVFKM);
   }

   if(gMessageLevel > 0 && gAllowBadConverge && current->wouldKMConverge) {
      printf("      found a potentially bad converge.\n");
   }
   return current->wouldKMConverge;
}

/*
 * Run one clustering pass over examples read from `data`, updating the
 * most recent entry of gStatsList, then build the next iteration's stats
 * and report whether the convergence check passed.
 *
 * For each example read with ExampleRead(data, es):
 *   1. compute its distance to every current centroid and pick the
 *      nearest one (ties go to the lowest index);
 *   2. credit the winner: nHat, wonSum, xMaxSquareSum and xMinSum;
 *   3. for every other centroid whose distance is within the sum of the
 *      two centroids' lastBound values of the winner's distance, count
 *      the example in nMinus and add its per-attribute offsets to that
 *      centroid's deltaPlus / deltaMinus;
 *   4. if step 3 found any such centroid, count the example in the
 *      winner's nPlus, take it back out of the winner's xMinSum, and add
 *      its offsets (with opposite sign convention) to the winner's
 *      deltaPlus / deltaMinus.
 * Unknown and non-continuous attributes are skipped in all of the above.
 *
 * Reading stops at end of input or when the stopping rule fires (see the
 * comment at the rule below).
 *
 * The new stats produced by IterationStatsNext are appended to
 * gStatsList.  Returns the result of _CheckConverganceUpdateStats when
 * that new record is non-null, otherwise 0.
 *
 * Side effects: increments gTotalExamplesSeen, appends to gStatsList,
 * prints to stdout depending on gMessageLevel.
 */
static int _DoClusterIterationDidConverge(FILE *data, ExampleSpecPtr es) {
   IterationStatsPtr stats, nextStats;
   ExamplePtr ex, center;
   float *dist;
   float *plus, *minus;
   double *winnerSums;
   float diff;
   long seen = 0;
   int winner;
   int contested;
   int stop;
   int c, a;

   stats = VALIndex(gStatsList, VALLength(gStatsList) - 1);

   /* NOTE(review): sized by gNumClusters but indexed up to
      VALLength(stats->centroids) - presumably these agree; confirm. */
   dist = MNewPtr(sizeof(float) * gNumClusters);

   if(gMessageLevel > 1) {
      printf("enter iteration %d seen %d\n", gIteration, gTotalExamplesSeen);
      fflush(stdout);
   }

   ex = ExampleRead(data, es);
   while(ex != 0) {
      seen++;
      stats->n++;
      gTotalExamplesSeen++;

      /* distance from this example to every centroid */
      for(c = 0 ; c < VALLength(stats->centroids) ; c++) {
         dist[c] = ExampleDistance(ex, VLIndex(stats->centroids, c));
      }

      /* nearest centroid; strict '<' keeps the lowest index on ties */
      winner = 0;
      for(c = 1 ; c < VALLength(stats->centroids) ; c++) {
         if(dist[c] < dist[winner]) {
            winner = c;
         }
      }

      /* credit the winning centroid */
      (stats->nHat[winner])++;
      winnerSums = stats->wonSum[winner];
      for(a = 0 ; a < ExampleSpecGetNumAttributes(es) ; a++) {
         if(ExampleIsAttributeUnknown(ex, a)) {
            continue;
         }
         if(!ExampleIsAttributeContinuous(ex, a)) {
            /* discrete attributes are not handled yet */
            continue;
         }
         stats->xMaxSquareSum[winner][a] += 
            pow(ExampleGetContinuousAttributeValue(ex, a), 2);
         stats->xMinSum[winner][a] += 
            ExampleGetContinuousAttributeValue(ex, a);
         winnerSums[a] += ExampleGetContinuousAttributeValue(ex, a);
      }

      /* look for other centroids close enough that they might have won */
      contested = 0;
      for(c = 0 ; c < VALLength(stats->centroids) ; c++) {
         if(c == winner) {
            continue;
         }
         if(!(dist[c] < dist[winner] + 
                         (stats->lastBound[c] + stats->lastBound[winner]))) {
            continue;
         }

         contested = 1;
         center = VALIndex(stats->centroids, c);
         plus = stats->deltaPlus[c];
         minus = stats->deltaMinus[c];
         (stats->nMinus[c])++;

         for(a = 0 ; a < ExampleSpecGetNumAttributes(es) ; a++) {
            if(ExampleIsAttributeUnknown(ex, a)) {
               continue;
            }
            if(!ExampleIsAttributeContinuous(ex, a)) {
               /* discrete attributes are not handled yet */
               continue;
            }
            stats->xMaxSquareSum[c][a] += 
                pow(ExampleGetContinuousAttributeValue(ex, a), 2);

            diff = ExampleGetContinuousAttributeValue(ex, a) - 
                  ExampleGetContinuousAttributeValue(center, a);
            if(diff > 0) {
               plus[a] += diff;
            } else {
               minus[a] += -diff;
            }
         }
      }

      /* the win was contested: record that on the winner as well */
      if(contested) {
         center = VALIndex(stats->centroids, winner);
         plus = stats->deltaPlus[winner];
         minus = stats->deltaMinus[winner];
         (stats->nPlus[winner])++;

         for(a = 0 ; a < ExampleSpecGetNumAttributes(es) ; a++) {
            if(ExampleIsAttributeUnknown(ex, a)) {
               continue;
            }
            if(!ExampleIsAttributeContinuous(ex, a)) {
               /* discrete attributes are not handled yet */
               continue;
            }
            /* undo the contribution added to xMinSum above */
            stats->xMinSum[winner][a] -= 
                ExampleGetContinuousAttributeValue(ex, a);

            diff = ExampleGetContinuousAttributeValue(ex, a) - 
                  ExampleGetContinuousAttributeValue(center, a);
            /* sign convention is the mirror image of the contender's */
            if(diff < 0) {
               plus[a] += -diff;
            } else {
               minus[a] += diff;
            }
         }
      }

      ExampleFree(ex);

      /* Stopping rule.  In non-batch "fancy stop" mode with a schedule
         entry for this iteration, only the schedule is consulted.
         Otherwise stop on gN (non-batch only) or once more than
         gMaxExamplesPerIteration examples have been seen. */
      if(!gBatch && gFancyStop && gIteration <= gNumIterationNs) {
         stop = (seen >= gIterationNs[gIteration - 1]);
      } else {
         stop = (!gBatch && seen >= gN) ||
                (seen > gMaxExamplesPerIteration);
      }
      if(stop) {
         break;
      }

      /* not stopped by the termination check, so fetch another example */
      ex = ExampleRead(data, es);
   }
   MFreePtr(dist);
   
   if(gMessageLevel > 1) {
      IterationStatsWrite(stats, stdout);
   }

   /* NOTE(review): the new record is appended before it is checked for
      null below - confirm whether IterationStatsNext can return null. */
   nextStats = IterationStatsNext(stats, gNeededDelta, 1.0,
                                  gAssignErrorScale, es);
   VALAppend(gStatsList, nextStats);

   if(gMessageLevel > 1) {
      printf("exit iteration %d seen %d\n", gIteration, gTotalExamplesSeen);
      fflush(stdout);
   }
   
   if(!nextStats) {
      /* we didn't converge, but this round will be stopped by
           the foundBound of 0 */
      return 0;
   }
   return _CheckConverganceUpdateStats(stats, nextStats);
}

static ExamplePtr _PickInitalCentroid(ExampleSpecPtr es, VoidAListPtr centroids, FILE *data) {
   float minDistance;
   int done  = 0;
   ExamplePtr e;
   int used;
   int i;

   //minDistance = gR / ((float)gNumClusters * 4);
   minDistance = gR / ((float)gNumClusters * 2);
   //minDistance = 0.00001;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -