⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfdt-engine.c

📁 此算法是数据挖掘中的聚类算法
💻 C
📖 第 1 页 / 共 4 页
字号:
#include "vfdt-engine.h"
#include "memory.h"
#include <math.h>

#include <stdlib.h> /* for system for bootstrapc45 */

#define lg(x)       (log(x) / log(2))

static void _DecisionTreeWrite(DecisionTreePtr dt);
static float _BonferonniDelta(VFDTPtr vfdt, int numChoices);

static void _GrowingDataFlushExampleCache(VFDTGrowingDataPtr gd) {
   int i;
   if(gd->exampleCache != 0) {
      for(i = VALLength(gd->exampleCache) - 1 ; i >= 0 ; i--) {
         ExampleFree(VALRemove(gd->exampleCache, i));
      }
   }
}

static void _GrowingDataFreeExampleCache(VFDTGrowingDataPtr gd) {
   if(gd->exampleCache != 0) {
      _GrowingDataFlushExampleCache(gd);
      VALFree(gd->exampleCache);
      gd->exampleCache = 0;
   }
}

VFDTPtr VFDTNew(ExampleSpecPtr spec, float splitConfidence,
           float tieConfidence) {
   VFDTGrowingDataPtr gd = MNewPtr(sizeof(VFDTGrowingData));
   VFDTPtr vfdt = MNewPtr(sizeof(VFDT));
   AttributeTrackerPtr at;

   vfdt->spec = spec;
   vfdt->dtree = DecisionTreeNew(spec);

   at = AttributeTrackerInitial(vfdt->spec);
   gd->egs = ExampleGroupStatsNew(vfdt->spec, at);
                    

   vfdt->cacheExamples = 1;
   vfdt->recoverMinimum = 3 * 1024 * 1024;
   if(vfdt->cacheExamples) {
      gd->exampleCache = VALNew();
   } else {
      gd->exampleCache = 0;
   }

   //   gd->startedOnExampleNum = 0;
   gd->seenExamplesCount = 0;
   gd->seenSinceLastProcess = 0;

   gd->treeLevel = 0;
   gd->prePruned = 0;

   DecisionTreeSetTypeGrowing(vfdt->dtree);
   DecisionTreeSetGrowingData(vfdt->dtree, gd);

   vfdt->processChunkSize = 300;
   vfdt->splitConfidence = splitConfidence;
   vfdt->tieConfidence = tieConfidence;

   vfdt->useGini = 0;
   vfdt->messageLevel = 0;
   vfdt->examplesSeen = 0;
   vfdt->numBoundsUsed = 0;
   vfdt->numGrowing = 1;
   vfdt->maxAllocationBytes = 10000;
   vfdt->maxAllocationBytes *= 1024;
   vfdt->maxAllocationBytes *= 1024;

   vfdt->reactivateLeaves = 1;
   vfdt->highestDeactivatedIndex = 1.0;
   vfdt->reactivateScanPeriod = 1473777;

   vfdt->batchMode = 0;
   vfdt->prePruneTau = 0.0;
   vfdt->laplace = 0;

   /* needs the completed vfdt, so make sure it happens after that */
   gd->splitConfidence = _BonferonniDelta(vfdt, AttributeTrackerNumActive(at));

   return vfdt;
}

void VFDTFree(VFDTPtr vfdt) {
   int i;
   VoidAListPtr list = VALNew();
   VFDTGrowingDataPtr gd;

   DecisionTreeGatherGrowingNodes(vfdt->dtree, list);

   for(i = VALLength(list) - 1 ; i >= 0 ; i--) {
      gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(list,i));

      _GrowingDataFreeExampleCache(gd);

      ExampleGroupStatsFree(gd->egs);
      MFreePtr(gd);
   }

   VALFree(list);

   DecisionTreeFree(vfdt->dtree);
   MFreePtr(vfdt);
}

void VFDTSetMessageLevel(VFDTPtr vfdt, int value) {
   vfdt->messageLevel = value;
}

void VFDTSetMaxAllocationMegs(VFDTPtr vfdt, int value) {
   vfdt->maxAllocationBytes = value;
   vfdt->maxAllocationBytes *= 1024;
   vfdt->maxAllocationBytes *= 1024;
}

void VFDTSetProcessChunkSize(VFDTPtr vfdt, int value) {
   vfdt->processChunkSize = value;
}

void VFDTSetUseGini(VFDTPtr vfdt, int value) {
   vfdt->useGini = value;
}

void VFDTSetRestartLeaves(VFDTPtr vfdt, int value) {
   vfdt->reactivateLeaves = value;
}

void VFDTSetCacheTrainingExamples(VFDTPtr vfdt, int value) {
   VoidAListPtr growingNodes;
   VFDTGrowingDataPtr gd;
   int i;

   vfdt->cacheExamples = value;

   if(vfdt->cacheExamples == 0) {
      /* disable this everywhere HERE maybe enable it everywhere in the
            case where the value is 1?? */

      growingNodes = VALNew();
      DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);
      
      for(i = 0 ; i < VALLength(growingNodes) ; i++) {
         gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(growingNodes, i));
         if(gd->exampleCache != 0) {
            _GrowingDataFreeExampleCache(gd);
         }
      }

      VALFree(growingNodes);
   }   
}

void VFDTSetPrePruneTau(VFDTPtr vfdt, float value) {
   vfdt->prePruneTau = value;
}

void VFDTSetLaplace(VFDTPtr vfdt, int value) {
   vfdt->laplace = value;
}

void VFDTSetDoBonferonni(VFDTPtr vfdt, int value) {
   vfdt->doBonferonni = value;
}

static void _DoMakeLeaf(VFDTPtr vfdt, DecisionTreePtr currentNode) {
   VFDTGrowingDataPtr gd = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode));
   ExampleGroupStatsPtr egs = gd->egs;
   int mostCommonClass = ExampleGroupStatsGetMostCommonClassLaplace(egs,
                    gd->parentClass, vfdt->laplace);

   vfdt->numGrowing--;
   DecisionTreeSetTypeLeaf(currentNode);

   DecisionTreeSetClass(currentNode, mostCommonClass);

   gd->seenSinceDeactivated = 0;
   gd->errorsSinceDeactivated = 0;

   ExampleGroupStatsDeactivate(egs);
   _GrowingDataFreeExampleCache(gd);
//   ExampleGroupStatsFree(egs);
//   MFreePtr(DecisionTreeGetGrowingData(currentNode));
}

static void _DoReactivateLeaf(VFDTPtr vfdt, DecisionTreePtr currentNode) {
   VFDTGrowingDataPtr gd = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode));
   ExampleGroupStatsPtr egs = gd->egs;

   vfdt->numGrowing++;
   DecisionTreeSetTypeGrowing(currentNode);

   ExampleGroupStatsReactivate(egs);
   if(vfdt->cacheExamples) {
      gd->exampleCache = VALNew();
   }
}

static float _CalculateReactivateIndex(DecisionTreePtr node, long totalSeen) {
   /* this is the same hueristic as the calculate leafify index but gets 
              its informationg from a slightly different place */
   float percent;
   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(node);
   float estimatedError;

   if(gd == 0) {
      /* this would happen if we've already split on attribute */
      return 0;
   }

   percent = gd->seenExamplesCount / totalSeen;

   /* use a laplace correction for the estimated error: 
          (errors + (m * parentError)) / (seen + m) */

   estimatedError = 
      (gd->errorsSinceDeactivated +
          (10 * gd->parentErrorRate)) / 
      (10 + gd->seenSinceDeactivated);

   /* return an estimate of the percentage of examples falling on 
        weighted by the accuracy the node would have if we pruned now */

   /* leaves that give more long term benefit should have higher values */
   return percent * estimatedError;
}

static float _CalculateLeafifyIndex(DecisionTreePtr node, long totalSeen) {
   float percent;
   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(node);
   float estimatedError;

   percent = gd->seenExamplesCount / totalSeen;

   /* use a laplace correction for the estimated error: 
          (errors + (m * parentError)) / (seen + m) */

   estimatedError = 
      ((ExampleGroupStatsNumExamplesSeen(gd->egs) -
          ExampleGroupStatsGetMostCommonClassCount(gd->egs)) +
          (10 * gd->parentErrorRate)) / 
      (10 + ExampleGroupStatsNumExamplesSeen(gd->egs));

   /* return an estimate of the percentage of examples falling on 
        weighted by the accuracy the node would have if we pruned now */

   /* leaves that give more long term benefit should have higher values */
   return percent * estimatedError;
}

static void _LeafifyLeastImportantNodes(VFDTPtr vfdt, long number) {
   float *indexes;
   DecisionTreePtr *nodes;
   int i, j;
   float thisIndex, tmpIndex;
   DecisionTreePtr cNode, tmpNode;
   VoidAListPtr growingNodes = VALNew();

   DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);

   if(vfdt->messageLevel >= 1) {
      printf("leafifying %ld of %d growing nodes\n", number, VLLength(growingNodes));
   }

   indexes = MNewPtr(sizeof(float) * number);
   nodes   = MNewPtr(sizeof(DecisionTreePtr) * number);

   /* take the first bunch of growing nodes to start */
   for(i = 0 ; i < number ; i++) {
     nodes[i] = VALIndex(growingNodes, i);
     indexes[i] = _CalculateLeafifyIndex(nodes[i], vfdt->examplesSeen);
   }

   /* EFF I should use some kind of something smart here if 
      I'm trying to prune a lot of nodes */

   /* now look at the rest of the growing nodes, if they are worse use them */
   for(i = number ; i < VALLength(growingNodes) ; i++) {
      thisIndex = _CalculateLeafifyIndex(VALIndex(growingNodes, i),
                                                      vfdt->examplesSeen);
      cNode = VALIndex(growingNodes, i);
      for(j = 0 ; j < number ; j++) {
         if(thisIndex < indexes[j]) {
            tmpIndex = indexes[j];
            indexes[j] = thisIndex;
            thisIndex = tmpIndex;
            tmpNode = nodes[j];
            nodes[j] = cNode;
            cNode = tmpNode;
         }
      }
   }

   vfdt->highestDeactivatedIndex = 0;
   /* now make them all leaves! */
   for(i = 0 ; i < number ; i++) {
      if(indexes[i] > vfdt->highestDeactivatedIndex) {
         vfdt->highestDeactivatedIndex = indexes[i];
      }
      _DoMakeLeaf(vfdt, nodes[i]);
      if(vfdt->messageLevel >= 2) {
         printf("leafifying a node with index %g\n", indexes[i]);
      }
   }

   MFreePtr(indexes);
   MFreePtr(nodes);
   VALFree(growingNodes);
}

void _FlushExampleCachesCheckStop(VFDTPtr vfdt) {
   long initialAllocation = MGetTotalAllocation();
   float *indexes;
   DecisionTreePtr *nodes;
   int i, j;
   float thisIndex, tmpIndex;
   DecisionTreePtr cNode, tmpNode;
   long finalAllocation;
   VoidAListPtr growingNodes;
   VFDTGrowingDataPtr gd;
   long number;
   long flushed;

   if(!vfdt->cacheExamples) { return; }

   growingNodes = VALNew();
   if(vfdt->messageLevel >= 2) {
      printf("flushing example cache...\n");
   }

   DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);

   number = VALLength(growingNodes) * .35;
   /* round up */
   number++;


   if(vfdt->messageLevel >= 2) {
      printf("thinking about flushing %ld caches\n", number);
   }

   indexes = MNewPtr(sizeof(float) * number);
   nodes   = MNewPtr(sizeof(DecisionTreePtr) * number);

   /* take the first bunch of growing nodes to start */
   for(i = 0 ; i < number ; i++) {
      nodes[i] = VALIndex(growingNodes, i);
      gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(nodes[i]);
      if(gd->exampleCache == 0) {
         indexes[i] = 1.0;
      } else {
         indexes[i] = _CalculateLeafifyIndex(nodes[i], vfdt->examplesSeen);
      }
   }

   /* EFF I should use some kind of something smart here if 
      I'm trying to do this to a lot of nodes */

   /* now look at the rest of the growing nodes, if they are worse use them */
   for(i = number ; i < VALLength(growingNodes) ; i++) {
      gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(
                                                growingNodes, i));
      if(gd->exampleCache == 0) {
         thisIndex = 1.0;
      } else {
         thisIndex = _CalculateLeafifyIndex(VALIndex(growingNodes, i),
                                                      vfdt->examplesSeen);
      }

      cNode = VALIndex(growingNodes, i);
      for(j = 0 ; j < number ; j++) {
         if(thisIndex < indexes[j]) {
            tmpIndex = indexes[j];
            indexes[j] = thisIndex;
            thisIndex = tmpIndex;
            tmpNode = nodes[j];
            nodes[j] = cNode;
            cNode = tmpNode;
         }
      }
   }

   flushed = 0;
   for(i = 0 ; i < number && (initialAllocation - MGetTotalAllocation() < (0.05 * vfdt->maxAllocationBytes)); i++) {
      gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(nodes[i]);
      if(gd->exampleCache != 0) {
         flushed++;
         _GrowingDataFreeExampleCache(gd);
      }
   }

   MFreePtr(indexes);
   MFreePtr(nodes);

   finalAllocation = MGetTotalAllocation();

   /* HERE, this .05 should be a parameter & it is used above as well */
   if(vfdt->messageLevel >= 2) {
      printf("flushed %ld freed %ld bytes stop threshold %.2f\n",
               flushed, initialAllocation - finalAllocation,
               0.05 * vfdt->maxAllocationBytes);
   }
   if(initialAllocation - finalAllocation < (0.05 * vfdt->maxAllocationBytes)) {
      /* if we don't recover at least some minimum then stop caching */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -