⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfdt-engine.c

📁 此算法是数据挖掘中的聚类算法
💻 C
📖 第 1 页 / 共 4 页
字号:
      flushed = 0;
      for(i = 0 ; i < VALLength(growingNodes) ; i++) {
         gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(growingNodes, i));
         if(gd->exampleCache != 0) {
            flushed++;
            _GrowingDataFreeExampleCache(gd);
         }
      }
      vfdt->cacheExamples = 0;
      if(vfdt->messageLevel >= 1) {
         printf("Stopped caching, there were %ld active...\n", flushed);
      }
   }

   VALFree(growingNodes);
}

/* Scan every leaf of the tree and consider turning the most promising
   deactivated ones back into growing nodes.  The best 5% of leaves (by
   reactivation index) are collected; any of them whose index beats the
   index of the most recently deactivated node is reactivated.
   Afterwards, if total allocation exceeds the configured budget, memory
   is reclaimed by flushing example caches (when caching) or by
   deactivating the least important growing nodes. */
static void _DoReactivateScan(VFDTPtr vfdt) {
   long number, allocation;
   float *indexes;
   DecisionTreePtr *nodes;
   int i, j;
   float thisIndex, tmpIndex;
   DecisionTreePtr cNode, tmpNode;
   VoidAListPtr leaves = VALNew();
   VFDTGrowingDataPtr gd;

   DecisionTreeGatherLeaves(vfdt->dtree, leaves);

   /* Remove all pre-pruned nodes from consideration.  Iterate backwards
      so that removing element i does not shift a yet-unexamined element
      into slot i: the original forward scan incremented i after a
      removal and therefore skipped the element that slid into the
      removed slot. */
   for(i = VALLength(leaves) - 1 ; i >= 0 ; i--) {
      tmpNode = (DecisionTreePtr)VALIndex(leaves, i);
      gd = DecisionTreeGetGrowingData(tmpNode);

      if(gd->prePruned) {
         VALRemove(leaves, i);
      }
   }

   /* only consider the best 5% of the leaves */
   number = VALLength(leaves) * 0.05;

   if(number < 5) {
      /* too few candidates to be worth a scan */
      VALFree(leaves);
      return;
   }

   if(vfdt->messageLevel >= 1) {
      printf("looking into reactivating %ld nodes\n", number);
   }

   indexes = MNewPtr(sizeof(float) * number);
   nodes   = MNewPtr(sizeof(DecisionTreePtr) * number);

   /* seed the candidate set with the first 'number' leaves */
   for(i = 0 ; i < number ; i++) {
     nodes[i] = VALIndex(leaves, i);
     indexes[i] = _CalculateReactivateIndex(nodes[i], vfdt->examplesSeen);
   }

   /* EFF I should use some kind of something smart here if 
      I'm trying to consider a lot of nodes */

   /* Scan the remaining leaves; whenever one beats a held candidate,
      swap it in and let the displaced candidate keep competing for the
      remaining slots.  This keeps the 'number' largest indexes (in no
      particular order). */
   for(i = number ; i < VALLength(leaves) ; i++) {
      thisIndex = _CalculateReactivateIndex(VALIndex(leaves, i),
                                                      vfdt->examplesSeen);
      cNode = VALIndex(leaves, i);
      for(j = 0 ; j < number ; j++) {
         if(thisIndex > indexes[j]) {
            tmpIndex = indexes[j];
            indexes[j] = thisIndex;
            thisIndex = tmpIndex;
            tmpNode = nodes[j];
            nodes[j] = cNode;
            cNode = tmpNode;
         }
      }
   }

   /* now make all of them that look good compared to our last leafify
       back into growing nodes */
   for(i = 0 ; i < number ; i++) {
      if(indexes[i] > vfdt->highestDeactivatedIndex) {
         _DoReactivateLeaf(vfdt, nodes[i]);
         if(vfdt->messageLevel >= 2) {
            printf("reactivating a node with index %f\n", indexes[i]);
         }
      }
   }

   MFreePtr(indexes);
   MFreePtr(nodes);
   VALFree(leaves);

   /* reactivation may have allocated memory; enforce the budget */
   allocation = MGetTotalAllocation();
   if(vfdt->maxAllocationBytes < allocation) {
      if(vfdt->cacheExamples) {
         _FlushExampleCachesCheckStop(vfdt);
      } else {
         /* HERE may want to control how many to leafify with a flag */
         _LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
      }
   }

}

/* Forward declaration: adds example e to the statistics of the growing
   node currentNode; used by the split routines below to redistribute a
   parent's cached examples into its new children. */
static void _AddExampleToGrowingNode(VFDTPtr vfdt, 
                                     DecisionTreePtr currentNode,
                                     ExamplePtr e,
                                     int checkSplits);

/* Returns the split confidence (delta) to use at a node choosing among
   numChoices candidate attributes.  When doBonferonni is set, applies a
   Bonferroni-style correction for multiple comparisons by dividing the
   configured splitConfidence by the number of choices; otherwise returns
   the configured splitConfidence unchanged.  (Removed a stale block of
   commented-out alternative formulas and debug printf.) */
static float _BonferonniDelta(VFDTPtr vfdt, int numChoices) {
   if(vfdt->doBonferonni) {
      return vfdt->splitConfidence / (float)numChoices;
   } else {
      return vfdt->splitConfidence;
   }
}

/* Split the growing leaf currentNode on continuous attribute attNum at
   the given threshold, creating two children: child 0 covers examples
   with value < threshold, child 1 covers value >= threshold.  Each
   child gets fresh growing data (statistics, attribute tracker clone,
   confidence via _BonferonniDelta) and inherits the parent's most
   common class as its initial prediction.  The parent's example cache,
   if any, is redistributed to the children, then the parent's growing
   data is freed.  Finally, if total allocation exceeds the configured
   budget, memory is reclaimed. */
static void _DoContinuousSplit(VFDTPtr vfdt,
                                DecisionTreePtr currentNode, int attNum,
                                float threshold) {
   int i;
   AttributeTrackerPtr newAT;
   int childCount;

   VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)(DecisionTreeGetGrowingData(currentNode));
   AttributeTrackerPtr at =
                     ExampleGroupStatsGetAttributeTracker(parentGD->egs);
   VFDTGrowingDataPtr gd;
   long allocation;
   int parentClass;
   float parentErrorRate;
   DecisionTreePtr parent, target;
   ExamplePtr e;

   parent = currentNode;

   DebugMessage(1, 2, "  Continuous split info, attrib %s, thresh %f\n",
              ExampleSpecGetAttributeName(vfdt->spec, attNum), threshold);

   DecisionTreeSplitOnContinuousAttribute(currentNode, attNum, threshold);
   childCount = DecisionTreeGetChildCount(currentNode);

   /* the parent stops growing; each new child becomes a growing node */
   vfdt->numGrowing--;
   vfdt->numGrowing += childCount;

   parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs);

   /* fraction of the parent's examples NOT in its most common class;
      recorded in each child's growing data */
   parentErrorRate =
       (1.0 - (ExampleGroupStatsGetMostCommonClassCount(parentGD->egs) /
            (float)ExampleGroupStatsNumExamplesSeen(parentGD->egs)));

   /* left child, < threshold */
   newAT = AttributeTrackerClone(at);

   gd = MNewPtr(sizeof(VFDTGrowingData));
   gd->egs = ExampleGroupStatsNew(vfdt->spec, newAT);
   gd->parentClass = parentClass;
   
   gd->treeLevel = parentGD->treeLevel + 1;
   gd->prePruned = 0;
   gd->splitConfidence = _BonferonniDelta(vfdt,
                              AttributeTrackerNumActive(newAT));

   if(vfdt->cacheExamples) {
      gd->exampleCache = VALNew();
   } else {
      gd->exampleCache = 0;
   }
   gd->parentErrorRate = parentErrorRate;

   /* estimate the child's example count from the fraction of the
      parent's examples that fell below the threshold */
   gd->seenExamplesCount = 
                  ExampleGroupStatsGetPercentBelowThreshold(parentGD->egs,
                    attNum, threshold) * parentGD->seenExamplesCount;

   gd->seenSinceLastProcess = 0;

   DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode, 0), gd);
   DecisionTreeSetClass(DecisionTreeGetChild(currentNode, 0),
                                               parentClass);
   /* right child, >= threshold */
   newAT = AttributeTrackerClone(at);

   gd = MNewPtr(sizeof(VFDTGrowingData));
   gd->egs = ExampleGroupStatsNew(vfdt->spec, newAT);
   gd->parentClass = parentClass;

   gd->treeLevel = parentGD->treeLevel + 1;
   gd->prePruned = 0;
   gd->splitConfidence = _BonferonniDelta(vfdt,
                              AttributeTrackerNumActive(newAT));

   if(vfdt->cacheExamples) {
      gd->exampleCache = VALNew();
   } else {
      gd->exampleCache = 0;
   }
   gd->parentErrorRate = parentErrorRate;

   /* complement of the left child's share */
   gd->seenExamplesCount = (1.0 - 
               ExampleGroupStatsGetPercentBelowThreshold(parentGD->egs,
                    attNum, threshold)) * parentGD->seenExamplesCount;

   gd->seenSinceLastProcess = 0;

   DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode, 1), gd);
   DecisionTreeSetClass(DecisionTreeGetChild(currentNode, 1),
                                               parentClass);

   /* split the example cache: route each cached example one step down
      the new split into the matching child, with checkSplits off so
      redistribution itself can't trigger further splits */
   if(parentGD->exampleCache != 0) {
      for(i = VALLength(parentGD->exampleCache) - 1 ; i >= 0 ; i--) {
         e = VALRemove(parentGD->exampleCache, i);
         target = DecisionTreeOneStepClassify(parent, e);
         _AddExampleToGrowingNode(vfdt, target, e, 0);
      }
   }

   //_DecisionTreeWrite(currentNode);

   /* this will check to see if there is a cache first */
   _GrowingDataFreeExampleCache(parentGD);

   ExampleGroupStatsFree(parentGD->egs);
   MFreePtr(parentGD);

   DecisionTreeSetGrowingData(currentNode, 0);

   /* the split allocated child stats; enforce the memory budget */
   allocation = MGetTotalAllocation();
   if(vfdt->maxAllocationBytes < allocation) {
      if(vfdt->cacheExamples) {
         _FlushExampleCachesCheckStop(vfdt);
      } else {
         /* HERE may want to control how many to leafify with a flag */
         _LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
      }
   }
}

/* Split the growing leaf splitNode on discrete attribute attNum, making
   one child per attribute value.  Each child gets fresh growing data
   (with attNum marked inactive in its attribute tracker, so it won't be
   considered again on this path) and inherits the parent's most common
   class as its initial prediction.  If every attribute is already
   inactive the node is instead turned into a plain leaf.  The parent's
   example cache, if any, is redistributed to the children, then the
   parent's growing data is freed; finally the memory budget is
   enforced. */
static void _DoDiscreteSplit(VFDTPtr vfdt, DecisionTreePtr splitNode,
                             int attNum) {
   int i;
   AttributeTrackerPtr newAT;
   int childCount;

   VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)(DecisionTreeGetGrowingData(splitNode));
   AttributeTrackerPtr at =
                     ExampleGroupStatsGetAttributeTracker(parentGD->egs);
   VFDTGrowingDataPtr gd;
   long allocation;
   int parentClass;
   float parentErrorRate;
   DecisionTreePtr target;
   ExamplePtr e;

   parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs);

   if(AttributeTrackerAreAllInactive(at)) {
      /* made a split on every attribute along this path so don't try
          to split any more */
      /* NOTE(review): vfdt->numGrowing is not decremented here even
         though this node stops growing — confirm that is intentional */
      DecisionTreeSetTypeLeaf(splitNode);
      DecisionTreeSetClass(splitNode, parentClass);
   } else {
      DecisionTreeSplitOnDiscreteAttribute(splitNode, attNum);
      childCount = DecisionTreeGetChildCount(splitNode);

      /* the parent stops growing; each new child becomes a growing node */
      vfdt->numGrowing--;
      vfdt->numGrowing += childCount;

      /* fraction of the parent's examples NOT in its most common class;
         recorded in each child's growing data */
      parentErrorRate =
          (1.0 - (ExampleGroupStatsGetMostCommonClassCount(parentGD->egs) /
               (float)ExampleGroupStatsNumExamplesSeen(parentGD->egs)));

      for(i = 0 ; i < childCount ; i++) {
         newAT = AttributeTrackerClone(at);
         /* a discrete attribute is used up once split on */
         AttributeTrackerMarkInactive(newAT, attNum);

         gd = MNewPtr(sizeof(VFDTGrowingData));
         gd->egs = ExampleGroupStatsNew(vfdt->spec, newAT);
         gd->parentClass = parentClass;

         gd->treeLevel = parentGD->treeLevel + 1;
         gd->prePruned = 0;
         gd->splitConfidence = _BonferonniDelta(vfdt,
                     AttributeTrackerNumActive(newAT));

         if(vfdt->cacheExamples) {
            gd->exampleCache = VALNew();
         } else {
            gd->exampleCache = 0;
         }
         gd->parentErrorRate = parentErrorRate;

         /* estimate the child's example count from the fraction of the
            parent's examples that had this attribute value */
         gd->seenExamplesCount =
                 ExampleGroupStatsGetValuePercent(parentGD->egs,
                          attNum, i) * parentGD->seenExamplesCount;

         gd->seenSinceLastProcess = 0;

         DecisionTreeSetGrowingData(DecisionTreeGetChild(splitNode, i), gd);
         DecisionTreeSetClass(DecisionTreeGetChild(splitNode, i),
                                                  parentClass);
      }

      /* split the example cache: route each cached example one step
         down the new split into the matching child, with checkSplits
         off so redistribution itself can't trigger further splits */
      if(parentGD->exampleCache != 0) {
         for(i = VALLength(parentGD->exampleCache) - 1 ; i >= 0 ; i--) {
            e = VALRemove(parentGD->exampleCache, i);
            target = DecisionTreeOneStepClassify(splitNode, e);
            _AddExampleToGrowingNode(vfdt, target, e, 0);
         }
      }
   }

   /* this will check to see if there is a cache first */
   _GrowingDataFreeExampleCache(parentGD);

   ExampleGroupStatsFree(parentGD->egs);
   MFreePtr(parentGD);

   DecisionTreeSetGrowingData(splitNode, 0);

   /* the split allocated child stats; enforce the memory budget */
   allocation = MGetTotalAllocation();
   if(vfdt->maxAllocationBytes < allocation) {
      if(vfdt->cacheExamples) {
         _FlushExampleCachesCheckStop(vfdt);
      } else {
         /* HERE may want to control how many to leafify with a flag */
         _LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
      }
   }
}

static void _CheckSplit(VFDTPtr vfdt, DecisionTreePtr currentNode, int forceSplit) {
   float hoeffdingBound;
   int i;
   float range;
   float *allIndexes;
   float firstIndex, secondIndex, nullIndex;
   int firstAttrib, secondAttrib;
   float firstThresh, secondThresh;
   float tmpIndexOne, tmpIndexTwo, tmpThreshOne, tmpThreshTwo;
   VFDTGrowingDataPtr gd = DecisionTreeGetGrowingData(currentNode);
   ExampleGroupStatsPtr egs = gd->egs;


   /* count the bounds that need to hold */
   vfdt->numBoundsUsed += 
        AttributeTrackerNumActive(ExampleGroupStatsGetAttributeTracker(egs));

   firstIndex = secondIndex = 10000;
   firstAttrib = secondAttrib = -1;
   firstThresh = secondThresh = 0;
   allIndexes =
          MNewPtr(sizeof(float) * ExampleSpecGetNumAttributes(vfdt->spec));
   for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
      if(ExampleSpecIsAttributeDiscrete(vfdt->spec, i)) {
         /* set up the second tmp index to look really bad (unless updated by
              a continuous attribute to look better) */
         tmpIndexTwo = 10000;
         if(vfdt->useGini) {
            tmpIndexOne = ExampleGroupStatsGiniDiscreteAttributeSplit(egs, i);
         } else {
            tmpIndexOne = 
                     ExampleGroupStatsEntropyDiscreteAttributeSplit(egs, i);
         }
      } else { /* continuous attribute */
         if(vfdt->useGini) {
            ExampleGroupStatsGiniContinuousAttributeSplit(egs, i,
                   &tmpIndexOne, &tmpThreshOne, &tmpIndexTwo, &tmpThreshTwo);
         } else {
            ExampleGroupStatsEntropyContinuousAttributeSplit(egs, i,
                   &tmpIndexOne, &tmpThreshOne, &tmpIndexTwo, &tmpThreshTwo);
         }
      }

      allIndexes[i] = tmpIndexOne;

      if(tmpIndexOne < firstIndex) {
         /* bump the first one over */
         secondIndex = firstIndex;
         secondThresh = firstThresh;
         secondAttrib = firstAttrib;

         /* replace the first one */
         firstIndex = tmpIndexOne;
         firstThresh = tmpThreshOne;
         firstAttrib = i;

         if(tmpIndexTwo < secondIndex) {
            /* replace the second one */
            secondIndex = tmpIndexTwo;
            secondThresh = tmpThreshTwo;
            secondAttrib = i;
         }
      } else if(tmpIndexOne < secondIndex) {
         /* replace the second one */
         secondIndex = tmpIndexOne;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -