⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfdt-engine.c

📁 此算法是数据挖掘中的聚类算法
💻 C
📖 第 1 页 / 共 4 页
字号:
         secondThresh = tmpThreshOne;
         secondAttrib = i;

         /* the tmpIndexTwo must be worse than what we got so ignore it */
      }
   }

   if(vfdt->messageLevel >= 3) {
      printf("Considering split, best two indecies are:\n");
      printf("   Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib);
      printf("   Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib);
   }

   if(vfdt->useGini) {
      range = 1.0;
   } else {
      range = lg(ExampleSpecGetNumClasses(vfdt->spec));
   }

   /* HERE, this will crash if splitConfidence is 0 */
   hoeffdingBound = sqrt(((range * range) * log(1/gd->splitConfidence)) / 
       (2.0 * ExampleGroupStatsNumExamplesSeen(egs)));

   /* check for prepruning */
   if(vfdt->useGini) {
      nullIndex = ExampleGroupStatsGiniTotal(egs);
   } else {
      nullIndex = ExampleGroupStatsEntropyTotal(egs);
   }

   if(firstIndex - nullIndex > hoeffdingBound) {
      if(vfdt->messageLevel >= 1) {
         printf("Prepruned based on a null-win over %d, index %f, hoeffding %f, based on %ld examples, null index %f, level %d\n",
               firstAttrib, firstIndex, 
               hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs),
               nullIndex, gd->treeLevel);
      }

      gd->prePruned = 1;
   } else if((nullIndex - firstIndex < hoeffdingBound) && 
                         hoeffdingBound < vfdt->prePruneTau) {
      if(vfdt->messageLevel >= 1) {
         printf("Prepruned based on a null tie with %d, index %f, hoeffding %f, based on %ld examples, null index %f, level %d\n",
               firstAttrib, firstIndex, 
               hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs),
               nullIndex, gd->treeLevel);
      }
      gd->prePruned = 1;
   }

   if(gd->prePruned) {
      _DoMakeLeaf(vfdt, currentNode);
   }

   if(!gd->prePruned) {
      /* if we win by the hoeffding test 
         or
         if we are in forceSplit mode and it's a good time to split */
      if((secondIndex - firstIndex > hoeffdingBound)
                || 
         (forceSplit && ExampleGroupStatsNumExamplesSeen(egs) > 5 &&
          !ExampleGroupStatsIsPure(egs))) {
                 //&& ExampleGroupStatsEntropyTotal(egs) > firstIndex)) {
          

         if(vfdt->messageLevel >= 1) {
            if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
               printf("Did discrete split on attrib %s, index %f, second best %s, index %f, hoeffding %f, based on %ld examples, presplit entropy %f, level %d\n",
                     ExampleSpecGetAttributeName(vfdt->spec, firstAttrib), 
                         firstIndex, 
                     ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
                     secondIndex, hoeffdingBound,
                     ExampleGroupStatsNumExamplesSeen(egs),
                     ExampleGroupStatsEntropyTotal(egs),
                     gd->treeLevel);
            } else {
               printf("Did continuous split on attrib %s, thresh %f, index %f, second best %s, thresh %f, index %f, hoeffding %f, based on %ld examples, presplit entropy %f, level %d\n",
		   ExampleSpecGetAttributeName(vfdt->spec, firstAttrib),
                     firstThresh, firstIndex, 
                     ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
                     secondThresh, secondIndex, hoeffdingBound,
                     ExampleGroupStatsNumExamplesSeen(egs),
                     ExampleGroupStatsEntropyTotal(egs),
                     gd->treeLevel);
            }
            if(vfdt->messageLevel >= 2) {
               printf("   allocation now: %ld\n", MGetTotalAllocation());
            }
         }

         if(vfdt->messageLevel >= 4) {
            ExampleGroupStatsWrite(egs, stdout);
         }

         if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
            _DoDiscreteSplit(vfdt, currentNode, firstAttrib);
         } else {
            _DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh);
         }

         if(vfdt->messageLevel >= 4) {
            printf("The new tree is: \n");
            DecisionTreePrint(vfdt->dtree, stdout);
         }

      } else if(hoeffdingBound < vfdt->tieConfidence && 
                 nullIndex - firstIndex > vfdt->prePruneTau) {
         /* if it looks like we have a tie */
         if(vfdt->messageLevel >= 1) {
            printf("Called it a tie between attrib %d, index %f, and %d, index %f, hoeffding %f, based on %ld examples, presplit entropy %f, level %d\n",
                  firstAttrib, firstIndex, secondAttrib, secondIndex,
                  hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs),
                  ExampleGroupStatsEntropyTotal(egs),
                  gd->treeLevel);
            if(vfdt->messageLevel >= 2) {
               printf("   allocation now: %ld\n", MGetTotalAllocation());
            }
         }

         if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
            _DoDiscreteSplit(vfdt, currentNode, firstAttrib);
         } else {
            _DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh);
         }
      } else {
         if(vfdt->messageLevel >= 2) {
            printf("Didn't split, top indexes were %f at %d and %f at %d hoeffding %f\n",
                     firstIndex, firstAttrib, secondIndex, secondAttrib,
                       hoeffdingBound);
            //ExampleGroupStatsWrite(egs, stdout);
         }

         /* now disable any that aren't promising */
         for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
            if(allIndexes[i] > firstIndex + hoeffdingBound
                   && i != firstAttrib && i != secondAttrib) {
               ExampleGroupStatsIgnoreAttribute(egs, i);

               if(vfdt->messageLevel >= 3) {
                  printf("started ignoring an attribute %d index %f, best %f\n based on %ld examples\n", i, allIndexes[i], firstIndex, ExampleGroupStatsNumExamplesSeen(egs));
               }
            } else if(!ExampleSpecIsAttributeDiscrete(vfdt->spec, i)) {
               /* start ignoring bad split points */
               if(vfdt->useGini) {
                  ExampleGroupStatsIgnoreSplitsWorseThanGini(egs,
					    i, firstIndex + hoeffdingBound);
               } else {
                  ExampleGroupStatsIgnoreSplitsWorseThanEntropy(egs,
					    i, firstIndex + hoeffdingBound);
               }
	    }
         }
      }
   }
   MFreePtr(allIndexes);
}

static void _AddExampleToGrowingNode(VFDTPtr vfdt, 
                                     DecisionTreePtr currentNode,
                                     ExamplePtr e,
                                     int checkSplits) {
   int i;
   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
   ExampleGroupStatsPtr egs = gd->egs;

   /* add the information */
   ExampleGroupStatsAddExample(egs, e);

   for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
      if(ExampleSpecIsAttributeContinuous(vfdt->spec, i)) {
         if(ExampleGroupStatsIsAttributeActive(egs, i)) {
            if(ExampleGroupStatsNumSplitThresholds(egs, i) > 1000) {
               ExampleGroupStatsStopAddingSplits(egs, i);
            }
         }
      }
   }

   if(vfdt->cacheExamples && gd->exampleCache != 0) {
      VALAppend(gd->exampleCache, e);
   } else {
      ExampleFree(e);
   }

   gd->seenSinceLastProcess++;
   gd->seenExamplesCount++;
   if((gd->seenSinceLastProcess >= vfdt->processChunkSize) && checkSplits) {
      gd->seenSinceLastProcess = 0;
      if(!ExampleGroupStatsIsPure(egs)) {
         _CheckSplit(vfdt, currentNode, 0);
      }
   }
}

static void _AddExampleToDeactivatedNode(VFDTPtr vfdt, 
                                     DecisionTreePtr currentNode,
                                     ExamplePtr e) {
   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);

   if(gd != 0) {
      /* if it's zero we've already spilt on every attribute on that path */
      gd->seenSinceDeactivated++;

      if(ExampleGetClass(e) != DecisionTreeGetClass(currentNode)) {
         gd->errorsSinceDeactivated++;
      }
   }

   ExampleFree(e);
}


static void _ProcessExampleBatch(VFDTPtr vfdt, ExamplePtr e) {
   int foundNode = 0;
   DecisionTreePtr currentNode;

   currentNode = vfdt->dtree;

   while(!foundNode && currentNode != 0) {
      if(DecisionTreeIsLeaf(currentNode)) {
         /* We are done growing in this direction, this example isn't needed */
         foundNode = 1;
      } else if(DecisionTreeIsNodeGrowing(currentNode)) {
         foundNode = 1;
      }

      if(!foundNode) {
         currentNode = DecisionTreeOneStepClassify(currentNode, e);
      }
   }
      
   if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
      /* here, this doesn't check for splits */
      _AddExampleToGrowingNode(vfdt, currentNode, e, 0);
   } else {
      _AddExampleToDeactivatedNode(vfdt, currentNode, e);
      if(vfdt->messageLevel >= 3) {
         printf("not learning from:");
         ExampleWrite(e, stdout);
      }
   }

   if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
      _DoReactivateScan(vfdt);
   }

   if(vfdt->maxAllocationBytes < MGetTotalAllocation()) {
      if(vfdt->cacheExamples) {
         _FlushExampleCachesCheckStop(vfdt);
      } else {
         /* HERE may want to control how many to leafify with a flag */
         _LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
      }
   }

   if((vfdt->messageLevel >= 1) && (vfdt->examplesSeen % 1000 == 0)) {
      printf("processed %ld examples\n", vfdt->examplesSeen);
   }
}


void _forceSplits(VFDTPtr vfdt, DecisionTreePtr dt) {
   int i;

   if(DecisionTreeIsLeaf(dt) || DecisionTreeIsNodeGrowing(dt)) {
      _CheckSplit(vfdt, dt, 1);
   }
   
   if(!DecisionTreeIsLeaf(dt) && !DecisionTreeIsNodeGrowing(dt)) {
      for(i = 0 ; i < DecisionTreeGetChildCount(dt) ; i++) {
         _forceSplits(vfdt, DecisionTreeGetChild(dt, i));
      }
   }
}


static void _PrintSpaces(FILE *out, int num) {
   int i;

   for(i = 0 ; i < num ; i++) {
      fprintf(out, " ");
   }
}

static void _PrintHelp(DecisionTreePtr dt, FILE *out, int indent) {
   int i;
   VFDTGrowingDataPtr gd;

   _PrintSpaces(out, indent);
   if(dt->nodeType == dtnLeaf || dt->nodeType == dtnGrowing) {
     //fprintf(out, "(leaf: %d)\n", dt->myclass);
      fprintf(out, "(leaf: %s -", 
             ExampleSpecGetClassValueName(dt->spec, dt->myclass));
      gd = DecisionTreeGetGrowingData(dt);
      for(i = 0 ; i < ExampleSpecGetNumClasses(dt->spec) ; i++) {
         fprintf(out, " %ld", gd->egs->classTotals[i]);
      }
      fprintf(out, ")\n");
   } else if(dt->nodeType == dtnDiscrete) {
      fprintf(out, "(split on %s:\n",
             ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));
      for(i = 0 ; i < VALLength(dt->children) ; i++) {
         _PrintSpaces(out, indent + 1);
         fprintf(out, "%s\n",
          ExampleSpecGetAttributeValueName(dt->spec, dt->splitAttribute, i));
         _PrintHelp(VALIndex(dt->children, i), out, indent + 2);
      }
      _PrintSpaces(out, indent);
      fprintf(out, ")\n");
   } else if(dt->nodeType == dtnContinuous) {
      fprintf(out, "(split on %s:\n",
             ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));

      /* left child */
      _PrintSpaces(out, indent + 1);
      fprintf(out, "< %f\n", dt->splitThreshold);
      _PrintHelp(VALIndex(dt->children, 0), out, indent + 2);

      /* right child */
      _PrintSpaces(out, indent + 1);
      fprintf(out, ">= %f\n", dt->splitThreshold);
      _PrintHelp(VALIndex(dt->children, 1), out, indent + 2);


      _PrintSpaces(out, indent);
      fprintf(out, ")\n");
   } else if(dt->nodeType == dtnGrowing) {
      fprintf(out, "(growing)\n");
   }
}

static void _DecisionTreeWrite(DecisionTreePtr dt) {
   _PrintHelp(dt, stdout, 0);
}

void _ProcessExamplesBatch(VFDTPtr vfdt, FILE *input) {
   ExamplePtr e;

   e = ExampleRead(input, vfdt->spec);

   while(e != 0 && vfdt->numGrowing > 0) {
      _ProcessExampleBatch(vfdt, e);
      vfdt->examplesSeen++;

      e = ExampleRead(input, vfdt->spec);
   }

   _forceSplits(vfdt, vfdt->dtree);

   //_DecisionTreeWrite(vfdt->dtree);

   if(vfdt->messageLevel >= 1) {
      printf("finished with all the examples, there are %ld growing nodes\n",
              vfdt->numGrowing);
   }
}

static void _ProcessExample(VFDTPtr vfdt, ExamplePtr e) {
   int foundNode = 0;
   DecisionTreePtr currentNode;

   currentNode = vfdt->dtree;

   while(!foundNode && currentNode != 0) {
      if(DecisionTreeIsLeaf(currentNode)) {
         /* We are done growing in this direction, this example isn't needed */
         foundNode = 1;
      } else if(DecisionTreeIsNodeGrowing(currentNode)) {
         foundNode = 1;
      }

      if(!foundNode) {
         currentNode = DecisionTreeOneStepClassify(currentNode, e);
      }
   }
      
   if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
      _AddExampleToGrowingNode(vfdt, currentNode, e, 1);
   } else {
      _AddExampleToDeactivatedNode(vfdt, currentNode, e);
      if(vfdt->messageLevel >= 3) {
         printf("not learning from:");
         ExampleWrite(e, stdout);
      }
   }

   if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
      _DoReactivateScan(vfdt);
   }

   if(vfdt->maxAllocationBytes < MGetTotalAllocation()) {
      if(vfdt->cacheExamples) {
         _FlushExampleCachesCheckStop(vfdt);
      } else {
         /* HERE may want to control how many to leafify with a flag */
         _LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
      }
   }
}


void _ProcessExamples(VFDTPtr vfdt, FILE *input) {
   ExamplePtr e;

   e = ExampleRead(input, vfdt->spec);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -