⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfdt-engine.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
            secondAttrib = i;
         }
      } else if(tmpIndexOne < secondIndex) {
         /* replace the second one */
         secondIndex = tmpIndexOne;
         secondThresh = tmpThreshOne;
         secondAttrib = i;

         /* the tmpIndexTwo must be worse than what we got so ignore it */
      }
   }

   if(vfdt->messageLevel >= 3) {
      printf("Considering split, best two indecies are:\n");
      printf("   Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib);
      printf("   Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib);
   }

   if(vfdt->useGini) {
      range = 1.0;
   } else {
      range = log(ExampleSpecGetNumClasses(vfdt->spec));
   }
   /* HERE, this will crash if splitConfidence is 0 */
   hoeffdingBound = sqrt(((range * range) * log(1/gd->splitConfidence)) / 
       (2.0 * ExampleGroupStatsNumExamplesSeen(egs)));

   /* if we win by the hoeffding test and we aren't pre-pruned
      or
      if we are in forceSplit mode and it's a good time to split */
   if(((secondIndex - firstIndex > hoeffdingBound) && 
       (!vfdt->prePrune || ExampleGroupStatsEntropyTotal(egs) > firstIndex))
                || 
         (forceSplit && ExampleGroupStatsNumExamplesSeen(egs) > 1 &&
          !ExampleGroupStatsIsPure(egs) && 
          ExampleGroupStatsEntropyTotal(egs) > firstIndex)) {
          

      if(vfdt->messageLevel >= 1) {
         if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
            printf("Did discrete split on attrib %s index %f, second best %s index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n",
                     ExampleSpecGetAttributeName(vfdt->spec, firstAttrib), 
                         firstIndex, 
                     ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
                     secondIndex, hoeffdingBound,
                     ExampleGroupStatsNumExamplesSeen(egs),
                     ExampleGroupStatsEntropyTotal(egs));
         } else {
            printf("Did continuous split on attrib %s thresh %f index %f, second best %s thresh %f index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n",
		   ExampleSpecGetAttributeName(vfdt->spec, firstAttrib),
                     firstThresh, firstIndex, 
                     ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
                     secondThresh, secondIndex, hoeffdingBound,
                     ExampleGroupStatsNumExamplesSeen(egs),
                     ExampleGroupStatsEntropyTotal(egs));
         }
         printf("   allocation now: %ld\n", MGetTotalAllocation());
      }

      if(vfdt->messageLevel >= 2) {
         ExampleGroupStatsWrite(egs, stdout);
      }

      if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
         _DoDiscreteSplit(vfdt, currentNode, firstAttrib, gd->splitConfidence);
      } else {
         _DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh,
                 gd->splitConfidence);
      }

      if(vfdt->messageLevel >= 3) {
         printf("The new tree is: \n");
         DecisionTreePrint(vfdt->dtree, stdout);
      }

   } else if(hoeffdingBound < vfdt->tieConfidence &&
       (!vfdt->prePrune || ExampleGroupStatsEntropyTotal(egs) > firstIndex)) {
         /* if it looks like we have a tie and we aren't pre-pruned */
      if(vfdt->messageLevel >= 1) {
         printf("Called it a tie between attrib %d index %f, and %d index %f, hoeffding %f, based on %ld examples\n",
                  firstAttrib, firstIndex, secondAttrib, secondIndex,
                  hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs));
         printf("   allocation now: %ld\n", MGetTotalAllocation());
      }

      if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
         _DoDiscreteSplit(vfdt, currentNode, firstAttrib, gd->splitConfidence);
      } else {
         _DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh,
                               gd->splitConfidence);
      }
   } else {
      if(vfdt->messageLevel >= 3) {
         printf("Didn't split, top indexes were %f at %d and %f at %d hoeffding %f\n",
                     firstIndex, firstAttrib, secondIndex, secondAttrib,
                       hoeffdingBound);
         ExampleGroupStatsWrite(egs, stdout);
      }

      /* now disable any that aren't promising */
      for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
         if(allIndexes[i] > firstIndex + hoeffdingBound
                   && i != firstAttrib && i != secondAttrib) {
            ExampleGroupStatsIgnoreAttribute(egs, i);
            if(vfdt->messageLevel >= 2) {
               printf("started ignoring an attribute %d index %f, best %f\n based on %ld examples\n", i, allIndexes[i], firstIndex, ExampleGroupStatsNumExamplesSeen(egs));
            }
         } else if(!ExampleSpecIsAttributeDiscrete(vfdt->spec, i)) {
            /* start ignoring bad split points */
            if(vfdt->useGini) {
               ExampleGroupStatsIgnoreSplitsWorseThanGini(egs,
					    i, firstIndex + hoeffdingBound);
            } else {
               ExampleGroupStatsIgnoreSplitsWorseThanEntropy(egs,
					    i, firstIndex + hoeffdingBound);
            }
	 }
      }
   }
   MFreePtr(allIndexes);
}

static void _AddExampleToGrowingNode(VFDTPtr vfdt, 
                                     DecisionTreePtr currentNode,
                                     ExamplePtr e,
                                     int checkSplits) {
   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
   ExampleGroupStatsPtr egs = gd->egs;

  /* add the information */
   ExampleGroupStatsAddExample(egs, e);

   if(vfdt->cacheExamples && gd->exampleCache != 0) {
      VALAppend(gd->exampleCache, e);
   } else {
      ExampleFree(e);
   }

   gd->seenSinceLastProcess++;
   if((gd->seenSinceLastProcess >= vfdt->processChunkSize) && checkSplits) {
      gd->seenSinceLastProcess = 0;
      if(!ExampleGroupStatsIsPure(egs)) {
         _CheckSplit(vfdt, currentNode, 0);
      }
   }
}

static void _AddExampleToDeactivatedNode(VFDTPtr vfdt, 
                                     DecisionTreePtr currentNode,
                                     ExamplePtr e) {
   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);

   if(gd != 0) {
      /* if it's zero we've already spilt on every attribute on that path */
      gd->seenSinceDeactivated++;

      if(ExampleGetClass(e) != DecisionTreeGetClass(currentNode)) {
         gd->errorsSinceDeactivated++;
      }
   }

   ExampleFree(e);
}


static void _ProcessExampleBatch(VFDTPtr vfdt, ExamplePtr e) {
   int foundNode = 0;
   DecisionTreePtr currentNode;

   currentNode = vfdt->dtree;

   while(!foundNode && currentNode != 0) {
      if(DecisionTreeIsLeaf(currentNode)) {
         /* We are done growing in this direction, this example isn't needed */
         foundNode = 1;
      } else if(DecisionTreeIsNodeGrowing(currentNode)) {
         foundNode = 1;
      }

      if(!foundNode) {
         currentNode = DecisionTreeOneStepClassify(currentNode, e);
      }
   }
      
   if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
      _AddExampleToGrowingNode(vfdt, currentNode, e, 0);
   } else {
      _AddExampleToDeactivatedNode(vfdt, currentNode, e);
      if(vfdt->messageLevel >= 3) {
         printf("not learning from:");
         ExampleWrite(e, stdout);
      }
   }

   if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
      _DoReactivateScan(vfdt);
   }

   if(vfdt->maxAllocationBytes < MGetTotalAllocation() &&
                                           vfdt->cacheExamples) {
      _FlushExampleCachesCheckStop(vfdt);
   }

   if((vfdt->messageLevel >= 1) && (vfdt->examplesSeen % 1000 == 0)) {
      printf("processed %ld examples\n", vfdt->examplesSeen);
   }
}


void _forceSplits(VFDTPtr vfdt, DecisionTreePtr dt) {
   int i;

   if(DecisionTreeIsLeaf(dt) || DecisionTreeIsNodeGrowing(dt)) {
      _CheckSplit(vfdt, dt, 1);
   }
   
   if(!DecisionTreeIsLeaf(dt) && !DecisionTreeIsNodeGrowing(dt)) {
      for(i = 0 ; i < DecisionTreeGetChildCount(dt) ; i++) {
         _forceSplits(vfdt, DecisionTreeGetChild(dt, i));
      }
   }
}


static void _PrintSpaces(FILE *out, int num) {
   int i;

   for(i = 0 ; i < num ; i++) {
      fprintf(out, " ");
   }
}

static void _PrintHelp(DecisionTreePtr dt, FILE *out, int indent) {
   int i;
   VFDTGrowingDataPtr gd;

   _PrintSpaces(out, indent);
   if(dt->nodeType == dtnLeaf || dt->nodeType == dtnGrowing) {
     //fprintf(out, "(leaf: %d)\n", dt->myclass);
      fprintf(out, "(leaf: %s -", 
             ExampleSpecGetClassValueName(dt->spec, dt->myclass));
      gd = DecisionTreeGetGrowingData(dt);
      for(i = 0 ; i < ExampleSpecGetNumClasses(dt->spec) ; i++) {
         fprintf(out, " %ld", gd->egs->classTotals[i]);
      }
      fprintf(out, ")\n");
   } else if(dt->nodeType == dtnDiscrete) {
      fprintf(out, "(split on %s:\n",
             ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));
      for(i = 0 ; i < VALLength(dt->children) ; i++) {
         _PrintSpaces(out, indent + 1);
         fprintf(out, "%s\n",
          ExampleSpecGetAttributeValueName(dt->spec, dt->splitAttribute, i));
         _PrintHelp(VALIndex(dt->children, i), out, indent + 2);
      }
      _PrintSpaces(out, indent);
      fprintf(out, ")\n");
   } else if(dt->nodeType == dtnContinuous) {
      fprintf(out, "(split on %s:\n",
             ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));

      /* left child */
      _PrintSpaces(out, indent + 1);
      fprintf(out, "< %f\n", dt->splitThreshold);
      _PrintHelp(VALIndex(dt->children, 0), out, indent + 2);

      /* right child */
      _PrintSpaces(out, indent + 1);
      fprintf(out, ">= %f\n", dt->splitThreshold);
      _PrintHelp(VALIndex(dt->children, 1), out, indent + 2);


      _PrintSpaces(out, indent);
      fprintf(out, ")\n");
   } else if(dt->nodeType == dtnGrowing) {
      fprintf(out, "(growing)\n");
   }
}

static void _DecisionTreeWrite(DecisionTreePtr dt) {
   _PrintHelp(dt, stdout, 0);
}

void _ProcessExamplesBatch(VFDTPtr vfdt, FILE *input) {
   ExamplePtr e;

   e = ExampleRead(input, vfdt->spec);

   while(e != 0 && vfdt->numGrowing > 0) {
      _ProcessExampleBatch(vfdt, e);
      vfdt->examplesSeen++;

      e = ExampleRead(input, vfdt->spec);
   }

   _forceSplits(vfdt, vfdt->dtree);

   //_DecisionTreeWrite(vfdt->dtree);

   if(vfdt->messageLevel >= 1) {
      printf("finished with all the examples, there are %ld growing nodes\n",
              vfdt->numGrowing);
   }
}

static void _ProcessExample(VFDTPtr vfdt, ExamplePtr e) {
   int foundNode = 0;
   DecisionTreePtr currentNode;

   currentNode = vfdt->dtree;

   while(!foundNode && currentNode != 0) {
      if(DecisionTreeIsLeaf(currentNode)) {
         /* We are done growing in this direction, this example isn't needed */
         foundNode = 1;
      } else if(DecisionTreeIsNodeGrowing(currentNode)) {
         foundNode = 1;
      }

      if(!foundNode) {
         currentNode = DecisionTreeOneStepClassify(currentNode, e);
      }
   }
      
   if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
      _AddExampleToGrowingNode(vfdt, currentNode, e, 1);
   } else {
      _AddExampleToDeactivatedNode(vfdt, currentNode, e);
      if(vfdt->messageLevel >= 3) {
         printf("not learning from:");
         ExampleWrite(e, stdout);
      }
   }

   if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
      _DoReactivateScan(vfdt);
   }

   if(vfdt->maxAllocationBytes < MGetTotalAllocation() &&
                                           vfdt->cacheExamples) {
      _FlushExampleCachesCheckStop(vfdt);
   }
}


void _ProcessExamples(VFDTPtr vfdt, FILE *input) {
   ExamplePtr e;

   e = ExampleRead(input, vfdt->spec);

   while(e != 0 && vfdt->numGrowing > 0) {
      _ProcessExample(vfdt, e);
      vfdt->examplesSeen++;
      if((vfdt->messageLevel >= 1) && (vfdt->examplesSeen % 1000 == 0)) {
         printf("processed %ld examples\n", vfdt->examplesSeen);
      }

      e = ExampleRead(input, vfdt->spec);
   }

   if(vfdt->messageLevel >= 1) {
      printf("finished with all the examples, there are %ld growing nodes\n",
              vfdt->numGrowing);
   }
}

//void VFDTBootstrapC45(VFDTPtr vfdt, char *fileStem, int overprune, int runC45) {
//   char command[500], fileName[100];
//   FILE *tree;

//   if(runC45) {
//      sprintf(command, "c4.5 -f %s -u >> /dev/null", fileStem);
//      if(vfdt->messageLevel >= 1) {
//         printf("%s\n", command);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -