⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cvfdt.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
/* NOTE: Leafify not working for some reasonstatic float _CalculateLeafifyIndex(DecisionTreePtr node, long totalSeen) {   float percent;   VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(node);   percent = (float)(gd->seenExamplesCount / totalSeen);   if(ExampleGroupStatsNumExamplesSeen(gd->egs) > 0) {      return percent *          (1 - (ExampleGroupStatsGetMostCommonClassCount(gd->egs) /                (float)ExampleGroupStatsNumExamplesSeen(gd->egs)));   } else {      return percent;   }}static void _LeafifyLeastImportantNodes(VFDTPtr vfdt, long number) {   float *indexes;   DecisionTreePtr *nodes;   int i, j;   float thisIndex, tmpIndex;   DecisionTreePtr cNode, tmpNode;   VoidAListPtr growingNodes = VALNew();   if(gMessageLevel >= 1) {      printf("leafifying %ld nodes\n", number);   }   DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);   indexes = MNewPtr(sizeof(float) * number);   nodes   = MNewPtr(sizeof(DecisionTreePtr) * number);   for(i = 0 ; i < number ; i++) {     nodes[i] = VALIndex(growingNodes, i);     indexes[i] = _CalculateLeafifyIndex(nodes[i], vfdt->examplesSeen);   }   for(i = number ; i < VALLength(growingNodes) ; i++) {      thisIndex = _CalculateLeafifyIndex(VALIndex(growingNodes, i),                                                      vfdt->examplesSeen);      cNode = VALIndex(growingNodes, i);      for(j = 0 ; j < number ; j++) {         if(thisIndex < indexes[j]) {            tmpIndex = indexes[j];            indexes[j] = thisIndex;            thisIndex = tmpIndex;            tmpNode = nodes[j];            nodes[j] = cNode;            cNode = tmpNode;         }      }   }   for(i = 0 ; i < number ; i++) {      _DoMakeLeaf(vfdt, nodes[i]);      if(gMessageLevel >= 2) {         printf("leafifying a node with index %f\n", indexes[i]);      }   }   MFreePtr(indexes);   MFreePtr(nodes);   VALFree(growingNodes);}*/static void _DoContinuousSplit( CVFDTPtr root, CVFDTPtr currentNode, 				int attNum, float threshold) {  int j;  AttributeTrackerPtr newAT;  int childCount;  VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)    DecisionTreeGetGrowingData(currentNode->dtreeNode);  AttributeTrackerPtr at =     ExampleGroupStatsGetAttributeTracker(parentGD->egs);  VFDTGrowingDataPtr gd;   //  long allocation;  int parentClass;  float parentErrorRate;  //  CVFDTPtr parent = currentNode;  // Do split at decision tree level  DecisionTreeSplitOnContinuousAttribute(currentNode->dtreeNode, attNum, threshold);  childCount = DecisionTreeGetChildCount(currentNode->dtreeNode);    // Wrap each new child as a cvfdt node  for ( j = 0; j < childCount; j++ ) {    CVFDTAddChild( currentNode, DecisionTreeGetChild( currentNode->dtreeNode, j ));  }  // update numGrowing  root->vfdt->numGrowing--;  root->vfdt->numGrowing += childCount;  parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs);  parentErrorRate =    (1.0 - (ExampleGroupStatsGetMostCommonClassCount(parentGD->egs) /	    (float)ExampleGroupStatsNumExamplesSeen(parentGD->egs)));  // children  for ( j = 0; j < 2; j++ ) {    newAT = AttributeTrackerClone(at);    gd = MNewPtr(sizeof(VFDTGrowingData));    gd->egs = ExampleGroupStatsNew(root->vfdt->spec, newAT);    gd->parentClass = parentClass;    //gd->splitConfidence = parentSplitConfidence;    gd->exampleCache = 0;    gd->parentErrorRate = parentErrorRate;    if ( j == 0 )      gd->seenExamplesCount = 	ExampleGroupStatsGetPercentBelowThreshold( parentGD->egs,						   attNum, threshold )	* parentGD->seenExamplesCount;    else      gd->seenExamplesCount = 	(1 - ExampleGroupStatsGetPercentBelowThreshold( parentGD->egs,							attNum, threshold ))	* parentGD->seenExamplesCount;    gd->seenSinceLastProcess = 0;    DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode->dtreeNode, j), gd);    DecisionTreeSetClass(DecisionTreeGetChild(currentNode->dtreeNode, j),			 parentClass);  }}static void _DoDiscreteSplit( CVFDTPtr root,                              CVFDTPtr currentNode, int attNum) {  int i, j;  AttributeTrackerPtr newAT;  int childCount;    VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)(DecisionTreeGetGrowingData(currentNode->dtreeNode));  AttributeTrackerPtr at =    ExampleGroupStatsGetAttributeTracker(parentGD->egs);  VFDTGrowingDataPtr gd;  //long allocation;  int parentClass;    DecisionTreeSplitOnDiscreteAttribute(currentNode->dtreeNode, attNum);  childCount = DecisionTreeGetChildCount(currentNode->dtreeNode);  for ( j = 0; j < childCount; j++ ) {    CVFDTAddChild( currentNode, DecisionTreeGetChild( currentNode->dtreeNode, j ));  }    root->vfdt->numGrowing--;  root->vfdt->numGrowing += childCount;    parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs);    for(i = 0 ; i < childCount ; i++) {    newAT = AttributeTrackerClone(at);    AttributeTrackerMarkInactive(newAT, attNum);        gd = MNewPtr(sizeof(VFDTGrowingData));    gd->egs = ExampleGroupStatsNew(root->vfdt->spec, newAT);        if ( parentGD->seenExamplesCount == 0 )      gd->seenExamplesCount = 0;    else      gd->seenExamplesCount 	= (long)(ExampleGroupStatsGetValuePercent(parentGD->egs,						  attNum, i) * parentGD->seenExamplesCount);        gd->seenSinceLastProcess = 0;        DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode->dtreeNode, i), gd);    DecisionTreeSetClass(DecisionTreeGetChild(currentNode->dtreeNode, i),			 parentClass);  }  //ExampleGroupStatsFree(((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode))->egs);  //MFreePtr(DecisionTreeGetGrowingData(currentNode));    /* allocation = MGetTotalAllocation();  if(root->vfdt->maxAllocationBytes < allocation) {    // HERE may want to control how many to leafify with a flag    // _LeafifyLeastImportantNodes(root->vfdt, (long)(root->vfdt->numGrowing * .2));      }*/}static int _ValidateSplitEntropy(CVFDTPtr root,                                  CVFDTPtr currentNode ) {   float hoeffdingBound;   int i;   float range;   float *allIndexes;   float firstIndex, secondIndex;   int firstAttrib = -1, secondAttrib = -1;   float firstThresh = 10000, secondThresh = 10000;   float tmpIndexOne, tmpIndexTwo, tmpThreshOne, tmpThreshTwo;   int numChildren = 0;   ExampleGroupStatsPtr egs      = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode->dtreeNode))->egs;   int bestSplitAtt = currentNode->dtreeNode->splitAttribute;   // Select best two attributes   firstIndex = secondIndex = 10000;   allIndexes      = MNewPtr(sizeof(float) * ExampleSpecGetNumAttributes(root->vfdt->spec));   for(i = 0 ; i < ExampleSpecGetNumAttributes(root->vfdt->spec) ; i++) {     // Discrete     if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, i)) {       tmpIndexTwo = 10000;       tmpIndexOne = ExampleGroupStatsEntropyDiscreteAttributeSplit(egs, i);     // Continuous     } else {        ExampleGroupStatsEntropyContinuousAttributeSplit(egs, i, &tmpIndexOne, &tmpThreshOne,							&tmpIndexTwo, &tmpThreshTwo);     }     allIndexes[i] = tmpIndexOne;     if(tmpIndexOne < firstIndex) {       secondIndex = firstIndex;       secondThresh = firstThresh;       secondAttrib = firstAttrib;       firstIndex = tmpIndexOne;       firstThresh = tmpThreshOne;       firstAttrib = i;       if(tmpIndexTwo < secondIndex) {	 secondIndex = tmpIndexTwo;	 secondThresh = tmpThreshTwo;	 secondAttrib = i;       }     } else if(tmpIndexOne < secondIndex) {       secondIndex = tmpIndexOne;       secondThresh = tmpThreshOne;       secondAttrib = i;     }   }      if(root->vfdt->messageLevel >= 3) {     printf("Validating split, best two indices are:\n");     printf("   Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib);     printf("   Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib);   }      range = lg(ExampleSpecGetNumClasses(root->vfdt->spec));   /* HERE, this will crash if splitConfidence is 0 */   hoeffdingBound = (float)(sqrt(((range * range) * log(1/root->vfdt->splitConfidence)) / 				 (2.0 * ExampleGroupStatsNumExamplesSeen(egs))));   if ( currentNode->children != 0 )      numChildren = VALLength( currentNode->children );   if (( numChildren != 0 ) && ( hoeffdingBound != 0 )) {     /*       if ( secondIndex - firstIndex <= hoeffdingBound &&       ExampleGroupStatsEntropyTotal(egs) > firstIndex) {       if ( firstAttrib != currentNode->dtreeNode->splitAttribute        && (( secondIndex - firstIndex ) >= 0.1 * hoeffdingBound )) {       if ( gMessageLevel >= 2 ) {       printf("Split not valid - difference %f, bound %f at node %ld\n",        (secondIndex - firstIndex), hoeffdingBound, currentNode->uid );       }       bestSplitAtt = firstAttrib;       }       }*/     if ( secondIndex - firstIndex > hoeffdingBound 	  && ExampleGroupStatsEntropyTotal(egs) > firstIndex 	  && firstAttrib != currentNode->dtreeNode->splitAttribute ) {       bestSplitAtt = firstAttrib;            }     else if ( hoeffdingBound < root->vfdt->tieConfidence &&	       ExampleGroupStatsEntropyTotal(egs) > firstIndex) {       if ( firstAttrib != currentNode->dtreeNode->splitAttribute 	    && (( secondIndex - firstIndex ) >= ( 0.5 * gTieConfidence ))) {	 if ( gMessageLevel >= 2 ) {	   printf("Node %ld was tied, difference %f\n", currentNode->uid,		  secondIndex - firstIndex);	 }	 bestSplitAtt = firstAttrib;       }     }   }   MFreePtr(allIndexes);   return bestSplitAtt;}static void _CheckSplitEntropy(CVFDTPtr root, 			       CVFDTPtr currentNode ) {   float hoeffdingBound;   int i;   float range;   float *allIndexes;   float firstIndex, secondIndex;   int firstAttrib = -1, secondAttrib = -1;   float firstThresh = -1, secondThresh = -1;   float tmpIndexOne, tmpIndexTwo, tmpThreshOne, tmpThreshTwo;//   int numChildren = 0;   ExampleGroupStatsPtr egs      = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode->dtreeNode))->egs;   // Select best two attributes   firstIndex = secondIndex = 10000;   allIndexes      = MNewPtr(sizeof(float) * ExampleSpecGetNumAttributes(root->vfdt->spec));   for(i = 0 ; i < ExampleSpecGetNumAttributes(root->vfdt->spec) ; i++) {     // Discrete     if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, i)) {       tmpIndexTwo = 10000;       tmpIndexOne = ExampleGroupStatsEntropyDiscreteAttributeSplit(egs, i);     // Continuous     } else {        ExampleGroupStatsEntropyContinuousAttributeSplit(egs, i, &tmpIndexOne, &tmpThreshOne,							&tmpIndexTwo, &tmpThreshTwo);     }     allIndexes[i] = tmpIndexOne;     if(tmpIndexOne < firstIndex) {       secondIndex = firstIndex;       secondThresh = firstThresh;       secondAttrib = firstAttrib;       firstIndex = tmpIndexOne;       firstThresh = tmpThreshOne;       firstAttrib = i;       if(tmpIndexTwo < secondIndex) {	 secondIndex = tmpIndexTwo;	 secondThresh = tmpThreshTwo;	 secondAttrib = i;       }     } else if(tmpIndexOne < secondIndex) {       secondIndex = tmpIndexOne;       secondThresh = tmpThreshOne;       secondAttrib = i;     }   }      if(root->vfdt->messageLevel >= 3) {     printf("Considering split, best two indices are:\n");     printf("   Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib);     printf("   Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib);   }      range = (float)(lg(ExampleSpecGetNumClasses(root->vfdt->spec)));   /* HERE, this will crash if splitConfidence is 0 */   hoeffdingBound = (float)(sqrt(((range * range) * log(1/root->vfdt->splitConfidence)) / 				 (2.0 * ExampleGroupStatsNumExamplesSeen(egs))));      if(secondIndex - firstIndex > hoeffdingBound &&      ExampleGroupStatsEntropyTotal(egs) > firstIndex) {     if(gMessageLevel >= 1) {       if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, firstAttrib)) {	 printf("Did discrete split on attrib %s index %f, second best %s index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n",		ExampleSpecGetAttributeName(root->vfdt->spec, firstAttrib), 		firstIndex, 		ExampleSpecGetAttributeName(root->vfdt->spec, secondAttrib),		secondIndex, hoeffdingBound,		ExampleGroupStatsNumExamplesSeen(egs),		ExampleGroupStatsEntropyTotal(egs));       } else {	 printf("Did continuous split on attrib %s thresh %f index %f, second best %s thresh %f index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n",		ExampleSpecGetAttributeName(root->vfdt->spec, firstAttrib),		firstThresh, firstIndex, 		ExampleSpecGetAttributeName(root->vfdt->spec, secondAttrib),		secondThresh, secondIndex, hoeffdingBound,                     ExampleGroupStatsNumExamplesSeen(egs),		ExampleGroupStatsEntropyTotal(egs));       }       printf("   allocation now: %ld\n", MGetTotalAllocation());     }     if(root->vfdt->messageLevel >= 2) {       ExampleGroupStatsWrite(egs, stdout);     }     if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, firstAttrib)) {       _DoDiscreteSplit(root, currentNode, firstAttrib);     } else {       _DoContinuousSplit(root, currentNode, firstAttrib, firstThresh);     }          if(gMessageLevel >= 3) {       printf("The new tree is: \n");       CVFDTPrint(root, stdout);     }   } else if(hoeffdingBound < root->vfdt->tieConfidence &&	     ExampleGroupStatsEntropyTotal(egs) > firstIndex) {     if(gMessageLevel >= 1) {       printf("Called it a tie at node %ld between attrib %d entropy %f, and %d entropy %f, hoeffding %f, based on %ld examples\n",	      currentNode->uid, firstAttrib, firstIndex, 	      secondAttrib, secondIndex, hoeffdingBound,	      ExampleGroupStatsNumExamplesSeen(egs));       printf("   allocation now: %ld\n", MGetTotalAllocation());     }     if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, firstAttrib)) {       _DoDiscreteSplit(root, currentNode, firstAttrib);     } else {       _DoContinuousSplit(root, currentNode, firstAttrib, firstThresh);     }   } else {     if(gMessageLevel >= 3) {       printf("Didn't split, top entropies were %f at %d and %f at %d hoeffding %f\n",	      firstIndex, firstAttrib, secondIndex, secondAttrib, hoeffdingBound);       ExampleGroupStatsWrite(egs, stdout);     }     /* now disable any that aren't promising */     for(i = 0 ; i < ExampleSpecGetNumAttributes(root->vfdt->spec) ; i++) {       if(allIndexes[i] > firstIndex + hoeffdingBound

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -