📄 cvfdt.c
字号:
/* NOTE: Leafify not working for some reasonstatic float _CalculateLeafifyIndex(DecisionTreePtr node, long totalSeen) { float percent; VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(node); percent = (float)(gd->seenExamplesCount / totalSeen); if(ExampleGroupStatsNumExamplesSeen(gd->egs) > 0) { return percent * (1 - (ExampleGroupStatsGetMostCommonClassCount(gd->egs) / (float)ExampleGroupStatsNumExamplesSeen(gd->egs))); } else { return percent; }}static void _LeafifyLeastImportantNodes(VFDTPtr vfdt, long number) { float *indexes; DecisionTreePtr *nodes; int i, j; float thisIndex, tmpIndex; DecisionTreePtr cNode, tmpNode; VoidAListPtr growingNodes = VALNew(); if(gMessageLevel >= 1) { printf("leafifying %ld nodes\n", number); } DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes); indexes = MNewPtr(sizeof(float) * number); nodes = MNewPtr(sizeof(DecisionTreePtr) * number); for(i = 0 ; i < number ; i++) { nodes[i] = VALIndex(growingNodes, i); indexes[i] = _CalculateLeafifyIndex(nodes[i], vfdt->examplesSeen); } for(i = number ; i < VALLength(growingNodes) ; i++) { thisIndex = _CalculateLeafifyIndex(VALIndex(growingNodes, i), vfdt->examplesSeen); cNode = VALIndex(growingNodes, i); for(j = 0 ; j < number ; j++) { if(thisIndex < indexes[j]) { tmpIndex = indexes[j]; indexes[j] = thisIndex; thisIndex = tmpIndex; tmpNode = nodes[j]; nodes[j] = cNode; cNode = tmpNode; } } } for(i = 0 ; i < number ; i++) { _DoMakeLeaf(vfdt, nodes[i]); if(gMessageLevel >= 2) { printf("leafifying a node with index %f\n", indexes[i]); } } MFreePtr(indexes); MFreePtr(nodes); VALFree(growingNodes);}*/static void _DoContinuousSplit( CVFDTPtr root, CVFDTPtr currentNode, int attNum, float threshold) { int j; AttributeTrackerPtr newAT; int childCount; VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr) DecisionTreeGetGrowingData(currentNode->dtreeNode); AttributeTrackerPtr at = ExampleGroupStatsGetAttributeTracker(parentGD->egs); VFDTGrowingDataPtr gd; // long allocation; int parentClass; float parentErrorRate; // CVFDTPtr parent = currentNode; // Do split at decision tree level DecisionTreeSplitOnContinuousAttribute(currentNode->dtreeNode, attNum, threshold); childCount = DecisionTreeGetChildCount(currentNode->dtreeNode); // Wrap each new child as a cvfdt node for ( j = 0; j < childCount; j++ ) { CVFDTAddChild( currentNode, DecisionTreeGetChild( currentNode->dtreeNode, j )); } // update numGrowing root->vfdt->numGrowing--; root->vfdt->numGrowing += childCount; parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs); parentErrorRate = (1.0 - (ExampleGroupStatsGetMostCommonClassCount(parentGD->egs) / (float)ExampleGroupStatsNumExamplesSeen(parentGD->egs))); // children for ( j = 0; j < 2; j++ ) { newAT = AttributeTrackerClone(at); gd = MNewPtr(sizeof(VFDTGrowingData)); gd->egs = ExampleGroupStatsNew(root->vfdt->spec, newAT); gd->parentClass = parentClass; //gd->splitConfidence = parentSplitConfidence; gd->exampleCache = 0; gd->parentErrorRate = parentErrorRate; if ( j == 0 ) gd->seenExamplesCount = ExampleGroupStatsGetPercentBelowThreshold( parentGD->egs, attNum, threshold ) * parentGD->seenExamplesCount; else gd->seenExamplesCount = (1 - ExampleGroupStatsGetPercentBelowThreshold( parentGD->egs, attNum, threshold )) * parentGD->seenExamplesCount; gd->seenSinceLastProcess = 0; DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode->dtreeNode, j), gd); DecisionTreeSetClass(DecisionTreeGetChild(currentNode->dtreeNode, j), parentClass); }}static void _DoDiscreteSplit( CVFDTPtr root, CVFDTPtr currentNode, int attNum) { int i, j; AttributeTrackerPtr newAT; int childCount; VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)(DecisionTreeGetGrowingData(currentNode->dtreeNode)); AttributeTrackerPtr at = ExampleGroupStatsGetAttributeTracker(parentGD->egs); VFDTGrowingDataPtr gd; //long allocation; int parentClass; DecisionTreeSplitOnDiscreteAttribute(currentNode->dtreeNode, attNum); childCount = DecisionTreeGetChildCount(currentNode->dtreeNode); for ( j = 0; j < childCount; j++ ) { CVFDTAddChild( currentNode, DecisionTreeGetChild( currentNode->dtreeNode, j )); } root->vfdt->numGrowing--; root->vfdt->numGrowing += childCount; parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs); for(i = 0 ; i < childCount ; i++) { newAT = AttributeTrackerClone(at); AttributeTrackerMarkInactive(newAT, attNum); gd = MNewPtr(sizeof(VFDTGrowingData)); gd->egs = ExampleGroupStatsNew(root->vfdt->spec, newAT); if ( parentGD->seenExamplesCount == 0 ) gd->seenExamplesCount = 0; else gd->seenExamplesCount = (long)(ExampleGroupStatsGetValuePercent(parentGD->egs, attNum, i) * parentGD->seenExamplesCount); gd->seenSinceLastProcess = 0; DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode->dtreeNode, i), gd); DecisionTreeSetClass(DecisionTreeGetChild(currentNode->dtreeNode, i), parentClass); } //ExampleGroupStatsFree(((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode))->egs); //MFreePtr(DecisionTreeGetGrowingData(currentNode)); /* allocation = MGetTotalAllocation(); if(root->vfdt->maxAllocationBytes < allocation) { // HERE may want to control how many to leafify with a flag // _LeafifyLeastImportantNodes(root->vfdt, (long)(root->vfdt->numGrowing * .2)); }*/}static int _ValidateSplitEntropy(CVFDTPtr root, CVFDTPtr currentNode ) { float hoeffdingBound; int i; float range; float *allIndexes; float firstIndex, secondIndex; int firstAttrib = -1, secondAttrib = -1; float firstThresh = 10000, secondThresh = 10000; float tmpIndexOne, tmpIndexTwo, tmpThreshOne, tmpThreshTwo; int numChildren = 0; ExampleGroupStatsPtr egs = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode->dtreeNode))->egs; int bestSplitAtt = currentNode->dtreeNode->splitAttribute; // Select best two attributes firstIndex = secondIndex = 10000; allIndexes = MNewPtr(sizeof(float) * ExampleSpecGetNumAttributes(root->vfdt->spec)); for(i = 0 ; i < ExampleSpecGetNumAttributes(root->vfdt->spec) ; i++) { // Discrete if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, i)) { tmpIndexTwo = 10000; tmpIndexOne = ExampleGroupStatsEntropyDiscreteAttributeSplit(egs, i); // Continuous } else { ExampleGroupStatsEntropyContinuousAttributeSplit(egs, i, &tmpIndexOne, &tmpThreshOne, &tmpIndexTwo, &tmpThreshTwo); } allIndexes[i] = tmpIndexOne; if(tmpIndexOne < firstIndex) { secondIndex = firstIndex; secondThresh = firstThresh; secondAttrib = firstAttrib; firstIndex = tmpIndexOne; firstThresh = tmpThreshOne; firstAttrib = i; if(tmpIndexTwo < secondIndex) { secondIndex = tmpIndexTwo; secondThresh = tmpThreshTwo; secondAttrib = i; } } else if(tmpIndexOne < secondIndex) { secondIndex = tmpIndexOne; secondThresh = tmpThreshOne; secondAttrib = i; } } if(root->vfdt->messageLevel >= 3) { printf("Validating split, best two indices are:\n"); printf(" Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib); printf(" Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib); } range = lg(ExampleSpecGetNumClasses(root->vfdt->spec)); /* HERE, this will crash if splitConfidence is 0 */ hoeffdingBound = (float)(sqrt(((range * range) * log(1/root->vfdt->splitConfidence)) / (2.0 * ExampleGroupStatsNumExamplesSeen(egs)))); if ( currentNode->children != 0 ) numChildren = VALLength( currentNode->children ); if (( numChildren != 0 ) && ( hoeffdingBound != 0 )) { /* if ( secondIndex - firstIndex <= hoeffdingBound && ExampleGroupStatsEntropyTotal(egs) > firstIndex) { if ( firstAttrib != currentNode->dtreeNode->splitAttribute && (( secondIndex - firstIndex ) >= 0.1 * hoeffdingBound )) { if ( gMessageLevel >= 2 ) { printf("Split not valid - difference %f, bound %f at node %ld\n", (secondIndex - firstIndex), hoeffdingBound, currentNode->uid ); } bestSplitAtt = firstAttrib; } }*/ if ( secondIndex - firstIndex > hoeffdingBound && ExampleGroupStatsEntropyTotal(egs) > firstIndex && firstAttrib != currentNode->dtreeNode->splitAttribute ) { bestSplitAtt = firstAttrib; } else if ( hoeffdingBound < root->vfdt->tieConfidence && ExampleGroupStatsEntropyTotal(egs) > firstIndex) { if ( firstAttrib != currentNode->dtreeNode->splitAttribute && (( secondIndex - firstIndex ) >= ( 0.5 * gTieConfidence ))) { if ( gMessageLevel >= 2 ) { printf("Node %ld was tied, difference %f\n", currentNode->uid, secondIndex - firstIndex); } bestSplitAtt = firstAttrib; } } } MFreePtr(allIndexes); return bestSplitAtt;}static void _CheckSplitEntropy(CVFDTPtr root, CVFDTPtr currentNode ) { float hoeffdingBound; int i; float range; float *allIndexes; float firstIndex, secondIndex; int firstAttrib = -1, secondAttrib = -1; float firstThresh = -1, secondThresh = -1; float tmpIndexOne, tmpIndexTwo, tmpThreshOne, tmpThreshTwo;// int numChildren = 0; ExampleGroupStatsPtr egs = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode->dtreeNode))->egs; // Select best two attributes firstIndex = secondIndex = 10000; allIndexes = MNewPtr(sizeof(float) * ExampleSpecGetNumAttributes(root->vfdt->spec)); for(i = 0 ; i < ExampleSpecGetNumAttributes(root->vfdt->spec) ; i++) { // Discrete if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, i)) { tmpIndexTwo = 10000; tmpIndexOne = ExampleGroupStatsEntropyDiscreteAttributeSplit(egs, i); // Continuous } else { ExampleGroupStatsEntropyContinuousAttributeSplit(egs, i, &tmpIndexOne, &tmpThreshOne, &tmpIndexTwo, &tmpThreshTwo); } allIndexes[i] = tmpIndexOne; if(tmpIndexOne < firstIndex) { secondIndex = firstIndex; secondThresh = firstThresh; secondAttrib = firstAttrib; firstIndex = tmpIndexOne; firstThresh = tmpThreshOne; firstAttrib = i; if(tmpIndexTwo < secondIndex) { secondIndex = tmpIndexTwo; secondThresh = tmpThreshTwo; secondAttrib = i; } } else if(tmpIndexOne < secondIndex) { secondIndex = tmpIndexOne; secondThresh = tmpThreshOne; secondAttrib = i; } } if(root->vfdt->messageLevel >= 3) { printf("Considering split, best two indices are:\n"); printf(" Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib); printf(" Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib); } range = (float)(lg(ExampleSpecGetNumClasses(root->vfdt->spec))); /* HERE, this will crash if splitConfidence is 0 */ hoeffdingBound = (float)(sqrt(((range * range) * log(1/root->vfdt->splitConfidence)) / (2.0 * ExampleGroupStatsNumExamplesSeen(egs)))); if(secondIndex - firstIndex > hoeffdingBound && ExampleGroupStatsEntropyTotal(egs) > firstIndex) { if(gMessageLevel >= 1) { if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, firstAttrib)) { printf("Did discrete split on attrib %s index %f, second best %s index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n", ExampleSpecGetAttributeName(root->vfdt->spec, firstAttrib), firstIndex, ExampleSpecGetAttributeName(root->vfdt->spec, secondAttrib), secondIndex, hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs), ExampleGroupStatsEntropyTotal(egs)); } else { printf("Did continuous split on attrib %s thresh %f index %f, second best %s thresh %f index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n", ExampleSpecGetAttributeName(root->vfdt->spec, firstAttrib), firstThresh, firstIndex, ExampleSpecGetAttributeName(root->vfdt->spec, secondAttrib), secondThresh, secondIndex, hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs), ExampleGroupStatsEntropyTotal(egs)); } printf(" allocation now: %ld\n", MGetTotalAllocation()); } if(root->vfdt->messageLevel >= 2) { ExampleGroupStatsWrite(egs, stdout); } if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, firstAttrib)) { _DoDiscreteSplit(root, currentNode, firstAttrib); } else { _DoContinuousSplit(root, currentNode, firstAttrib, firstThresh); } if(gMessageLevel >= 3) { printf("The new tree is: \n"); CVFDTPrint(root, stdout); } } else if(hoeffdingBound < root->vfdt->tieConfidence && ExampleGroupStatsEntropyTotal(egs) > firstIndex) { if(gMessageLevel >= 1) { printf("Called it a tie at node %ld between attrib %d entropy %f, and %d entropy %f, hoeffding %f, based on %ld examples\n", currentNode->uid, firstAttrib, firstIndex, secondAttrib, secondIndex, hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs)); printf(" allocation now: %ld\n", MGetTotalAllocation()); } if(ExampleSpecIsAttributeDiscrete(root->vfdt->spec, firstAttrib)) { _DoDiscreteSplit(root, currentNode, firstAttrib); } else { _DoContinuousSplit(root, currentNode, firstAttrib, firstThresh); } } else { if(gMessageLevel >= 3) { printf("Didn't split, top entropies were %f at %d and %f at %d hoeffding %f\n", firstIndex, firstAttrib, secondIndex, secondAttrib, hoeffdingBound); ExampleGroupStatsWrite(egs, stdout); } /* now disable any that aren't promising */ for(i = 0 ; i < ExampleSpecGetNumAttributes(root->vfdt->spec) ; i++) { if(allIndexes[i] > firstIndex + hoeffdingBound
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -