/* vfdt-engine.c */
flushed = 0;
for(i = 0 ; i < VALLength(growingNodes) ; i++) {
gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(growingNodes, i));
if(gd->exampleCache != 0) {
flushed++;
_GrowingDataFreeExampleCache(gd);
}
}
vfdt->cacheExamples = 0;
if(vfdt->messageLevel >= 1) {
printf("Stopped caching, there were %ld active...\n", flushed);
}
}
VALFree(growingNodes);
}
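/* _DoReactivateScan: periodically look for leaves that are worth growing
   again.  All current leaves are gathered, pre-pruned ones are dropped,
   and roughly 5% of the rest (the scan is skipped if that would be fewer
   than 5) are kept as the highest-ranked candidates according to
   _CalculateReactivateIndex.  Any candidate whose index beats
   vfdt->highestDeactivatedIndex is reactivated into a growing node, and
   the usual memory-pressure response runs at the end if allocation has
   grown past vfdt->maxAllocationBytes. */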
static void _DoReactivateScan(VFDTPtr vfdt) {
long number, allocation;
float *indexes;
DecisionTreePtr *nodes;
int i, j;
float thisIndex, tmpIndex;
DecisionTreePtr cNode, tmpNode;
VoidAListPtr leaves = VALNew();
VFDTGrowingDataPtr gd;
DecisionTreeGatherLeaves(vfdt->dtree, leaves);
/* ugh, now I remove all pre-pruned nodes, this is pretty bad
because it is expensive to remove nodes from my list */
for(i = 0 ; i < VALLength(leaves) ; i++) {
tmpNode = (DecisionTreePtr)VALIndex(leaves, i);
gd = DecisionTreeGetGrowingData(tmpNode);
if(gd->prePruned) {
VALRemove(leaves, i);
/* step back so the element shifted into slot i isn't skipped */
i--;
}
}
number = VALLength(leaves) * 0.05;
if(number < 5) {
VALFree(leaves);
return;
}
if(vfdt->messageLevel >= 1) {
printf("looking into reactivating %ld nodes\n", number);
}
indexes = MNewPtr(sizeof(float) * number);
nodes = MNewPtr(sizeof(DecisionTreePtr) * number);
/* take the first batch of leaf nodes to start */
for(i = 0 ; i < number ; i++) {
nodes[i] = VALIndex(leaves, i);
indexes[i] = _CalculateReactivateIndex(nodes[i], vfdt->examplesSeen);
}
/* EFF I should use something smarter here (e.g. a heap) if
I'm trying to consider a lot of nodes */
/* now look at the rest of the leaves, if they are better use them */
for(i = number ; i < VALLength(leaves) ; i++) {
thisIndex = _CalculateReactivateIndex(VALIndex(leaves, i),
vfdt->examplesSeen);
cNode = VALIndex(leaves, i);
for(j = 0 ; j < number ; j++) {
if(thisIndex > indexes[j]) {
tmpIndex = indexes[j];
indexes[j] = thisIndex;
thisIndex = tmpIndex;
tmpNode = nodes[j];
nodes[j] = cNode;
cNode = tmpNode;
}
}
}
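/* the loop above keeps the 'number' largest indexes seen so far: a better
   candidate bubbles through the array, displacing smaller entries, and
   the smallest value encountered so far is the one that falls out */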
/* now turn any of them that look better than the best node we
deactivated (leafified) last time back into growing nodes */
for(i = 0 ; i < number ; i++) {
if(indexes[i] > vfdt->highestDeactivatedIndex) {
_DoReactivateLeaf(vfdt, nodes[i]);
if(vfdt->messageLevel >= 2) {
printf("reactivating a node with index %f\n", indexes[i]);
}
}
}
MFreePtr(indexes);
MFreePtr(nodes);
VALFree(leaves);
allocation = MGetTotalAllocation();
if(vfdt->maxAllocationBytes < allocation) {
if(vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
} else {
/* HERE may want to control how many to leafify with a flag */
_LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
}
}
}
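/* Memory-pressure response used at several points in this file: when the
   total allocation exceeds the budget, flush the example caches first if
   caching is on (possibly turning caching off for good), otherwise
   deactivate roughly 20% of the growing nodes, least important first. */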
static void _AddExampleToGrowingNode(VFDTPtr vfdt,
DecisionTreePtr currentNode,
ExamplePtr e,
int checkSplits);
static float _BonferonniDelta(VFDTPtr vfdt, int numChoices) {
// return vfdt->splitConfidence;
// return 1 - pow(1 - vfdt->splitConfidence, 1.0 / (float)numChoices);
// printf("\t in bonf old confidence %f num choices %d new conf %.10f\n",
// vfdt->splitConfidence, numChoices,
// vfdt->splitConfidence / (float)numChoices);
if(vfdt->doBonferonni) {
return vfdt->splitConfidence / (float)numChoices;
} else {
return vfdt->splitConfidence;
}
}
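/* Illustrative numbers (hypothetical, not taken from this code): with
   doBonferonni set, each candidate test at a node is run at
   splitConfidence / numChoices, so for splitConfidence = 1e-7 and 20
   active attributes each individual test uses delta = 5e-9; without the
   correction every test uses the full 1e-7. */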
static void _DoContinuousSplit(VFDTPtr vfdt,
DecisionTreePtr currentNode, int attNum,
float threshold) {
int i;
AttributeTrackerPtr newAT;
int childCount;
VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)(DecisionTreeGetGrowingData(currentNode));
AttributeTrackerPtr at =
ExampleGroupStatsGetAttributeTracker(parentGD->egs);
VFDTGrowingDataPtr gd;
long allocation;
int parentClass;
float parentErrorRate;
DecisionTreePtr parent, target;
ExamplePtr e;
parent = currentNode;
DebugMessage(1, 2, " Continuous split info, attrib %s, thresh %f\n",
ExampleSpecGetAttributeName(vfdt->spec, attNum), threshold);
DecisionTreeSplitOnContinuousAttribute(currentNode, attNum, threshold);
childCount = DecisionTreeGetChildCount(currentNode);
vfdt->numGrowing--;
vfdt->numGrowing += childCount;
parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs);
parentErrorRate =
(1.0 - (ExampleGroupStatsGetMostCommonClassCount(parentGD->egs) /
(float)ExampleGroupStatsNumExamplesSeen(parentGD->egs)));
/* left child, < threshold */
newAT = AttributeTrackerClone(at);
gd = MNewPtr(sizeof(VFDTGrowingData));
gd->egs = ExampleGroupStatsNew(vfdt->spec, newAT);
gd->parentClass = parentClass;
gd->treeLevel = parentGD->treeLevel + 1;
gd->prePruned = 0;
gd->splitConfidence = _BonferonniDelta(vfdt,
AttributeTrackerNumActive(newAT));
if(vfdt->cacheExamples) {
gd->exampleCache = VALNew();
} else {
gd->exampleCache = 0;
}
gd->parentErrorRate = parentErrorRate;
gd->seenExamplesCount =
ExampleGroupStatsGetPercentBelowThreshold(parentGD->egs,
attNum, threshold) * parentGD->seenExamplesCount;
gd->seenSinceLastProcess = 0;
DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode, 0), gd);
DecisionTreeSetClass(DecisionTreeGetChild(currentNode, 0),
parentClass);
/* right child, >= threshold */
newAT = AttributeTrackerClone(at);
gd = MNewPtr(sizeof(VFDTGrowingData));
gd->egs = ExampleGroupStatsNew(vfdt->spec, newAT);
gd->parentClass = parentClass;
gd->treeLevel = parentGD->treeLevel + 1;
gd->prePruned = 0;
gd->splitConfidence = _BonferonniDelta(vfdt,
AttributeTrackerNumActive(newAT));
if(vfdt->cacheExamples) {
gd->exampleCache = VALNew();
} else {
gd->exampleCache = 0;
}
gd->parentErrorRate = parentErrorRate;
gd->seenExamplesCount = (1.0 -
ExampleGroupStatsGetPercentBelowThreshold(parentGD->egs,
attNum, threshold)) * parentGD->seenExamplesCount;
gd->seenSinceLastProcess = 0;
DecisionTreeSetGrowingData(DecisionTreeGetChild(currentNode, 1), gd);
DecisionTreeSetClass(DecisionTreeGetChild(currentNode, 1),
parentClass);
/* split the example cache */
if(parentGD->exampleCache != 0) {
for(i = VALLength(parentGD->exampleCache) - 1 ; i >= 0 ; i--) {
e = VALRemove(parentGD->exampleCache, i);
target = DecisionTreeOneStepClassify(parent, e);
_AddExampleToGrowingNode(vfdt, target, e, 0);
}
}
//_DecisionTreeWrite(currentNode);
/* this will check to see if there is a cache first */
_GrowingDataFreeExampleCache(parentGD);
ExampleGroupStatsFree(parentGD->egs);
MFreePtr(parentGD);
DecisionTreeSetGrowingData(currentNode, 0);
allocation = MGetTotalAllocation();
if(vfdt->maxAllocationBytes < allocation) {
if(vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
} else {
/* HERE may want to control how many to leafify with a flag */
_LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
}
}
}
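/* How the children created above are seeded: both start as leaves
   predicting the parent's majority class, and their seenExamplesCount is
   an estimate apportioned from the parent, p * parentCount for the left
   (< threshold) child and (1 - p) * parentCount for the right, where p is
   the fraction of the parent's examples observed below the threshold.  As
   a purely hypothetical illustration, a parent that saw 1000 examples
   with 30% below the threshold seeds its children with 300 and 700.  Any
   cached examples are then re-routed one step down the new split and
   added to the matching child without re-checking for further splits. */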
static void _DoDiscreteSplit(VFDTPtr vfdt, DecisionTreePtr splitNode,
int attNum) {
int i;
AttributeTrackerPtr newAT;
int childCount;
VFDTGrowingDataPtr parentGD = (VFDTGrowingDataPtr)(DecisionTreeGetGrowingData(splitNode));
AttributeTrackerPtr at =
ExampleGroupStatsGetAttributeTracker(parentGD->egs);
VFDTGrowingDataPtr gd;
long allocation;
int parentClass;
float parentErrorRate;
DecisionTreePtr target;
ExamplePtr e;
parentClass = ExampleGroupStatsGetMostCommonClass(parentGD->egs);
if(AttributeTrackerAreAllInactive(at)) {
/* made a split on every attribute along this path so don't try
to split any more */
DecisionTreeSetTypeLeaf(splitNode);
DecisionTreeSetClass(splitNode, parentClass);
} else {
DecisionTreeSplitOnDiscreteAttribute(splitNode, attNum);
childCount = DecisionTreeGetChildCount(splitNode);
vfdt->numGrowing--;
vfdt->numGrowing += childCount;
parentErrorRate =
(1.0 - (ExampleGroupStatsGetMostCommonClassCount(parentGD->egs) /
(float)ExampleGroupStatsNumExamplesSeen(parentGD->egs)));
for(i = 0 ; i < childCount ; i++) {
newAT = AttributeTrackerClone(at);
AttributeTrackerMarkInactive(newAT, attNum);
gd = MNewPtr(sizeof(VFDTGrowingData));
gd->egs = ExampleGroupStatsNew(vfdt->spec, newAT);
gd->parentClass = parentClass;
gd->treeLevel = parentGD->treeLevel + 1;
gd->prePruned = 0;
gd->splitConfidence = _BonferonniDelta(vfdt,
AttributeTrackerNumActive(newAT));
if(vfdt->cacheExamples) {
gd->exampleCache = VALNew();
} else {
gd->exampleCache = 0;
}
gd->parentErrorRate = parentErrorRate;
gd->seenExamplesCount =
ExampleGroupStatsGetValuePercent(parentGD->egs,
attNum, i) * parentGD->seenExamplesCount;
gd->seenSinceLastProcess = 0;
DecisionTreeSetGrowingData(DecisionTreeGetChild(splitNode, i), gd);
DecisionTreeSetClass(DecisionTreeGetChild(splitNode, i),
parentClass);
}
/* split the example cache */
if(parentGD->exampleCache != 0) {
for(i = VALLength(parentGD->exampleCache) - 1 ; i >= 0 ; i--) {
e = VALRemove(parentGD->exampleCache, i);
target = DecisionTreeOneStepClassify(splitNode, e);
_AddExampleToGrowingNode(vfdt, target, e, 0);
}
}
}
/* this will check to see if there is a cache first */
_GrowingDataFreeExampleCache(parentGD);
ExampleGroupStatsFree(parentGD->egs);
MFreePtr(parentGD);
DecisionTreeSetGrowingData(splitNode, 0);
allocation = MGetTotalAllocation();
if(vfdt->maxAllocationBytes < allocation) {
if(vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
} else {
/* HERE may want to control how many to leafify with a flag */
_LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
}
}
}
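/* _CheckSplit (below) decides whether the best candidate split at a
   growing node is reliably better than the runner-up.  The following is
   a minimal sketch of the standard Hoeffding bound VFDT uses for that
   decision; it is an illustration, not a copy of this file's code, and
   the value of range is an assumption (log2 of the number of classes is
   typical for entropy, 1.0 for Gini):

      static float SketchHoeffdingBound(float range, float delta, long n)
      {
          return sqrt((range * range * log(1.0 / delta)) / (2.0 * n));
      }

   With probability at least 1 - delta the true difference between the two
   measures is within this epsilon of the observed difference, so once the
   observed gap between best and second best exceeds epsilon the best
   attribute can safely be split on. */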
static void _CheckSplit(VFDTPtr vfdt, DecisionTreePtr currentNode, int forceSplit) {
float hoeffdingBound;
int i;
float range;
float *allIndexes;
float firstIndex, secondIndex, nullIndex;
int firstAttrib, secondAttrib;
float firstThresh, secondThresh;
float tmpIndexOne, tmpIndexTwo, tmpThreshOne, tmpThreshTwo;
VFDTGrowingDataPtr gd = DecisionTreeGetGrowingData(currentNode);
ExampleGroupStatsPtr egs = gd->egs;
/* count the bounds that need to hold */
vfdt->numBoundsUsed +=
AttributeTrackerNumActive(ExampleGroupStatsGetAttributeTracker(egs));
firstIndex = secondIndex = 10000;
firstAttrib = secondAttrib = -1;
firstThresh = secondThresh = 0;
allIndexes =
MNewPtr(sizeof(float) * ExampleSpecGetNumAttributes(vfdt->spec));
for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, i)) {
/* set up the second tmp index to look really bad (unless updated by
a continuous attribute to look better) */
tmpIndexTwo = 10000;
if(vfdt->useGini) {
tmpIndexOne = ExampleGroupStatsGiniDiscreteAttributeSplit(egs, i);
} else {
tmpIndexOne =
ExampleGroupStatsEntropyDiscreteAttributeSplit(egs, i);
}
} else { /* continuous attribute */
if(vfdt->useGini) {
ExampleGroupStatsGiniContinuousAttributeSplit(egs, i,
&tmpIndexOne, &tmpThreshOne, &tmpIndexTwo, &tmpThreshTwo);
} else {
ExampleGroupStatsEntropyContinuousAttributeSplit(egs, i,
&tmpIndexOne, &tmpThreshOne, &tmpIndexTwo, &tmpThreshTwo);
}
}
allIndexes[i] = tmpIndexOne;
if(tmpIndexOne < firstIndex) {
/* bump the first one over */
secondIndex = firstIndex;
secondThresh = firstThresh;
secondAttrib = firstAttrib;
/* replace the first one */
firstIndex = tmpIndexOne;
firstThresh = tmpThreshOne;
firstAttrib = i;
if(tmpIndexTwo < secondIndex) {
/* replace the second one */
secondIndex = tmpIndexTwo;
secondThresh = tmpThreshTwo;
secondAttrib = i;
}
} else if(tmpIndexOne < secondIndex) {
/* replace the second one */
secondIndex = tmpIndexOne;