📄 vfdt-engine.c
字号:
#include "vfdt-engine.h"
#include "memory.h"
#include <math.h>
#include <stdlib.h> /* for system for bootstrapc45 */
#define lg(x) (log(x) / log(2))
static void _DecisionTreeWrite(DecisionTreePtr dt);
static float _BonferonniDelta(VFDTPtr vfdt, int numChoices);
static void _GrowingDataFlushExampleCache(VFDTGrowingDataPtr gd) {
int i;
if(gd->exampleCache != 0) {
for(i = VALLength(gd->exampleCache) - 1 ; i >= 0 ; i--) {
ExampleFree(VALRemove(gd->exampleCache, i));
}
}
}
static void _GrowingDataFreeExampleCache(VFDTGrowingDataPtr gd) {
if(gd->exampleCache != 0) {
_GrowingDataFlushExampleCache(gd);
VALFree(gd->exampleCache);
gd->exampleCache = 0;
}
}
VFDTPtr VFDTNew(ExampleSpecPtr spec, float splitConfidence,
float tieConfidence) {
VFDTGrowingDataPtr gd = MNewPtr(sizeof(VFDTGrowingData));
VFDTPtr vfdt = MNewPtr(sizeof(VFDT));
AttributeTrackerPtr at;
vfdt->spec = spec;
vfdt->dtree = DecisionTreeNew(spec);
at = AttributeTrackerInitial(vfdt->spec);
gd->egs = ExampleGroupStatsNew(vfdt->spec, at);
vfdt->cacheExamples = 1;
vfdt->recoverMinimum = 3 * 1024 * 1024;
if(vfdt->cacheExamples) {
gd->exampleCache = VALNew();
} else {
gd->exampleCache = 0;
}
// gd->startedOnExampleNum = 0;
gd->seenExamplesCount = 0;
gd->seenSinceLastProcess = 0;
gd->treeLevel = 0;
gd->prePruned = 0;
DecisionTreeSetTypeGrowing(vfdt->dtree);
DecisionTreeSetGrowingData(vfdt->dtree, gd);
vfdt->processChunkSize = 300;
vfdt->splitConfidence = splitConfidence;
vfdt->tieConfidence = tieConfidence;
vfdt->useGini = 0;
vfdt->messageLevel = 0;
vfdt->examplesSeen = 0;
vfdt->numBoundsUsed = 0;
vfdt->numGrowing = 1;
vfdt->maxAllocationBytes = 10000;
vfdt->maxAllocationBytes *= 1024;
vfdt->maxAllocationBytes *= 1024;
vfdt->reactivateLeaves = 1;
vfdt->highestDeactivatedIndex = 1.0;
vfdt->reactivateScanPeriod = 1473777;
vfdt->batchMode = 0;
vfdt->prePruneTau = 0.0;
vfdt->laplace = 0;
/* needs the completed vfdt, so make sure it happens after that */
gd->splitConfidence = _BonferonniDelta(vfdt, AttributeTrackerNumActive(at));
return vfdt;
}
void VFDTFree(VFDTPtr vfdt) {
int i;
VoidAListPtr list = VALNew();
VFDTGrowingDataPtr gd;
DecisionTreeGatherGrowingNodes(vfdt->dtree, list);
for(i = VALLength(list) - 1 ; i >= 0 ; i--) {
gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(list,i));
_GrowingDataFreeExampleCache(gd);
ExampleGroupStatsFree(gd->egs);
MFreePtr(gd);
}
VALFree(list);
DecisionTreeFree(vfdt->dtree);
MFreePtr(vfdt);
}
void VFDTSetMessageLevel(VFDTPtr vfdt, int value) {
vfdt->messageLevel = value;
}
void VFDTSetMaxAllocationMegs(VFDTPtr vfdt, int value) {
vfdt->maxAllocationBytes = value;
vfdt->maxAllocationBytes *= 1024;
vfdt->maxAllocationBytes *= 1024;
}
void VFDTSetProcessChunkSize(VFDTPtr vfdt, int value) {
vfdt->processChunkSize = value;
}
void VFDTSetUseGini(VFDTPtr vfdt, int value) {
vfdt->useGini = value;
}
void VFDTSetRestartLeaves(VFDTPtr vfdt, int value) {
vfdt->reactivateLeaves = value;
}
void VFDTSetCacheTrainingExamples(VFDTPtr vfdt, int value) {
VoidAListPtr growingNodes;
VFDTGrowingDataPtr gd;
int i;
vfdt->cacheExamples = value;
if(vfdt->cacheExamples == 0) {
/* disable this everywhere HERE maybe enable it everywhere in the
case where the value is 1?? */
growingNodes = VALNew();
DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);
for(i = 0 ; i < VALLength(growingNodes) ; i++) {
gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(growingNodes, i));
if(gd->exampleCache != 0) {
_GrowingDataFreeExampleCache(gd);
}
}
VALFree(growingNodes);
}
}
void VFDTSetPrePruneTau(VFDTPtr vfdt, float value) {
vfdt->prePruneTau = value;
}
void VFDTSetLaplace(VFDTPtr vfdt, int value) {
vfdt->laplace = value;
}
void VFDTSetDoBonferonni(VFDTPtr vfdt, int value) {
vfdt->doBonferonni = value;
}
static void _DoMakeLeaf(VFDTPtr vfdt, DecisionTreePtr currentNode) {
VFDTGrowingDataPtr gd = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode));
ExampleGroupStatsPtr egs = gd->egs;
int mostCommonClass = ExampleGroupStatsGetMostCommonClassLaplace(egs,
gd->parentClass, vfdt->laplace);
vfdt->numGrowing--;
DecisionTreeSetTypeLeaf(currentNode);
DecisionTreeSetClass(currentNode, mostCommonClass);
gd->seenSinceDeactivated = 0;
gd->errorsSinceDeactivated = 0;
ExampleGroupStatsDeactivate(egs);
_GrowingDataFreeExampleCache(gd);
// ExampleGroupStatsFree(egs);
// MFreePtr(DecisionTreeGetGrowingData(currentNode));
}
static void _DoReactivateLeaf(VFDTPtr vfdt, DecisionTreePtr currentNode) {
VFDTGrowingDataPtr gd = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode));
ExampleGroupStatsPtr egs = gd->egs;
vfdt->numGrowing++;
DecisionTreeSetTypeGrowing(currentNode);
ExampleGroupStatsReactivate(egs);
if(vfdt->cacheExamples) {
gd->exampleCache = VALNew();
}
}
static float _CalculateReactivateIndex(DecisionTreePtr node, long totalSeen) {
/* this is the same hueristic as the calculate leafify index but gets
its informationg from a slightly different place */
float percent;
VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(node);
float estimatedError;
if(gd == 0) {
/* this would happen if we've already split on attribute */
return 0;
}
percent = gd->seenExamplesCount / totalSeen;
/* use a laplace correction for the estimated error:
(errors + (m * parentError)) / (seen + m) */
estimatedError =
(gd->errorsSinceDeactivated +
(10 * gd->parentErrorRate)) /
(10 + gd->seenSinceDeactivated);
/* return an estimate of the percentage of examples falling on
weighted by the accuracy the node would have if we pruned now */
/* leaves that give more long term benefit should have higher values */
return percent * estimatedError;
}
static float _CalculateLeafifyIndex(DecisionTreePtr node, long totalSeen) {
float percent;
VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(node);
float estimatedError;
percent = gd->seenExamplesCount / totalSeen;
/* use a laplace correction for the estimated error:
(errors + (m * parentError)) / (seen + m) */
estimatedError =
((ExampleGroupStatsNumExamplesSeen(gd->egs) -
ExampleGroupStatsGetMostCommonClassCount(gd->egs)) +
(10 * gd->parentErrorRate)) /
(10 + ExampleGroupStatsNumExamplesSeen(gd->egs));
/* return an estimate of the percentage of examples falling on
weighted by the accuracy the node would have if we pruned now */
/* leaves that give more long term benefit should have higher values */
return percent * estimatedError;
}
static void _LeafifyLeastImportantNodes(VFDTPtr vfdt, long number) {
float *indexes;
DecisionTreePtr *nodes;
int i, j;
float thisIndex, tmpIndex;
DecisionTreePtr cNode, tmpNode;
VoidAListPtr growingNodes = VALNew();
DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);
if(vfdt->messageLevel >= 1) {
printf("leafifying %ld of %d growing nodes\n", number, VLLength(growingNodes));
}
indexes = MNewPtr(sizeof(float) * number);
nodes = MNewPtr(sizeof(DecisionTreePtr) * number);
/* take the first bunch of growing nodes to start */
for(i = 0 ; i < number ; i++) {
nodes[i] = VALIndex(growingNodes, i);
indexes[i] = _CalculateLeafifyIndex(nodes[i], vfdt->examplesSeen);
}
/* EFF I should use some kind of something smart here if
I'm trying to prune a lot of nodes */
/* now look at the rest of the growing nodes, if they are worse use them */
for(i = number ; i < VALLength(growingNodes) ; i++) {
thisIndex = _CalculateLeafifyIndex(VALIndex(growingNodes, i),
vfdt->examplesSeen);
cNode = VALIndex(growingNodes, i);
for(j = 0 ; j < number ; j++) {
if(thisIndex < indexes[j]) {
tmpIndex = indexes[j];
indexes[j] = thisIndex;
thisIndex = tmpIndex;
tmpNode = nodes[j];
nodes[j] = cNode;
cNode = tmpNode;
}
}
}
vfdt->highestDeactivatedIndex = 0;
/* now make them all leaves! */
for(i = 0 ; i < number ; i++) {
if(indexes[i] > vfdt->highestDeactivatedIndex) {
vfdt->highestDeactivatedIndex = indexes[i];
}
_DoMakeLeaf(vfdt, nodes[i]);
if(vfdt->messageLevel >= 2) {
printf("leafifying a node with index %g\n", indexes[i]);
}
}
MFreePtr(indexes);
MFreePtr(nodes);
VALFree(growingNodes);
}
void _FlushExampleCachesCheckStop(VFDTPtr vfdt) {
long initialAllocation = MGetTotalAllocation();
float *indexes;
DecisionTreePtr *nodes;
int i, j;
float thisIndex, tmpIndex;
DecisionTreePtr cNode, tmpNode;
long finalAllocation;
VoidAListPtr growingNodes;
VFDTGrowingDataPtr gd;
long number;
long flushed;
if(!vfdt->cacheExamples) { return; }
growingNodes = VALNew();
if(vfdt->messageLevel >= 2) {
printf("flushing example cache...\n");
}
DecisionTreeGatherGrowingNodes(vfdt->dtree, growingNodes);
number = VALLength(growingNodes) * .35;
/* round up */
number++;
if(vfdt->messageLevel >= 2) {
printf("thinking about flushing %ld caches\n", number);
}
indexes = MNewPtr(sizeof(float) * number);
nodes = MNewPtr(sizeof(DecisionTreePtr) * number);
/* take the first bunch of growing nodes to start */
for(i = 0 ; i < number ; i++) {
nodes[i] = VALIndex(growingNodes, i);
gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(nodes[i]);
if(gd->exampleCache == 0) {
indexes[i] = 1.0;
} else {
indexes[i] = _CalculateLeafifyIndex(nodes[i], vfdt->examplesSeen);
}
}
/* EFF I should use some kind of something smart here if
I'm trying to do this to a lot of nodes */
/* now look at the rest of the growing nodes, if they are worse use them */
for(i = number ; i < VALLength(growingNodes) ; i++) {
gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(VALIndex(
growingNodes, i));
if(gd->exampleCache == 0) {
thisIndex = 1.0;
} else {
thisIndex = _CalculateLeafifyIndex(VALIndex(growingNodes, i),
vfdt->examplesSeen);
}
cNode = VALIndex(growingNodes, i);
for(j = 0 ; j < number ; j++) {
if(thisIndex < indexes[j]) {
tmpIndex = indexes[j];
indexes[j] = thisIndex;
thisIndex = tmpIndex;
tmpNode = nodes[j];
nodes[j] = cNode;
cNode = tmpNode;
}
}
}
flushed = 0;
for(i = 0 ; i < number && (initialAllocation - MGetTotalAllocation() < (0.05 * vfdt->maxAllocationBytes)); i++) {
gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(nodes[i]);
if(gd->exampleCache != 0) {
flushed++;
_GrowingDataFreeExampleCache(gd);
}
}
MFreePtr(indexes);
MFreePtr(nodes);
finalAllocation = MGetTotalAllocation();
/* HERE, this .05 should be a parameter & it is used above as well */
if(vfdt->messageLevel >= 2) {
printf("flushed %ld freed %ld bytes stop threshold %.2f\n",
flushed, initialAllocation - finalAllocation,
0.05 * vfdt->maxAllocationBytes);
}
if(initialAllocation - finalAllocation < (0.05 * vfdt->maxAllocationBytes)) {
/* if we don't recover at least some minimum then stop caching */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -