📄 vfdt-engine.c
字号:
secondThresh = tmpThreshOne;
secondAttrib = i;
/* the tmpIndexTwo must be worse than what we got so ignore it */
}
}
if(vfdt->messageLevel >= 3) {
printf("Considering split, best two indecies are:\n");
printf(" Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib);
printf(" Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib);
}
if(vfdt->useGini) {
range = 1.0;
} else {
range = lg(ExampleSpecGetNumClasses(vfdt->spec));
}
/* HERE, this will crash if splitConfidence is 0 */
hoeffdingBound = sqrt(((range * range) * log(1/gd->splitConfidence)) /
(2.0 * ExampleGroupStatsNumExamplesSeen(egs)));
/* check for prepruning */
if(vfdt->useGini) {
nullIndex = ExampleGroupStatsGiniTotal(egs);
} else {
nullIndex = ExampleGroupStatsEntropyTotal(egs);
}
if(firstIndex - nullIndex > hoeffdingBound) {
if(vfdt->messageLevel >= 1) {
printf("Prepruned based on a null-win over %d, index %f, hoeffding %f, based on %ld examples, null index %f, level %d\n",
firstAttrib, firstIndex,
hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs),
nullIndex, gd->treeLevel);
}
gd->prePruned = 1;
} else if((nullIndex - firstIndex < hoeffdingBound) &&
hoeffdingBound < vfdt->prePruneTau) {
if(vfdt->messageLevel >= 1) {
printf("Prepruned based on a null tie with %d, index %f, hoeffding %f, based on %ld examples, null index %f, level %d\n",
firstAttrib, firstIndex,
hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs),
nullIndex, gd->treeLevel);
}
gd->prePruned = 1;
}
if(gd->prePruned) {
_DoMakeLeaf(vfdt, currentNode);
}
if(!gd->prePruned) {
/* if we win by the hoeffding test
or
if we are in forceSplit mode and it's a good time to split */
if((secondIndex - firstIndex > hoeffdingBound)
||
(forceSplit && ExampleGroupStatsNumExamplesSeen(egs) > 5 &&
!ExampleGroupStatsIsPure(egs))) {
//&& ExampleGroupStatsEntropyTotal(egs) > firstIndex)) {
if(vfdt->messageLevel >= 1) {
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
printf("Did discrete split on attrib %s, index %f, second best %s, index %f, hoeffding %f, based on %ld examples, presplit entropy %f, level %d\n",
ExampleSpecGetAttributeName(vfdt->spec, firstAttrib),
firstIndex,
ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
secondIndex, hoeffdingBound,
ExampleGroupStatsNumExamplesSeen(egs),
ExampleGroupStatsEntropyTotal(egs),
gd->treeLevel);
} else {
printf("Did continuous split on attrib %s, thresh %f, index %f, second best %s, thresh %f, index %f, hoeffding %f, based on %ld examples, presplit entropy %f, level %d\n",
ExampleSpecGetAttributeName(vfdt->spec, firstAttrib),
firstThresh, firstIndex,
ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
secondThresh, secondIndex, hoeffdingBound,
ExampleGroupStatsNumExamplesSeen(egs),
ExampleGroupStatsEntropyTotal(egs),
gd->treeLevel);
}
if(vfdt->messageLevel >= 2) {
printf(" allocation now: %ld\n", MGetTotalAllocation());
}
}
if(vfdt->messageLevel >= 4) {
ExampleGroupStatsWrite(egs, stdout);
}
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
_DoDiscreteSplit(vfdt, currentNode, firstAttrib);
} else {
_DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh);
}
if(vfdt->messageLevel >= 4) {
printf("The new tree is: \n");
DecisionTreePrint(vfdt->dtree, stdout);
}
} else if(hoeffdingBound < vfdt->tieConfidence &&
nullIndex - firstIndex > vfdt->prePruneTau) {
/* if it looks like we have a tie */
if(vfdt->messageLevel >= 1) {
printf("Called it a tie between attrib %d, index %f, and %d, index %f, hoeffding %f, based on %ld examples, presplit entropy %f, level %d\n",
firstAttrib, firstIndex, secondAttrib, secondIndex,
hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs),
ExampleGroupStatsEntropyTotal(egs),
gd->treeLevel);
if(vfdt->messageLevel >= 2) {
printf(" allocation now: %ld\n", MGetTotalAllocation());
}
}
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
_DoDiscreteSplit(vfdt, currentNode, firstAttrib);
} else {
_DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh);
}
} else {
if(vfdt->messageLevel >= 2) {
printf("Didn't split, top indexes were %f at %d and %f at %d hoeffding %f\n",
firstIndex, firstAttrib, secondIndex, secondAttrib,
hoeffdingBound);
//ExampleGroupStatsWrite(egs, stdout);
}
/* now disable any that aren't promising */
for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
if(allIndexes[i] > firstIndex + hoeffdingBound
&& i != firstAttrib && i != secondAttrib) {
ExampleGroupStatsIgnoreAttribute(egs, i);
if(vfdt->messageLevel >= 3) {
printf("started ignoring an attribute %d index %f, best %f\n based on %ld examples\n", i, allIndexes[i], firstIndex, ExampleGroupStatsNumExamplesSeen(egs));
}
} else if(!ExampleSpecIsAttributeDiscrete(vfdt->spec, i)) {
/* start ignoring bad split points */
if(vfdt->useGini) {
ExampleGroupStatsIgnoreSplitsWorseThanGini(egs,
i, firstIndex + hoeffdingBound);
} else {
ExampleGroupStatsIgnoreSplitsWorseThanEntropy(egs,
i, firstIndex + hoeffdingBound);
}
}
}
}
}
MFreePtr(allIndexes);
}
static void _AddExampleToGrowingNode(VFDTPtr vfdt,
DecisionTreePtr currentNode,
ExamplePtr e,
int checkSplits) {
int i;
VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
ExampleGroupStatsPtr egs = gd->egs;
/* add the information */
ExampleGroupStatsAddExample(egs, e);
for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
if(ExampleSpecIsAttributeContinuous(vfdt->spec, i)) {
if(ExampleGroupStatsIsAttributeActive(egs, i)) {
if(ExampleGroupStatsNumSplitThresholds(egs, i) > 1000) {
ExampleGroupStatsStopAddingSplits(egs, i);
}
}
}
}
if(vfdt->cacheExamples && gd->exampleCache != 0) {
VALAppend(gd->exampleCache, e);
} else {
ExampleFree(e);
}
gd->seenSinceLastProcess++;
gd->seenExamplesCount++;
if((gd->seenSinceLastProcess >= vfdt->processChunkSize) && checkSplits) {
gd->seenSinceLastProcess = 0;
if(!ExampleGroupStatsIsPure(egs)) {
_CheckSplit(vfdt, currentNode, 0);
}
}
}
static void _AddExampleToDeactivatedNode(VFDTPtr vfdt,
DecisionTreePtr currentNode,
ExamplePtr e) {
VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
if(gd != 0) {
/* if it's zero we've already spilt on every attribute on that path */
gd->seenSinceDeactivated++;
if(ExampleGetClass(e) != DecisionTreeGetClass(currentNode)) {
gd->errorsSinceDeactivated++;
}
}
ExampleFree(e);
}
static void _ProcessExampleBatch(VFDTPtr vfdt, ExamplePtr e) {
int foundNode = 0;
DecisionTreePtr currentNode;
currentNode = vfdt->dtree;
while(!foundNode && currentNode != 0) {
if(DecisionTreeIsLeaf(currentNode)) {
/* We are done growing in this direction, this example isn't needed */
foundNode = 1;
} else if(DecisionTreeIsNodeGrowing(currentNode)) {
foundNode = 1;
}
if(!foundNode) {
currentNode = DecisionTreeOneStepClassify(currentNode, e);
}
}
if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
/* here, this doesn't check for splits */
_AddExampleToGrowingNode(vfdt, currentNode, e, 0);
} else {
_AddExampleToDeactivatedNode(vfdt, currentNode, e);
if(vfdt->messageLevel >= 3) {
printf("not learning from:");
ExampleWrite(e, stdout);
}
}
if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
_DoReactivateScan(vfdt);
}
if(vfdt->maxAllocationBytes < MGetTotalAllocation()) {
if(vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
} else {
/* HERE may want to control how many to leafify with a flag */
_LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
}
}
if((vfdt->messageLevel >= 1) && (vfdt->examplesSeen % 1000 == 0)) {
printf("processed %ld examples\n", vfdt->examplesSeen);
}
}
void _forceSplits(VFDTPtr vfdt, DecisionTreePtr dt) {
int i;
if(DecisionTreeIsLeaf(dt) || DecisionTreeIsNodeGrowing(dt)) {
_CheckSplit(vfdt, dt, 1);
}
if(!DecisionTreeIsLeaf(dt) && !DecisionTreeIsNodeGrowing(dt)) {
for(i = 0 ; i < DecisionTreeGetChildCount(dt) ; i++) {
_forceSplits(vfdt, DecisionTreeGetChild(dt, i));
}
}
}
static void _PrintSpaces(FILE *out, int num) {
int i;
for(i = 0 ; i < num ; i++) {
fprintf(out, " ");
}
}
static void _PrintHelp(DecisionTreePtr dt, FILE *out, int indent) {
int i;
VFDTGrowingDataPtr gd;
_PrintSpaces(out, indent);
if(dt->nodeType == dtnLeaf || dt->nodeType == dtnGrowing) {
//fprintf(out, "(leaf: %d)\n", dt->myclass);
fprintf(out, "(leaf: %s -",
ExampleSpecGetClassValueName(dt->spec, dt->myclass));
gd = DecisionTreeGetGrowingData(dt);
for(i = 0 ; i < ExampleSpecGetNumClasses(dt->spec) ; i++) {
fprintf(out, " %ld", gd->egs->classTotals[i]);
}
fprintf(out, ")\n");
} else if(dt->nodeType == dtnDiscrete) {
fprintf(out, "(split on %s:\n",
ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));
for(i = 0 ; i < VALLength(dt->children) ; i++) {
_PrintSpaces(out, indent + 1);
fprintf(out, "%s\n",
ExampleSpecGetAttributeValueName(dt->spec, dt->splitAttribute, i));
_PrintHelp(VALIndex(dt->children, i), out, indent + 2);
}
_PrintSpaces(out, indent);
fprintf(out, ")\n");
} else if(dt->nodeType == dtnContinuous) {
fprintf(out, "(split on %s:\n",
ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));
/* left child */
_PrintSpaces(out, indent + 1);
fprintf(out, "< %f\n", dt->splitThreshold);
_PrintHelp(VALIndex(dt->children, 0), out, indent + 2);
/* right child */
_PrintSpaces(out, indent + 1);
fprintf(out, ">= %f\n", dt->splitThreshold);
_PrintHelp(VALIndex(dt->children, 1), out, indent + 2);
_PrintSpaces(out, indent);
fprintf(out, ")\n");
} else if(dt->nodeType == dtnGrowing) {
fprintf(out, "(growing)\n");
}
}
static void _DecisionTreeWrite(DecisionTreePtr dt) {
_PrintHelp(dt, stdout, 0);
}
void _ProcessExamplesBatch(VFDTPtr vfdt, FILE *input) {
ExamplePtr e;
e = ExampleRead(input, vfdt->spec);
while(e != 0 && vfdt->numGrowing > 0) {
_ProcessExampleBatch(vfdt, e);
vfdt->examplesSeen++;
e = ExampleRead(input, vfdt->spec);
}
_forceSplits(vfdt, vfdt->dtree);
//_DecisionTreeWrite(vfdt->dtree);
if(vfdt->messageLevel >= 1) {
printf("finished with all the examples, there are %ld growing nodes\n",
vfdt->numGrowing);
}
}
static void _ProcessExample(VFDTPtr vfdt, ExamplePtr e) {
int foundNode = 0;
DecisionTreePtr currentNode;
currentNode = vfdt->dtree;
while(!foundNode && currentNode != 0) {
if(DecisionTreeIsLeaf(currentNode)) {
/* We are done growing in this direction, this example isn't needed */
foundNode = 1;
} else if(DecisionTreeIsNodeGrowing(currentNode)) {
foundNode = 1;
}
if(!foundNode) {
currentNode = DecisionTreeOneStepClassify(currentNode, e);
}
}
if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
_AddExampleToGrowingNode(vfdt, currentNode, e, 1);
} else {
_AddExampleToDeactivatedNode(vfdt, currentNode, e);
if(vfdt->messageLevel >= 3) {
printf("not learning from:");
ExampleWrite(e, stdout);
}
}
if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
_DoReactivateScan(vfdt);
}
if(vfdt->maxAllocationBytes < MGetTotalAllocation()) {
if(vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
} else {
/* HERE may want to control how many to leafify with a flag */
_LeafifyLeastImportantNodes(vfdt, vfdt->numGrowing * .2);
}
}
}
void _ProcessExamples(VFDTPtr vfdt, FILE *input) {
ExamplePtr e;
e = ExampleRead(input, vfdt->spec);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -