📄 vfdt-engine.c
字号:
secondAttrib = i;
}
} else if(tmpIndexOne < secondIndex) {
/* replace the second one */
secondIndex = tmpIndexOne;
secondThresh = tmpThreshOne;
secondAttrib = i;
/* the tmpIndexTwo must be worse than what we got so ignore it */
}
}
if(vfdt->messageLevel >= 3) {
printf("Considering split, best two indecies are:\n");
printf(" Index: %f Thresh: %f Attrib: %d\n", firstIndex, firstThresh, firstAttrib);
printf(" Index: %f Thresh: %f Attrib: %d\n", secondIndex, secondThresh, secondAttrib);
}
if(vfdt->useGini) {
range = 1.0;
} else {
range = log(ExampleSpecGetNumClasses(vfdt->spec));
}
/* HERE, this will crash if splitConfidence is 0 */
hoeffdingBound = sqrt(((range * range) * log(1/gd->splitConfidence)) /
(2.0 * ExampleGroupStatsNumExamplesSeen(egs)));
/* if we win by the hoeffding test and we aren't pre-pruned
or
if we are in forceSplit mode and it's a good time to split */
if(((secondIndex - firstIndex > hoeffdingBound) &&
(!vfdt->prePrune || ExampleGroupStatsEntropyTotal(egs) > firstIndex))
||
(forceSplit && ExampleGroupStatsNumExamplesSeen(egs) > 1 &&
!ExampleGroupStatsIsPure(egs) &&
ExampleGroupStatsEntropyTotal(egs) > firstIndex)) {
if(vfdt->messageLevel >= 1) {
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
printf("Did discrete split on attrib %s index %f, second best %s index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n",
ExampleSpecGetAttributeName(vfdt->spec, firstAttrib),
firstIndex,
ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
secondIndex, hoeffdingBound,
ExampleGroupStatsNumExamplesSeen(egs),
ExampleGroupStatsEntropyTotal(egs));
} else {
printf("Did continuous split on attrib %s thresh %f index %f, second best %s thresh %f index %f, hoeffding %f, based on %ld examples, presplit entropy %f\n",
ExampleSpecGetAttributeName(vfdt->spec, firstAttrib),
firstThresh, firstIndex,
ExampleSpecGetAttributeName(vfdt->spec, secondAttrib),
secondThresh, secondIndex, hoeffdingBound,
ExampleGroupStatsNumExamplesSeen(egs),
ExampleGroupStatsEntropyTotal(egs));
}
printf(" allocation now: %ld\n", MGetTotalAllocation());
}
if(vfdt->messageLevel >= 2) {
ExampleGroupStatsWrite(egs, stdout);
}
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
_DoDiscreteSplit(vfdt, currentNode, firstAttrib, gd->splitConfidence);
} else {
_DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh,
gd->splitConfidence);
}
if(vfdt->messageLevel >= 3) {
printf("The new tree is: \n");
DecisionTreePrint(vfdt->dtree, stdout);
}
} else if(hoeffdingBound < vfdt->tieConfidence &&
(!vfdt->prePrune || ExampleGroupStatsEntropyTotal(egs) > firstIndex)) {
/* if it looks like we have a tie and we aren't pre-pruned */
if(vfdt->messageLevel >= 1) {
printf("Called it a tie between attrib %d index %f, and %d index %f, hoeffding %f, based on %ld examples\n",
firstAttrib, firstIndex, secondAttrib, secondIndex,
hoeffdingBound, ExampleGroupStatsNumExamplesSeen(egs));
printf(" allocation now: %ld\n", MGetTotalAllocation());
}
if(ExampleSpecIsAttributeDiscrete(vfdt->spec, firstAttrib)) {
_DoDiscreteSplit(vfdt, currentNode, firstAttrib, gd->splitConfidence);
} else {
_DoContinuousSplit(vfdt, currentNode, firstAttrib, firstThresh,
gd->splitConfidence);
}
} else {
if(vfdt->messageLevel >= 3) {
printf("Didn't split, top indexes were %f at %d and %f at %d hoeffding %f\n",
firstIndex, firstAttrib, secondIndex, secondAttrib,
hoeffdingBound);
ExampleGroupStatsWrite(egs, stdout);
}
/* now disable any that aren't promising */
for(i = 0 ; i < ExampleSpecGetNumAttributes(vfdt->spec) ; i++) {
if(allIndexes[i] > firstIndex + hoeffdingBound
&& i != firstAttrib && i != secondAttrib) {
ExampleGroupStatsIgnoreAttribute(egs, i);
if(vfdt->messageLevel >= 2) {
printf("started ignoring an attribute %d index %f, best %f\n based on %ld examples\n", i, allIndexes[i], firstIndex, ExampleGroupStatsNumExamplesSeen(egs));
}
} else if(!ExampleSpecIsAttributeDiscrete(vfdt->spec, i)) {
/* start ignoring bad split points */
if(vfdt->useGini) {
ExampleGroupStatsIgnoreSplitsWorseThanGini(egs,
i, firstIndex + hoeffdingBound);
} else {
ExampleGroupStatsIgnoreSplitsWorseThanEntropy(egs,
i, firstIndex + hoeffdingBound);
}
}
}
}
MFreePtr(allIndexes);
}
static void _AddExampleToGrowingNode(VFDTPtr vfdt,
DecisionTreePtr currentNode,
ExamplePtr e,
int checkSplits) {
VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
ExampleGroupStatsPtr egs = gd->egs;
/* add the information */
ExampleGroupStatsAddExample(egs, e);
if(vfdt->cacheExamples && gd->exampleCache != 0) {
VALAppend(gd->exampleCache, e);
} else {
ExampleFree(e);
}
gd->seenSinceLastProcess++;
if((gd->seenSinceLastProcess >= vfdt->processChunkSize) && checkSplits) {
gd->seenSinceLastProcess = 0;
if(!ExampleGroupStatsIsPure(egs)) {
_CheckSplit(vfdt, currentNode, 0);
}
}
}
static void _AddExampleToDeactivatedNode(VFDTPtr vfdt,
DecisionTreePtr currentNode,
ExamplePtr e) {
VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
if(gd != 0) {
/* if it's zero we've already spilt on every attribute on that path */
gd->seenSinceDeactivated++;
if(ExampleGetClass(e) != DecisionTreeGetClass(currentNode)) {
gd->errorsSinceDeactivated++;
}
}
ExampleFree(e);
}
static void _ProcessExampleBatch(VFDTPtr vfdt, ExamplePtr e) {
int foundNode = 0;
DecisionTreePtr currentNode;
currentNode = vfdt->dtree;
while(!foundNode && currentNode != 0) {
if(DecisionTreeIsLeaf(currentNode)) {
/* We are done growing in this direction, this example isn't needed */
foundNode = 1;
} else if(DecisionTreeIsNodeGrowing(currentNode)) {
foundNode = 1;
}
if(!foundNode) {
currentNode = DecisionTreeOneStepClassify(currentNode, e);
}
}
if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
_AddExampleToGrowingNode(vfdt, currentNode, e, 0);
} else {
_AddExampleToDeactivatedNode(vfdt, currentNode, e);
if(vfdt->messageLevel >= 3) {
printf("not learning from:");
ExampleWrite(e, stdout);
}
}
if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
_DoReactivateScan(vfdt);
}
if(vfdt->maxAllocationBytes < MGetTotalAllocation() &&
vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
}
if((vfdt->messageLevel >= 1) && (vfdt->examplesSeen % 1000 == 0)) {
printf("processed %ld examples\n", vfdt->examplesSeen);
}
}
void _forceSplits(VFDTPtr vfdt, DecisionTreePtr dt) {
int i;
if(DecisionTreeIsLeaf(dt) || DecisionTreeIsNodeGrowing(dt)) {
_CheckSplit(vfdt, dt, 1);
}
if(!DecisionTreeIsLeaf(dt) && !DecisionTreeIsNodeGrowing(dt)) {
for(i = 0 ; i < DecisionTreeGetChildCount(dt) ; i++) {
_forceSplits(vfdt, DecisionTreeGetChild(dt, i));
}
}
}
static void _PrintSpaces(FILE *out, int num) {
int i;
for(i = 0 ; i < num ; i++) {
fprintf(out, " ");
}
}
static void _PrintHelp(DecisionTreePtr dt, FILE *out, int indent) {
int i;
VFDTGrowingDataPtr gd;
_PrintSpaces(out, indent);
if(dt->nodeType == dtnLeaf || dt->nodeType == dtnGrowing) {
//fprintf(out, "(leaf: %d)\n", dt->myclass);
fprintf(out, "(leaf: %s -",
ExampleSpecGetClassValueName(dt->spec, dt->myclass));
gd = DecisionTreeGetGrowingData(dt);
for(i = 0 ; i < ExampleSpecGetNumClasses(dt->spec) ; i++) {
fprintf(out, " %ld", gd->egs->classTotals[i]);
}
fprintf(out, ")\n");
} else if(dt->nodeType == dtnDiscrete) {
fprintf(out, "(split on %s:\n",
ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));
for(i = 0 ; i < VALLength(dt->children) ; i++) {
_PrintSpaces(out, indent + 1);
fprintf(out, "%s\n",
ExampleSpecGetAttributeValueName(dt->spec, dt->splitAttribute, i));
_PrintHelp(VALIndex(dt->children, i), out, indent + 2);
}
_PrintSpaces(out, indent);
fprintf(out, ")\n");
} else if(dt->nodeType == dtnContinuous) {
fprintf(out, "(split on %s:\n",
ExampleSpecGetAttributeName(dt->spec, dt->splitAttribute));
/* left child */
_PrintSpaces(out, indent + 1);
fprintf(out, "< %f\n", dt->splitThreshold);
_PrintHelp(VALIndex(dt->children, 0), out, indent + 2);
/* right child */
_PrintSpaces(out, indent + 1);
fprintf(out, ">= %f\n", dt->splitThreshold);
_PrintHelp(VALIndex(dt->children, 1), out, indent + 2);
_PrintSpaces(out, indent);
fprintf(out, ")\n");
} else if(dt->nodeType == dtnGrowing) {
fprintf(out, "(growing)\n");
}
}
static void _DecisionTreeWrite(DecisionTreePtr dt) {
_PrintHelp(dt, stdout, 0);
}
void _ProcessExamplesBatch(VFDTPtr vfdt, FILE *input) {
ExamplePtr e;
e = ExampleRead(input, vfdt->spec);
while(e != 0 && vfdt->numGrowing > 0) {
_ProcessExampleBatch(vfdt, e);
vfdt->examplesSeen++;
e = ExampleRead(input, vfdt->spec);
}
_forceSplits(vfdt, vfdt->dtree);
//_DecisionTreeWrite(vfdt->dtree);
if(vfdt->messageLevel >= 1) {
printf("finished with all the examples, there are %ld growing nodes\n",
vfdt->numGrowing);
}
}
static void _ProcessExample(VFDTPtr vfdt, ExamplePtr e) {
int foundNode = 0;
DecisionTreePtr currentNode;
currentNode = vfdt->dtree;
while(!foundNode && currentNode != 0) {
if(DecisionTreeIsLeaf(currentNode)) {
/* We are done growing in this direction, this example isn't needed */
foundNode = 1;
} else if(DecisionTreeIsNodeGrowing(currentNode)) {
foundNode = 1;
}
if(!foundNode) {
currentNode = DecisionTreeOneStepClassify(currentNode, e);
}
}
if(!DecisionTreeIsLeaf(currentNode)) { /* if leaf ignore the example */
_AddExampleToGrowingNode(vfdt, currentNode, e, 1);
} else {
_AddExampleToDeactivatedNode(vfdt, currentNode, e);
if(vfdt->messageLevel >= 3) {
printf("not learning from:");
ExampleWrite(e, stdout);
}
}
if(vfdt->examplesSeen % vfdt->reactivateScanPeriod == 0) {
_DoReactivateScan(vfdt);
}
if(vfdt->maxAllocationBytes < MGetTotalAllocation() &&
vfdt->cacheExamples) {
_FlushExampleCachesCheckStop(vfdt);
}
}
void _ProcessExamples(VFDTPtr vfdt, FILE *input) {
ExamplePtr e;
e = ExampleRead(input, vfdt->spec);
while(e != 0 && vfdt->numGrowing > 0) {
_ProcessExample(vfdt, e);
vfdt->examplesSeen++;
if((vfdt->messageLevel >= 1) && (vfdt->examplesSeen % 1000 == 0)) {
printf("processed %ld examples\n", vfdt->examplesSeen);
}
e = ExampleRead(input, vfdt->spec);
}
if(vfdt->messageLevel >= 1) {
printf("finished with all the examples, there are %ld growing nodes\n",
vfdt->numGrowing);
}
}
//void VFDTBootstrapC45(VFDTPtr vfdt, char *fileStem, int overprune, int runC45) {
// char command[500], fileName[100];
// FILE *tree;
// if(runC45) {
// sprintf(command, "c4.5 -f %s -u >> /dev/null", fileStem);
// if(vfdt->messageLevel >= 1) {
// printf("%s\n", command);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -