📄 cvfdt.c
字号:
} // If we didn't replace the tree, switch and reset counts if ( currentNode->altSubtreeInfos != 0 ) { if (oldInfo->exampleCount == gAltTestNum) { oldInfo->exampleCount = 0; newInfo->exampleCount = 0; oldInfo->numTested = 0; newInfo->numTested = 0; oldInfo->numErrors = 0; newInfo->numErrors = 0; } oldInfo->status = (oldInfo->status + 1) % 2; } } } /* add the information */ ExampleGroupStatsAddExample(egs, ce->example); if ( currentNode->nodeId > ce->oldestNode ) ce->oldestNode = currentNode->nodeId; currentNode->examplesSeen++; currentNode->hasChanged = 1; gd->seenSinceLastProcess++; // Do depth first search to check for questionable nodes if (( root == currentNode ) && (count % gCheckSize == 0)) { _CVFDTCheckForQuestionableNodes( root, root ); } if(gd->seenSinceLastProcess >= root->vfdt->processChunkSize) { gd->seenSinceLastProcess = 0; if ( !DecisionTreeIsNodeGrowing( currentNode->dtreeNode )) return; if(!ExampleGroupStatsIsPure(egs)) { _CheckSplitEntropy(root, currentNode ); } }}static void _RemoveExampleFromNode(CVFDTPtr root, CVFDTPtr currentNode, CVFDTExamplePtr ce) { VFDTGrowingDataPtr gd = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode->dtreeNode); ExampleGroupStatsPtr egs = gd->egs; // Check for alternate trees if (currentNode->altSubtreeInfos != 0) { //CVFDTAltInfoPtr oldInfo = VALIndex(currentNode->altSubtreeInfos, 0); CVFDTAltInfoPtr newInfo = VALIndex(currentNode->altSubtreeInfos, 1); // HERE Geoff took this out, I think it means that you will forget // different examples than you learned on depending on the test period // instead now we test on 10% of examples but we also use them for // training //if ( oldInfo->status == 1 ) // return; CVFDTForgetExample(newInfo->subtree, ce); } /* remove the information */ currentNode->hasChanged = 1; if ( currentNode->examplesSeen > 0 ) { ExampleGroupStatsRemoveExample(egs, ce->example); currentNode->examplesSeen--; }}void CVFDTForgetExample( CVFDTPtr cvfdt, CVFDTExamplePtr ce ) { int foundNode = 0; CVFDTPtr currentNode = cvfdt; CVFDTPtr parentNode = 0; int lastRemoved; while(!foundNode && currentNode != 0) { if ( ce->oldestNode < currentNode->nodeId ) return; lastRemoved = 0; if ( currentNode->examplesSeen == 1 ) lastRemoved = 1; _RemoveExampleFromNode(cvfdt, currentNode, ce ); if(DecisionTreeIsLeaf(currentNode->dtreeNode )) foundNode = 1; else if(DecisionTreeIsNodeGrowing(currentNode->dtreeNode )) foundNode = 1; if(!foundNode) { parentNode = currentNode; currentNode = CVFDTOneStepClassify(currentNode, ce->example ); } }}void _CVFDTProcessExamples(CVFDTPtr cvfdt, FILE *input) { ExamplePtr e = ExampleRead(input, cvfdt->vfdt->spec); CVFDTExamplePtr ce = _CVFDTExampleNew( e ); //int replaceHere = 0; int count = 0; long allocation; DecisionTreePtr dt; //int cacheCount = 0; char* fileName; long learnTime = 0; int numQExamples = 0; long cacheTime = 0; long seen = 0; struct tms starttime; struct tms endtime; times(&starttime); gCache = VALNew(); while(e != 0 ) { int i = 0; cvfdt->vfdt->examplesSeen++; seen++; ce->inQNode = 0; if(gIncrementalReporting) { times(&endtime); learnTime += endtime.tms_utime - starttime.tms_utime; _doIncrementalTest(cvfdt->vfdt, cvfdt->vfdt->spec, e); if(seen % 1000 == 0) { _doIncrementalReport(); } times(&starttime); } _ProcessExample(cvfdt, ce, count); if ( ce->inQNode == 1 ) numQExamples++; times(&endtime); learnTime += endtime.tms_utime - starttime.tms_utime; times(&starttime); VALInsert( gCache, ce, gCurCachePos ); gCurCachePos++; count++; if (( gCurCachePos == gCacheSize ) || ( count % gWindowSize == 0 )) { CVFDTExamplePtr curCe; FILE* curFile; fileName = VALIndex( gCacheFiles, gCurCacheIndex ); curFile = fopen( fileName, "w" ); fprintf( curFile, "%d ", VALLength( gCache )); for ( i = 0; i < VALLength( gCache ); i++ ) { curCe = (CVFDTExamplePtr)VALIndex( gCache, i ); _CVFDTExampleWrite( curFile, curCe ); _CVFDTExampleFree( curCe ); } VALFree( gCache ); gCache = VALNew(); fclose( curFile ); if ( count < gWindowSize ) gLastFilledCacheIndex = gCurCacheIndex; if ( ++gCurCacheIndex == VALLength( gCacheFiles )) gCurCacheIndex = 0; gCurCachePos = 0; } times(&endtime); cacheTime += endtime.tms_utime - starttime.tms_utime; times(&starttime); /* if(count % 10000 == 0){ printf("processed %ld examples\n", count); } */ if ( count >= gWindowSize ) { if ( gCurCachePos == 0 ) { CVFDTExamplePtr curCe; FILE* curFile; int numExamples; int returnVal; times(&endtime); learnTime += endtime.tms_utime - starttime.tms_utime; times(&starttime); fileName = VALIndex( gCacheFiles, gCurCacheIndex ); curFile = fopen( fileName, "r" ); returnVal = fscanf( curFile, "%d ", &numExamples ); for ( i = 0; i < numExamples; i++ ) { curCe = _CVFDTExampleRead( curFile, cvfdt->vfdt->spec ); VALAppend( gCache, curCe ); } fclose( curFile ); times(&endtime); cacheTime += endtime.tms_utime - starttime.tms_utime; times(&starttime); } ce = VALIndex( gCache, gCurCachePos ); CVFDTForgetExample( cvfdt, ce ); VALRemove( gCache, gCurCachePos ); _CVFDTExampleFree( ce ); } e = ExampleRead(input, cvfdt->vfdt->spec); ce = _CVFDTExampleNew( e ); if (gUseSchedule && count == gScheduleCount) { gScheduleCount *= gScheduleMult; times(&endtime); learnTime += endtime.tms_utime - starttime.tms_utime; allocation = MGetTotalAllocation(); dt = CVFDTGetLearnedTree( cvfdt ); DebugMessage(1, 2, "doing tests...\n"); _doTests( cvfdt->vfdt->spec, dt, cvfdt->vfdt->numGrowing, count, learnTime, cacheTime, allocation, 0, numQExamples, gNumSwitches, gNumPrunes ); numQExamples = 0; gNumSwitches = 0; gNumPrunes = 0; gNumNewAlts = 0; DecisionTreeFree(dt); times(&starttime); } } times(&endtime); learnTime += endtime.tms_utime - starttime.tms_utime; if(gIncrementalReporting) { _doIncrementalReport(); } if(gMessageLevel >= 1) { printf("finished with all the examples, there are %ld growing nodes\n", cvfdt->vfdt->numGrowing); printf("time %.2lfs\n", ((double)learnTime) / 100); } }void CVFDTProcessExamples( CVFDTPtr cvfdt, FILE *input ) { _CVFDTProcessExamples(cvfdt, input);}static void _CVFDTPrintSpaces(FILE *out, int num) { int i; for(i = 0 ; i < num ; i++) { fprintf(out, " "); }}static void _CVFDTPrintHelp( CVFDTPtr cvfdt, FILE *out, int indent ) { int i; _CVFDTPrintSpaces(out, indent); if(cvfdt->dtreeNode->nodeType == dtnLeaf) { fprintf(out, "%ld (%d): (leaf: %s)\n", cvfdt->uid, cvfdt->examplesSeen, ExampleSpecGetClassValueName(cvfdt->dtreeNode->spec, cvfdt->dtreeNode->myclass)); } else if(cvfdt->dtreeNode->nodeType == dtnDiscrete) { fprintf(out, "%ld (%d): (split on %s:\n", cvfdt->uid, cvfdt->examplesSeen, ExampleSpecGetAttributeName(cvfdt->dtreeNode->spec, cvfdt->dtreeNode->splitAttribute) ); for(i = 0 ; i < VALLength(cvfdt->children) ; i++) { _CVFDTPrintSpaces(out, indent + 1); fprintf(out, "Value %s\n", ExampleSpecGetAttributeValueName(cvfdt->dtreeNode->spec, cvfdt->dtreeNode->splitAttribute, i)); _CVFDTPrintHelp(VALIndex(cvfdt->children, i), out, indent + 2); } _CVFDTPrintSpaces(out, indent); fprintf(out, ")\n"); if (cvfdt->altSubtreeInfos != 0) { CVFDTAltInfoPtr altTreeInfo; _CVFDTPrintSpaces(out, indent + 2); fprintf(out, "***************\n"); altTreeInfo = VALIndex(cvfdt->altSubtreeInfos, 1); _CVFDTPrintHelp(altTreeInfo->subtree, out, indent + 2); _CVFDTPrintSpaces(out, indent + 2); fprintf(out, "***************\n"); } } else if(cvfdt->dtreeNode->nodeType == dtnContinuous) { fprintf(out, "%ld: (split on %s threshold %f: \n", cvfdt->uid, ExampleSpecGetAttributeName(cvfdt->dtreeNode->spec, cvfdt->dtreeNode->splitAttribute), cvfdt->dtreeNode->splitThreshold); _CVFDTPrintSpaces(out, indent + 1); /* left child */ fprintf(out, "< %f\n", cvfdt->dtreeNode->splitThreshold); _CVFDTPrintHelp(VALIndex(cvfdt->children, 0), out, indent + 2); /* right child */ fprintf(out, ">= %f\n", cvfdt->dtreeNode->splitThreshold); _CVFDTPrintHelp(VALIndex(cvfdt->children, 1), out, indent + 2); _CVFDTPrintSpaces(out, indent); fprintf(out, ")\n"); } else if ( cvfdt->dtreeNode->nodeType == dtnGrowing ) fprintf( out, "%ld (%d): (growing)\n", cvfdt->uid, cvfdt->examplesSeen );}void CVFDTPrint( CVFDTPtr cvfdt, FILE *out) { _CVFDTPrintHelp(cvfdt, out, 0);}CVFDTPtr CVFDTNew( VFDTPtr vfdt, DecisionTreePtr dt, long uid ) { CVFDTPtr cvfdt = MNewPtr(sizeof(CVFDT)); cvfdt->vfdt = vfdt; cvfdt->dtreeNode = dt; cvfdt->children = 0; cvfdt->examplesSeen = 0; cvfdt->uid = uid; cvfdt->nodeId = gNodeId++; cvfdt->hasChanged = 0; cvfdt->altSubtreeInfos = 0; return cvfdt;}CVFDTPtr CVFDTCreateRoot(ExampleSpecPtr spec, float splitConfidence, float tieConfidence) { CVFDTPtr cvfdt = MNewPtr(sizeof(CVFDT)); cvfdt->vfdt = VFDTNew( spec, splitConfidence, tieConfidence ); cvfdt->dtreeNode = cvfdt->vfdt->dtree; cvfdt->children = 0; cvfdt->examplesSeen = 0; cvfdt->uid = 1; cvfdt->nodeId = gNodeId++; cvfdt->hasChanged = 0; cvfdt->altSubtreeInfos = 0; return cvfdt;}void CVFDTAddChild(CVFDTPtr parent, DecisionTreePtr newChild ){ CVFDTPtr child = MNewPtr(sizeof(CVFDT)); child->vfdt = parent->vfdt; child->dtreeNode = newChild; child->children = 0; child->hasChanged = 0; child->examplesSeen = 0; child->nodeId = gNodeId++; if ( parent->children == 0 ) parent->children = VALNew(); VALAppend( parent->children, child ); child->uid = parent->uid * 10 + VALLength( parent->children ); child->altSubtreeInfos = 0;}CVFDTPtr CVFDTOneStepClassify(CVFDTPtr cvfdt, ExamplePtr e) { if(cvfdt->dtreeNode->nodeType == dtnLeaf || cvfdt->dtreeNode->nodeType == dtnGrowing) { return cvfdt; } if(cvfdt->dtreeNode->nodeType == dtnDiscrete) { if(ExampleIsAttributeUnknown(e, cvfdt->dtreeNode->splitAttribute)) { /* HERE do something smarter */ return VALIndex(cvfdt->children, 0); } else { return VALIndex(cvfdt->children, ExampleGetDiscreteAttributeValue(e, cvfdt->dtreeNode->splitAttribute)); } } else if(cvfdt->dtreeNode->nodeType == dtnContinuous) { if(ExampleIsAttributeUnknown(e, cvfdt->dtreeNode->splitAttribute)) { /* HERE do something smarter */ return VALIndex(cvfdt->children, 0); } else if(ExampleGetContinuousAttributeValue(e, cvfdt->dtreeNode->splitAttribute) >= cvfdt->dtreeNode->splitThreshold) { return VALIndex(cvfdt->children, 1); } else { /* the value is < the threshold */ return VALIndex(cvfdt->children, 0); } } return cvfdt;}DecisionTreePtr CVFDTGetLearnedTree(CVFDTPtr cvfdt) { int i; VoidAListPtr list = VALNew(); ExampleGroupStatsPtr egs; DecisionTreePtr growNode; DecisionTreePtr finalTree = DecisionTreeClone(cvfdt->dtreeNode); DecisionTreeGatherGrowingNodes(finalTree, list); for(i = VALLength(list) - 1 ; i >= 0 ; i--) { growNode = VALIndex(list, i); DecisionTreeSetTypeLeaf(growNode); egs = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(growNode))->egs; DecisionTreeSetClass(growNode, ExampleGroupStatsGetMostCommonClass(egs)); } VALFree(list); return finalTree;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -