📄 cvfdt.c
字号:
/*
 * cvfdt.c -- command-line driver for the 'Concept-adapting Very Fast
 * Decision Tree' learner.
 *
 * Parses the command line into the g* globals below, builds a CVFDT root,
 * streams the training examples through CVFDTProcessExamples() and, when
 * requested with -u, reports the error rate of the learned tree on
 * <stem>.test.
 */
#include "vfml.h"
#include "vfdt-engine.h"
#include "cvfdt.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include <sys/times.h>
#include <time.h>

#define lg(x)       (log(x) / log(2))

/* ---- Settings filled in by _processArgs() ------------------------------ */
char *gFileStem          = "DF";         /* -f      dataset stem             */
char *gSourceDirectory   = ".";          /* -source data directory           */
int   gDoTests           = 0;            /* -u      test on <stem>.test      */
int   gMessageLevel      = 0;            /* -v      debugging verbosity      */
int   gStdin             = 0;            /* -stdin  read examples from stdin */
float gSplitConfidence   = (float)0.01;  /* -sc                              */
float gTieConfidence     = (float)0.05;  /* -tc                              */
int   gUseGini           = 0;            /* -gini                            */
int   gRescans           = 1;            /* -rescans                         */
int   gChunk             = 300;          /* -chunk                           */
int   gGrowMegs          = 1000;         /* -growMegs                        */
int   gWindowSize        = 50000;        /* -window                          */
int   gCacheSize         = 10000;        /* -cache                           */

/*
 * The cache cursors below are not referenced in this file; presumably they
 * are maintained by the CVFDT engine -- TODO confirm.
 */
int   gCurCacheIndex        = 0;
int   gLastFilledCacheIndex = -1;
int   gCurCachePos          = 0;
int   gCacheTestSet         = 1;

int   gUseSchedule          = 1;
int   gScheduleCount        = 10000;
float gScheduleMult         = 1.44;
int   gIncrementalReporting = 0;         /* -incrementalReporting            */

VoidAListPtr  gCacheFiles;               /* tmp file names, see _initWindowFiles() */
FILE         *gTestOut = 0;              /* results file opened by main()    */
unsigned long gNodeId  = 0;
VoidAListPtr  gCache;

int   gAltTestNum   = 10000;             /* -altTest                         */
int   gNumSwitches  = 0;                 /* reported by _doTests()           */
int   gNumPrunes    = 0;                 /* reported by _doTests()           */
int   gNumNewAlts   = 0;                 /* reported by _doTests()           */
int   gSplitLevels[100];
float gCumulSplitStrength;
int   gCheckSize    = 10000;             /* -checkSize                       */

/* Print the command-line help text to stdout; 'name' is the program name. */
static void _printUsage(char *name) {
   printf("%s : 'Concept-adapting Very Fast Decision Tree' induction\n", name);
   printf("-f <filestem>\tSet the name of the dataset (default DF)\n");
   printf("-source <dir>\tSet the source data directory (default '.')\n");
   printf("-u\t\tTest the learner's accuracy on data in <stem>.test\n");
   printf("-sc <allowed chance of error in each decision> "
          "(default 0.01 that's 1 percent)\n");
   printf("-stdin \t\t Reads training examples from stdin instead of from\n"
          "\t\t <stem>.data (default off) - NOTE this disables the rescans switch\n");
   printf("-tc <tie error> call a tie when hoeffding e < this. (default 0.05)\n");
   printf("-rescans <count> Naievely consider each example 'count' times\n"
          "\t\t with no concern for using it once per level of the induced\n"
          "\t\t tree (default 1)\n");
   printf("-chunk <count> wait until 'count' examples accumulate at a leaf\n"
          "\t\t before testing for a split (default 300)\n");
   printf("-growMegs <count> limit dynamic memory allocation to 'count'\n"
          "\t\t megabytes (default 1000)\n");
   printf("-window <count> number of examples used for context switching "
          "(default 50000)\n");
   printf("-cache <count> number of examples from the window to keep in\n"
          "\t\t memory (default 10000)\n");
   printf("-schedule <#> Run tests every # examples (default 10000).\n");
   printf("-altTest <#> Interval for building and testing alternate\n"
          "\t\t trees (default 10000).\n");
   printf("-incrementalReporting \t As each example arrives test the\n"
          "\t\t learned model with it and then learn on it (default off).\n");
   printf("-v\t\tCan be used multiple times to increase the debugging output\n");
   printf("-checkSize <count> wait until 'count' examples accumulate\n"
          "\t\t before checking for questionable nodes(default 10000)\n");
}

/*
 * Return the value that follows the option at argv[*i] and advance *i past
 * it.  Exits with status 1 when the option is the last word on the command
 * line, instead of reading argv[argc] (a null pointer).
 */
static char *_takeValue(int argc, char *argv[], int *i) {
   if(*i + 1 >= argc) {
      fprintf(stderr, "Missing value for argument: %s. use -h for help\n",
              argv[*i]);
      exit(1);
   }

   (*i)++;
   return argv[*i];
}

/* Parse the value of the option at argv[*i] as an int into *dest. */
static void _takeInt(int argc, char *argv[], int *i, int *dest) {
   char *flag = argv[*i];
   char *text = _takeValue(argc, argv, i);

   if(sscanf(text, "%d", dest) != 1) {
      fprintf(stderr, "Invalid value '%s' for argument: %s. use -h for help\n",
              text, flag);
      exit(1);
   }
}

/* Parse the value of the option at argv[*i] as a float into *dest. */
static void _takeFloat(int argc, char *argv[], int *i, float *dest) {
   char *flag = argv[*i];
   char *text = _takeValue(argc, argv, i);

   if(sscanf(text, "%f", dest) != 1) {
      fprintf(stderr, "Invalid value '%s' for argument: %s. use -h for help\n",
              text, flag);
      exit(1);
   }
}

/*
 * Translate the command line into the g* settings.  -h prints the usage and
 * exits; an unknown option prints a hint and exits.  When the message level
 * is at least 1 the resulting configuration is echoed to stdout.
 */
static void _processArgs(int argc, char *argv[]) {
   int i;

   for(i = 1 ; i < argc ; i++) {
      char *opt = argv[i];

      if(!strcmp(opt, "-f")) {
         gFileStem = _takeValue(argc, argv, &i);
      } else if(!strcmp(opt, "-source")) {
         gSourceDirectory = _takeValue(argc, argv, &i);
      } else if(!strcmp(opt, "-u")) {
         gDoTests = 1;
      } else if(!strcmp(opt, "-v")) {
         gMessageLevel++;
         DebugSetMessageLevel(gMessageLevel);
      } else if(!strcmp(opt, "-h")) {
         _printUsage(argv[0]);
         exit(0);
      } else if(!strcmp(opt, "-stdin")) {
         gStdin = 1;
      } else if(!strcmp(opt, "-sc")) {
         _takeFloat(argc, argv, &i, &gSplitConfidence);
      } else if(!strcmp(opt, "-tc")) {
         _takeFloat(argc, argv, &i, &gTieConfidence);
      } else if(!strcmp(opt, "-gini")) {
         gUseGini = 1;
      } else if(!strcmp(opt, "-rescans")) {
         _takeInt(argc, argv, &i, &gRescans);
      } else if(!strcmp(opt, "-chunk")) {
         _takeInt(argc, argv, &i, &gChunk);
      } else if(!strcmp(opt, "-checkSize")) {
         _takeInt(argc, argv, &i, &gCheckSize);
      } else if(!strcmp(opt, "-growMegs")) {
         _takeInt(argc, argv, &i, &gGrowMegs);
      } else if(!strcmp(opt, "-window")) {
         _takeInt(argc, argv, &i, &gWindowSize);
      } else if(!strcmp(opt, "-cache")) {
         _takeInt(argc, argv, &i, &gCacheSize);
      } else if(!strcmp(opt, "-altTest")) {
         _takeInt(argc, argv, &i, &gAltTestNum);
      } else if(!strcmp(opt, "-incrementalReporting")) {
         gIncrementalReporting = 1;
      } else if(!strcmp(opt, "-schedule")) {
         /*
          * NOTE(review): the usage text describes this value as "run tests
          * every # examples (default 10000)", which matches gScheduleCount,
          * yet the value is stored in gScheduleMult.  Behaviour is kept
          * as-is; confirm against the engine before changing it.
          */
         gUseSchedule = 1;
         _takeFloat(argc, argv, &i, &gScheduleMult);
      } else {
         printf("Unknown argument: %s. use -h for help\n", opt);
         exit(0);
      }
   }

   if(gMessageLevel >= 1) {
      printf("Stem: %s\n", gFileStem);
      printf("Source: %s\n", gSourceDirectory);
      printf("Split Confidence: %f\n", gSplitConfidence);
      printf("Tie Confidence: %f\n", gTieConfidence);

      if(gUseGini) {
         printf("Using Gini split index\n");
      } else {
         printf("Using information gain split index\n");
      }

      printf("Considering each example %d times.\n", gRescans);
      printf("Checking for splits for every %d examples at a leaf\n", gChunk);

      if(gDoTests) {
         printf("Running tests\n");
      }

      printf("Window size: %d\n", gWindowSize );
      printf("Message level: %d\n", gMessageLevel );
   }
}

/*
 * Write "<gSourceDirectory>/<gFileStem>.<extension>" into buffer (at most
 * size bytes, always NUL-terminated) and report a name that did not fit.
 */
static void _makeDataPath(char *buffer, size_t size, const char *extension) {
   int written = snprintf(buffer, size, "%s/%s.%s",
                          gSourceDirectory, gFileStem, extension);

   DebugError(written < 0 || (size_t)written >= size,
              "File name too long for the path buffer");
}

/*
 * Initialize the test cache: append every example read from <stem>.test to
 * testSet.  The examples are not freed here; they stay in the list.
 */
void _initCache( ExampleSpecPtr es, VoidAListPtr testSet ) {
   char fileNames[255];
   FILE *exampleIn;
   ExamplePtr e;

   _makeDataPath(fileNames, sizeof fileNames, "test");
   exampleIn = fopen(fileNames, "r");
   DebugError(exampleIn == 0, "Unable to open the .test file");
   if(exampleIn == 0) {
      /* Whether DebugError() aborts is not visible here; never read from
         a null stream. */
      return;
   }

   for(e = ExampleRead(exampleIn, es) ; e != 0 ;
       e = ExampleRead(exampleIn, es)) {
      VALAppend(testSet, e);
   }

   fclose(exampleIn);
}

/* Running totals maintained by _doIncrementalTest(). */
long _incrementalErrors = 0;
long _incrementalTests = 0;

/*
 * Classify e with the tree currently learned by vfdt and update the
 * incremental counters.  The tree obtained from VFDTGetLearnedTree() is
 * freed again before returning.
 */
static void _doIncrementalTest(VFDTPtr vfdt, ExampleSpecPtr es, ExamplePtr e) {
   DecisionTreePtr dt = VFDTGetLearnedTree(vfdt);

   if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
      _incrementalErrors++;
   }
   _incrementalTests++;

   DecisionTreeFree(dt);
}

/*
 * Print the incremental counters.  The last field is a percentage, so the
 * error ratio is scaled by 100; with no tests recorded it is reported as 0
 * rather than dividing by zero.
 */
static void _doIncrementalReport(void) {
   double errorPercent = 0.0;

   if(_incrementalTests > 0) {
      errorPercent = 100.0 * (double)_incrementalErrors
                           / (double)_incrementalTests;
   }

   printf("\tTested %ld examples, made %ld mistakes that's %.4lf%%\n",
          _incrementalTests, _incrementalErrors, errorPercent);
}

/* Test-set cache state; only _initCache() can fill it and nothing in this
   file currently does. */
VoidAListPtr _testSet;
int _testCacheInited = 0;

/*
 * Calculate the accuracy of dt against <stem>.test and report it.
 *
 * Every example with a known class is classified; the error rate is printed
 * as a percentage together with the node count of the tree.
 *
 *   finalOutput != 0 : print "<error>\t<nodes>" to stdout and to gTestOut,
 *                      then close gTestOut.
 *   finalOutput == 0 : print a ">> ..." progress row (learn count, error,
 *                      nodes, growing nodes, times, memory in megabytes and
 *                      the gNumSwitches / gNumPrunes / gNumNewAlts counters)
 *                      to stdout and to gTestOut.
 *
 * numQExamples, numSwitches and numPrunes are accepted for the callers'
 * sake but are not used; the global counters are reported instead.
 * The active memory pool is switched to 0 for the duration of the call and
 * restored afterwards.
 */
static void _doTests(ExampleSpecPtr es, DecisionTreePtr dt, long growingNodes,
                     long learnCount, long learnTime, long cacheTime,
                     long allocation, int finalOutput, int numQExamples,
                     int numSwitches, int numPrunes ) {
   int oldPool = MGetActivePool();
   char fileNames[255];
   FILE *exampleIn;
   ExamplePtr e;
   long tested = 0;
   long errors = 0;
   long nodeCount;
   float errorPercent;

   (void)numQExamples;
   (void)numSwitches;
   (void)numPrunes;

   /* don't track this allocation against other VFDT stuff */
   MSetActivePool(0);

   _makeDataPath(fileNames, sizeof fileNames, "test");
   exampleIn = fopen(fileNames, "r");
   DebugError(exampleIn == 0, "Unable to open the .test file");
   if(exampleIn == 0) {
      /* Nothing can be tested; leave the pool as we found it. */
      MSetActivePool(oldPool);
      return;
   }

   /* compare the class of each test example to the class predicted by the
      current tree */
   for(e = ExampleRead(exampleIn, es) ; e != 0 ;
       e = ExampleRead(exampleIn, es)) {
      if(!ExampleIsClassUnknown(e)) {
         tested++;
         if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
            errors++;
         }
      }
      ExampleFree(e);
   }
   fclose(exampleIn);

   errorPercent = ((float)errors / (float)tested) * 100;
   nodeCount = (long)DecisionTreeCountNodes(dt);

   if(finalOutput) {
      if(gMessageLevel >= 1) {
         printf("Tested %ld examples made %ld errors\n", tested, errors);
      }

      printf("%.4f\t%ld\n", errorPercent, nodeCount);
      fflush(stdout);

      fprintf(gTestOut, "%.4f\t%ld\n", errorPercent, nodeCount);
      fclose(gTestOut);
      gTestOut = 0;   /* the stream must not be used after fclose() */
   } else {
      /*
       * The split-level statistics formerly computed here were never part
       * of the printed row, and their loop indexed gSplitLevels by
       * gNumSwitches without regard to the array's 100 entries, so they
       * have been dropped.
       */
      double learnSeconds = ((double)learnTime) / 100;
      double cacheSeconds = ((double)cacheTime) / 100;
      double megabytes = ((double)allocation / (float)(1024 * 1024));

      printf("learned from, error, nodes, growing, learn time, cache time, "
             "memory, switches, prunes, new alts\n");
      printf(">> %ld\t%.4f\t%ld\t%ld\t%.2lf\t%.2lf\t%.2f\t%d\t%d\t%d\n",
             learnCount, errorPercent, nodeCount, growingNodes,
             learnSeconds, cacheSeconds, megabytes,
             gNumSwitches, gNumPrunes, gNumNewAlts);

      fprintf(gTestOut, ">> %ld\t%.4f\t%ld\t%ld\t%.2lf\t%.2lf\t%.2f\t%d\t%d\t%d\n",
              learnCount, errorPercent, nodeCount, growingNodes,
              learnSeconds, cacheSeconds, megabytes,
              gNumSwitches, gNumPrunes, gNumNewAlts);
      fflush( gTestOut );
   }

   fflush(stdout);
   MSetActivePool(oldPool);
}

/*
 * Initialize the names of the cache files and put them in gCacheFiles.
 *
 * ceil(gWindowSize / gCacheSize) files named tmp<i>.dat are created (or
 * truncated) in the current directory; each name is allocated with
 * MNewPtr() and owned by the list.
 */
void _initWindowFiles(void) {
   /* "tmp" + any int + ".dat" + NUL needs at most 19 bytes. */
   enum { kNameCapacity = 32 };
   int numFiles;
   int i;

   if(gCacheSize <= 0) {
      /* The file count below divides by the cache size. */
      fprintf(stderr, "The cache size must be positive (got %d)\n", gCacheSize);
      exit(1);
   }

   gCacheFiles = VALNew();
   numFiles = ceil( (float)gWindowSize / (float)gCacheSize );

   for(i = 0 ; i < numFiles ; i++) {
      char *fileName = MNewPtr(sizeof(char) * kNameCapacity);
      FILE *curFile;

      snprintf( fileName, kNameCapacity, "tmp%d.dat", i );
      VALAppend( gCacheFiles, fileName );

      curFile = fopen( fileName, "w" );
      DebugError( curFile == 0, "Unable to create tmp file" );
      if(curFile != 0) {
         fclose( curFile );
      }
   }
}

/*
 * Program entry point: configure, learn, optionally test.
 * Returns 0 on success and 1 when a required file cannot be opened.
 */
int main(int argc, char *argv[]) {
   char fileNames[255];
   FILE *exampleIn;
   ExampleSpecPtr es;
   CVFDTPtr cvfdt;
   DecisionTreePtr dt;
   int iteration;

   _processArgs(argc, argv);

   gTestOut = fopen( "outc.txt", "w");
   DebugError( gTestOut == 0, "Unable to open test results file" );
   if(gTestOut == 0) {
      /* Whether DebugError() aborts is not visible here; stop explicitly
         rather than write to a null stream later. */
      return 1;
   }

   _makeDataPath(fileNames, sizeof fileNames, "names");
   es = ExampleSpecRead(fileNames);
   DebugError(es == 0, "Unable to open the .names file (main)");
   if(es == 0) {
      return 1;
   }

   /* initialize cvfdt */
   cvfdt = CVFDTCreateRoot(es, gSplitConfidence, gTieConfidence);
   VFDTSetUseGini(cvfdt->vfdt, gUseGini);
   VFDTSetProcessChunkSize(cvfdt->vfdt, gChunk);
   VFDTSetMaxAllocationMegs(cvfdt->vfdt, gGrowMegs);

   if(gMessageLevel >= 1) {
      printf("allocation %ld\n", MGetTotalAllocation());
   }

   _initWindowFiles();

   /*
    * NOTE(review): the usage text says -stdin disables the rescans switch,
    * but this loop still runs gRescans times on stdin; kept as in the
    * original.
    */
   _makeDataPath(fileNames, sizeof fileNames, "data");
   for(iteration = 0 ; iteration < gRescans ; iteration++) {
      if ( gStdin ) {
         exampleIn = stdin;
      } else {
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the data file (main)");
         if(exampleIn == 0) {
            return 1;
         }
      }

      CVFDTProcessExamples(cvfdt, exampleIn);

      if ( !gStdin ) {
         fclose(exampleIn);
      }
   }

   if(gMessageLevel >= 1) {
      printf("done learning...\n");
      printf(" allocation %ld\n", MGetTotalAllocation());
   }

   if(gDoTests) {
      dt = CVFDTGetLearnedTree( cvfdt );
      _doTests( es, dt, cvfdt->vfdt->numGrowing, 0, 0, 0,
                MGetTotalAllocation(), 1, 0, 0, 0 );
      DecisionTreeFree( dt );
   }

   if(gMessageLevel >= 1) {
      printf("allocation %ld\n", MGetTotalAllocation());
   }

   /* _doTests() closes the results file on final output; close it here
      when no test was run. */
   if(gTestOut != 0) {
      fclose(gTestOut);
      gTestOut = 0;
   }

   return 0;
}

/*
 * Turn a growing node into a leaf labelled with the most common class of
 * its example statistics and decrement the count of growing nodes.
 * The node's growing data and statistics are not freed here.
 */
static void _DoMakeLeaf(VFDTPtr vfdt, DecisionTreePtr currentNode) {
   VFDTGrowingDataPtr growingData =
      (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode);
   ExampleGroupStatsPtr egs = growingData->egs;
   int mostCommonClass = ExampleGroupStatsGetMostCommonClass(egs);

   vfdt->numGrowing--;
   DecisionTreeSetClass(currentNode, mostCommonClass);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -