⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cvfdt.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
/*
 * cvfdt.c (first part) -- command-line driver for the 'Concept-adapting
 * Very Fast Decision Tree' learner: option parsing, test-set evaluation,
 * creation of the window cache files, and main().
 */
#include "vfml.h"
#include "vfdt-engine.h"
#include "cvfdt.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/times.h>
#include <time.h>

#define lg(x)       (log(x) / log(2))

/* ---- run-time options (set by _processArgs) and shared state ---- */
char *gFileStem = "DF";
char *gSourceDirectory = ".";
int   gDoTests = 0;
int   gMessageLevel = 0;
int   gStdin           = 0;
float gSplitConfidence = (float)0.01;
float gTieConfidence   = (float)0.05;
int   gUseGini         = 0;
int   gRescans         = 1;
int   gChunk           = 300;
int   gGrowMegs        = 1000;
int   gWindowSize      = 50000;
int   gCacheSize       = 10000;
int   gCurCacheIndex   = 0;
int   gLastFilledCacheIndex = -1;
int   gCurCachePos     = 0;
int   gCacheTestSet    = 1;
int   gUseSchedule     = 1;
int   gScheduleCount   = 10000;
float gScheduleMult    = 1.44;
int   gIncrementalReporting = 0;
VoidAListPtr gCacheFiles;
FILE*   gTestOut  = 0;
unsigned long gNodeId  = 0;
VoidAListPtr gCache;
int gAltTestNum = 10000;
int gNumSwitches = 0;
int gNumPrunes = 0;
int gNumNewAlts = 0;
int gSplitLevels[100];
float gCumulSplitStrength;
int gCheckSize = 10000;

/* Print the command-line help text; 'name' is the program name (argv[0]). */
static void _printUsage(char  *name) {
   printf("%s : 'Concept-adapting Very Fast Decision Tree' induction\n", name);

   printf("-f <filestem>\tSet the name of the dataset (default DF)\n");
   printf("-source <dir>\tSet the source data directory (default '.')\n");
   printf("-u\t\tTest the learner's accuracy on data in <stem>.test\n");
   printf("-sc <allowed chance of error in each decision> (default 0.01 that's 1 percent)\n");
   printf("-stdin \t\t Reads training examples from stdin instead of from\n\t\t <stem>.data (default off) - NOTE this disables the rescans switch\n");
   printf("-tc <tie error> call a tie when hoeffding e < this. (default 0.05)\n");
   printf("-rescans <count> Naievely consider each example 'count' times\n\t\t with no concern for using it once per level of the induced\n\t\t tree (default 1)\n");
   printf("-chunk <count> wait until 'count' examples accumulate at a leaf\n\t\t before testing for a split (default 300)\n");
   printf("-growMegs <count> limit dynamic memory allocation to 'count'\n\t\t megabytes (default 1000)\n");
   printf("-window <count> number of examples used for context switching (default 50000)\n");
   printf("-cache <count> number of examples from the window to keep in\n\t\t memory (default 10000)\n");
   printf("-schedule <#> Run tests every # examples (default 10000).\n");
   printf("-altTest <#> Interval for building and testing alternate\n\t\t trees (default 10000).\n" );
   printf("-incrementalReporting \t As each example arrives test the\n\t\t learned model with it and then learn on it (default off).\n");
   printf("-v\t\tCan be used multiple times to increase the debugging output\n");
   printf("-checkSize <count> wait until 'count' examples accumulate\n\t\t before checking for questionable nodes(default 10000)\n");
}

/*
 * Return the value that follows option argv[*index] and advance *index
 * past it.  Exits with status 1 when the option is the last argument, so
 * callers never read argv[argc] (a null pointer).
 */
static char *_argValue(int argc, char *argv[], int *index) {
   if(*index + 1 >= argc) {
      printf("Missing value for argument: %s.  use -h for help\n",
             argv[*index]);
      exit(1);
   }
   (*index)++;
   return argv[*index];
}

/* Parse the value of option argv[*index] as an int into *target, or exit. */
static void _argInt(int argc, char *argv[], int *index, int *target) {
   char *option = argv[*index];
   char *value  = _argValue(argc, argv, index);

   if(sscanf(value, "%d", target) != 1) {
      printf("Invalid value '%s' for argument: %s.  use -h for help\n",
             value, option);
      exit(1);
   }
}

/* Parse the value of option argv[*index] as a float into *target, or exit. */
static void _argFloat(int argc, char *argv[], int *index, float *target) {
   char *option = argv[*index];
   char *value  = _argValue(argc, argv, index);

   if(sscanf(value, "%f", target) != 1) {
      printf("Invalid value '%s' for argument: %s.  use -h for help\n",
             value, option);
      exit(1);
   }
}

/*
 * Translate the command line into the g* option globals.
 *
 * Unknown options and -h terminate the program with status 0 (as before);
 * a missing or unparsable option value terminates it with status 1.
 * When the message level is at least 1 the effective settings are echoed.
 */
static void _processArgs(int argc, char *argv[]) {
   int i;

   for(i = 1 ; i < argc ; i++) {
      char *arg = argv[i];

      if(!strcmp(arg, "-f")) {
         gFileStem = _argValue(argc, argv, &i);
      } else if(!strcmp(arg, "-source")) {
         gSourceDirectory = _argValue(argc, argv, &i);
      } else if(!strcmp(arg, "-u")) {
         gDoTests = 1;
      } else if(!strcmp(arg, "-v")) {
         gMessageLevel++;
         DebugSetMessageLevel(gMessageLevel);
      } else if(!strcmp(arg, "-h")) {
         _printUsage(argv[0]);
         exit(0);
      } else if(!strcmp(arg, "-stdin")) {
         gStdin = 1;
      } else if(!strcmp(arg, "-sc")) {
         _argFloat(argc, argv, &i, &gSplitConfidence);
      } else if(!strcmp(arg, "-tc")) {
         _argFloat(argc, argv, &i, &gTieConfidence);
      } else if(!strcmp(arg, "-gini")) {
         gUseGini = 1;
      } else if(!strcmp(arg, "-rescans")) {
         _argInt(argc, argv, &i, &gRescans);
      } else if(!strcmp(arg, "-chunk")) {
         _argInt(argc, argv, &i, &gChunk);
      } else if(!strcmp(arg, "-checkSize")) {
         _argInt(argc, argv, &i, &gCheckSize);
      } else if(!strcmp(arg, "-growMegs")) {
         _argInt(argc, argv, &i, &gGrowMegs);
      } else if(!strcmp(arg, "-window")) {
         _argInt(argc, argv, &i, &gWindowSize);
      } else if(!strcmp(arg, "-cache")) {
         _argInt(argc, argv, &i, &gCacheSize);
      } else if(!strcmp(arg, "-altTest")) {
         _argInt(argc, argv, &i, &gAltTestNum);
      } else if(!strcmp(arg, "-incrementalReporting")) {
         gIncrementalReporting = 1;
      } else if(!strcmp(arg, "-schedule")) {
         /* NOTE(review): the usage text describes this value as a test
          * interval (gScheduleCount), but it has always been stored in
          * gScheduleMult as a float.  Kept as-is; confirm the intent. */
         gUseSchedule = 1;
         _argFloat(argc, argv, &i, &gScheduleMult);
      } else {
         printf("Unknown argument: %s.  use -h for help\n", arg);
         exit(0);
      }
   }

   /* gCacheSize is the divisor used when sizing the window files. */
   if(gCacheSize <= 0) {
      printf("-cache must be greater than zero.  use -h for help\n");
      exit(1);
   }

   if(gMessageLevel >= 1) {
      printf("Stem: %s\n", gFileStem);
      printf("Source: %s\n", gSourceDirectory);
      printf("Split Confidence: %f\n", gSplitConfidence);
      printf("Tie Confidence: %f\n", gTieConfidence);

      if(gUseGini) {
         printf("Using Gini split index\n");
      } else {
         printf("Using information gain split index\n");
      }

      printf("Considering each example %d times.\n", gRescans);
      printf("Checking for splits for every %d examples at a leaf\n", gChunk);

      if(gDoTests) {
         printf("Running tests\n");
      }
      printf("Window size: %d\n", gWindowSize );
      printf("Message level: %d\n", gMessageLevel );
   }
}

/*
 * Initialize the test cache: read every example from
 * <source dir>/<stem>.test and append it to 'testSet'.
 * If the file cannot be opened the error is reported through DebugError
 * and 'testSet' is left unchanged.
 */
void _initCache( ExampleSpecPtr es, VoidAListPtr testSet ) {
   FILE *exampleIn;
   char fileNames[255];
   ExamplePtr e;

   /* bounded: the directory and stem come from the command line */
   snprintf(fileNames, sizeof(fileNames), "%s/%s.test",
            gSourceDirectory, gFileStem);
   exampleIn = fopen(fileNames, "r");
   DebugError(exampleIn == 0, "Unable to open the .test file");
   if(exampleIn == 0) {
      return;
   }

   e = ExampleRead(exampleIn, es);
   while(e != 0) {
      VALAppend(testSet, e);
      e = ExampleRead(exampleIn, es);
   }
   fclose(exampleIn);
}

long _incrementalErrors = 0;
long _incrementalTests  = 0;

/*
 * Classify 'e' with the tree currently learned by 'vfdt' and update the
 * incremental counters: _incrementalTests always, _incrementalErrors when
 * the prediction differs from the example's class.  'es' is not used.
 */
static void _doIncrementalTest(VFDTPtr vfdt, ExampleSpecPtr es, ExamplePtr e) {
   DecisionTreePtr dt;

   (void)es;

   dt = VFDTGetLearnedTree(vfdt);
   if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
      _incrementalErrors++;
   }
   _incrementalTests++;
   DecisionTreeFree(dt);
}

/*
 * Print the incremental test counters and the error rate as a percentage.
 * The ratio is scaled by 100 so that it matches the '%' printed after it
 * (and the error figures printed by _doTests).
 */
static void _doIncrementalReport(void) {
   printf("\tTested %ld examples, made %ld mistakes that's %.4lf%%\n",
         _incrementalTests, _incrementalErrors,
         ((float)_incrementalErrors / (float)_incrementalTests) * 100);
}

VoidAListPtr _testSet;
int          _testCacheInited = 0;

/*
 * Calculate the accuracy of tree 'dt' against <source dir>/<stem>.test and
 * report it on stdout and in gTestOut.
 *
 * The test file is re-read on every call; examples whose class is unknown
 * are skipped.  With 'finalOutput' set, a short "error%<TAB>nodes" line is
 * written and gTestOut is closed; otherwise a progress line (prefixed with
 * ">> ") carrying the learning statistics passed in is written.
 * learnTime and cacheTime are divided by 100 before printing, and
 * allocation by 1024*1024.
 *
 * numQExamples, numSwitches and numPrunes are accepted for interface
 * compatibility but not used; the switch/prune/alt counts printed come
 * from the gNum* globals.
 */
static void _doTests(ExampleSpecPtr es, DecisionTreePtr dt, long growingNodes,
                     long learnCount, long learnTime, long cacheTime,
                     long allocation, int finalOutput,
                     int numQExamples, int numSwitches, int numPrunes ) {
   int oldPool = MGetActivePool();
   ExamplePtr e;
   long tested = 0;
   long errors = 0;
   long nodeCount;
   float errorPercent;
   FILE *exampleIn;
   char fileNames[255];

   (void)numQExamples;
   (void)numSwitches;
   (void)numPrunes;

   /* don't track this allocation against other VFDT stuff */
   MSetActivePool(0);

   /* open test file */
   snprintf(fileNames, sizeof(fileNames), "%s/%s.test",
            gSourceDirectory, gFileStem);
   exampleIn = fopen(fileNames, "r");
   DebugError(exampleIn == 0, "Unable to open the .test file");
   if(exampleIn == 0) {
      MSetActivePool(oldPool);
      return;
   }

   /* compare class of each test example to the class predicted by the
    * current tree */
   e = ExampleRead(exampleIn, es);
   while(e != 0) {
      if(!ExampleIsClassUnknown(e)) {
         tested++;
         if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
            errors++;
         }
      }
      ExampleFree(e);
      e = ExampleRead(exampleIn, es);
   }
   fclose(exampleIn);

   errorPercent = ((float)errors/(float)tested) * 100;
   nodeCount    = (long)DecisionTreeCountNodes(dt);

   /* Report accuracy */
   if(finalOutput) {
      if(gMessageLevel >= 1) {
         printf("Tested %ld examples made %ld errors\n", (long)tested, (long)errors);
      }
      printf("%.4f\t%ld\n", errorPercent, nodeCount);
      fflush(stdout);

      fprintf(gTestOut,"%.4f\t%ld\n", errorPercent, nodeCount);
      fclose(gTestOut);
      gTestOut = 0;   /* do not leave a dangling stream pointer behind */
   } else {
      /* The former per-switch "split strength" averages were computed here
       * but never printed; the loop also indexed gSplitLevels by
       * gNumSwitches with no bound, so it has been removed. */
      printf("learned from, error, nodes, growing, learn time, cache time, memory, switches, prunes, new alts\n");
      printf(">> %ld\t%.4f\t%ld\t%ld\t%.2lf\t%.2lf\t%.2f\t%d\t%d\t%d\n",
             learnCount,
             errorPercent,
             nodeCount,
             growingNodes,
             ((double)learnTime) / 100,
             ((double)cacheTime) / 100,
             ((double)allocation / (float)(1024 * 1024)),
             gNumSwitches, gNumPrunes, gNumNewAlts);

      fprintf(gTestOut,">> %ld\t%.4f\t%ld\t%ld\t%.2lf\t%.2lf\t%.2f\t%d\t%d\t%d\n",
              learnCount,
              errorPercent,
              nodeCount,
              growingNodes,
              ((double)learnTime) / 100,
              ((double)cacheTime) / 100,
              ((double)allocation / (float)(1024 * 1024)),
              gNumSwitches, gNumPrunes, gNumNewAlts);
      fflush( gTestOut );
   }
   fflush(stdout);

   MSetActivePool(oldPool);
}

/*
 * Initialize names of cache files and put names in gCacheFiles.
 *
 * ceil(gWindowSize / gCacheSize) files named tmp<i>.dat are created
 * (truncated) in the current directory.
 */
void _initWindowFiles(void) {
   /* "tmp" + any int + ".dat" + NUL fits comfortably in 32 bytes; the old
    * 10-byte buffer overflowed from tmp100.dat onwards. */
   enum { kNameSize = 32 };
   int numFiles;
   int i;

   gCacheFiles = VALNew();
   numFiles = (int)ceil( (float)gWindowSize / (float)gCacheSize );

   for ( i = 0; i < numFiles; i++ ) {
      char *fileName = MNewPtr(sizeof(char) * kNameSize);
      FILE *curFile;

      snprintf( fileName, kNameSize, "tmp%d.dat", i );
      VALAppend( gCacheFiles, fileName );

      curFile = fopen( fileName, "w" );
      DebugError( curFile == 0, "Unable to create tmp file" );
      if ( curFile != 0 ) {
         fclose( curFile );
      }
   }
}

/*
 * Program entry point: parse options, build the CVFDT root from the
 * .names file, feed it the training data gRescans times (from stdin or
 * <stem>.data), and optionally evaluate the result on <stem>.test.
 *
 * Returns 0 on success and 1 when a required file cannot be opened.
 */
int main(int argc, char *argv[]) {
   char fileNames[255];
   FILE *exampleIn;
   ExampleSpecPtr es;
   CVFDTPtr cvfdt;
   DecisionTreePtr dt;
   int iteration;

   _processArgs(argc, argv);

   gTestOut = fopen( "outc.txt", "w");
   DebugError( gTestOut == 0, "Unable to open test results file" );
   if(gTestOut == 0) {
      return 1;
   }

   snprintf(fileNames, sizeof(fileNames), "%s/%s.names",
            gSourceDirectory, gFileStem);
   es = ExampleSpecRead(fileNames);
   DebugError(es == 0, "Unable to open the .names file (main)");
   if(es == 0) {
      fclose(gTestOut);
      gTestOut = 0;
      return 1;
   }

   /* initialize cvfdt */
   cvfdt = CVFDTCreateRoot(es, gSplitConfidence, gTieConfidence);
   VFDTSetUseGini(cvfdt->vfdt, gUseGini);
   VFDTSetProcessChunkSize(cvfdt->vfdt, gChunk);
   VFDTSetMaxAllocationMegs(cvfdt->vfdt, gGrowMegs);

   if(gMessageLevel >= 1) {
      printf("allocation %ld\n", MGetTotalAllocation());
   }

   _initWindowFiles();

   snprintf(fileNames, sizeof(fileNames), "%s/%s.data",
            gSourceDirectory, gFileStem);
   for(iteration = 0 ; iteration < gRescans ; iteration++) {
      if ( gStdin ) {
         exampleIn = stdin;
      } else {
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the data file (main)");
         if(exampleIn == 0) {
            fclose(gTestOut);
            gTestOut = 0;
            return 1;
         }
      }

      CVFDTProcessExamples(cvfdt, exampleIn);

      if ( !gStdin ) {
         fclose(exampleIn);
      }
   }

   if(gMessageLevel >= 1) {
      printf("done learning...\n");
      printf("   allocation %ld\n", MGetTotalAllocation());
   }

   if(gDoTests) {
      dt = CVFDTGetLearnedTree( cvfdt );
      _doTests( es, dt, cvfdt->vfdt->numGrowing, 0, 0, 0,
                MGetTotalAllocation(), 1, 0, 0, 0 );
      DecisionTreeFree( dt );
   }

   if(gMessageLevel >= 1) {
      printf("allocation %ld\n", MGetTotalAllocation());
   }

   /* _doTests closes the results file on its final report; close it here
    * when no final report was produced. */
   if(gTestOut != 0) {
      fclose(gTestOut);
      gTestOut = 0;
   }

   return 0;
}

/*
 * Turn the growing node 'currentNode' into a leaf labelled with the most
 * common class recorded in its example-group statistics, and decrement
 * the count of growing nodes in 'vfdt'.
 *
 * NOTE(review): the statistics and the growing-data record are not
 * released here -- the corresponding frees were disabled in the original
 * source.  Confirm who owns them before re-enabling.
 */
static void _DoMakeLeaf(VFDTPtr vfdt, DecisionTreePtr currentNode) {
   ExampleGroupStatsPtr egs =
      ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode))->egs;
   int mostCommonClass = ExampleGroupStatsGetMostCommonClass(egs);

   vfdt->numGrowing--;
   DecisionTreeSetClass(currentNode, mostCommonClass);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -