⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfdt.c

📁 此算法是数据挖掘中的聚类算法
💻 C
📖 第 1 页 / 共 2 页
字号:
#include "vfml.h"
#include "vfdt-engine.h"
#include <stdio.h>
#include <string.h>

#include <sys/times.h>
#include <time.h>

char *gFileStem = "DF";
char *gSourceDirectory = ".";
int   gDoTests = 0;
int   gMessageLevel = 0;

float gSplitConfidence = 0.000001;
float gTieConfidence   = 0.05;
int   gUseGini         = 0;
int   gRescans         = 1;
int   gChunk           = 300;
int   gGrowMegs        = 1000;

int   gStdin           = 0;
int   gUseSchedule     = 0;
long  gScheduleCount   = 10000;
float gScheduleMult    = 1.44;

int   gIncrementalReporting = 0;

int   gOutputTree    = 0;
int   gDoBonferonni  = 0;

int   gCacheTestSet    = 1;

int   gCacheTrainingExamples = 1;
int   gRestartScanPeriod     = 147377 * 2;
int   gRestartLeaves         = 1;

int   gDoBatch               = 0;
//int   gPrePrune              = 1;
float gPrePruneTau           = 0.00;
int   gInitialPause          = 0;

int   gLaplace               = 5;

int   gREPrune         = 0;
int   gCachePruneSet   = 0;
VoidAListPtr gPruneSet;
float        gPruneSetPercent  = .05;
int          gPruneCacheInited = 0;
long         gPruneSetMaxSize  = 1000000;
long         gPruneSetCurrentSize = 0;


static void _printUsage(char  *name) {
   printf("%s : 'Very Fast Decision Tree' induction\n", name);

   printf("-f <filestem>\t Set the name of the dataset (default DF)\n");
   printf("-source <dir>\t Set the source data directory (default '.')\n");
   printf("-u\t\t Test the learner's accuracy on data in <stem>.test\n");

   printf("-sc <prob> \t Allowed chance of error in each decision (default\n\t\t 0.0000001 that's .00001 percent)\n");
   printf("-tc <tie error>\t Call a tie when hoeffding e < this. (default\n\t\t 0.05)\n");
   printf("-gini\t\t Use gini gain instead of information gain (default\n\t\t information gain) (DOESN'T WORK FOR CONTINUOUS ATTRIBUTES)\n");
   printf("-rescans <#>\t Naievely consider each example '#' times with no\n\t\t concern for using it once per level of the induced tree\n\t\t (default 1)\n");
   printf("-chunk <count> \t Wait until 'count' examples accumulate at a\n\t\t leaf before testing for a split (default 300)\n");
   printf("-growMegs <count>\t Limit dynamic memory allocation to 'count'\n\t\t megabytes (default 1000)\n");
   printf("-stdin \t\t Reads training examples from stdin instead of from\n\t\t <stem>.data (default off) - NOTE this disables the\n\t\t rescans switch\n");
   printf("-schedule <mult> Run tests after 10k examples & every mult * 10k.\n");
   printf("-outputTree \t Output the binary tree at each test point.\n");
   printf("-noCacheTestSet \t By default vfdt reads test set from disk once\n\t\t and keeps in memory.  Use this option to conserve some memory.\n");
   printf("-prune \t Do reduced error pruning on the resulting tree\n");
   printf("-noRestartLeaves \t Do not attempt to restart leaves that were\n\t\t deactivated for memory reasons.\n");
   printf("-noCacheTrainingExamples \t Do not use extra RAM to cache training\n\t\t examples.\n");
   printf("-cachePruneExamples \t Use extra RAM to pruning examples\n\t\tonly makes sense if you use -prune (default store prune\n\t\t data in <stem>.prunedata)\n");
   printf("-batch \t Run in batch mode, read all examples into RAM, don't\n\t\t do hoeffding bounds (default off).\n");
   printf("-laplace <count> \t When starting a new node use the laplace\n\t\t correction with the parent's class and <count>\n\t\t examples (defult 5)\n");
   printf("-prePruneTau <ppTau> \t Will not call tie while delta from null\n\t\t to best attrib <ppTau>, preprune if\n\t\t epsilon < <ppTau> makes sense that this be <= tc\n\t\t (default 0, no preprune)\n");
   printf("-initialPause \t Pause five seconds before reading the names\n\t\t file, sometimes needed if the file is being generated\n\t\t and then data is being piped to VFDT\n");
   printf("-incrementalReporting \t As each example arrives test the learned\n\t\t model with it and then learn on it (default off).\n");
   printf("-v\t\t Can be used multiple times to increase the debugging output\n");
}


static void _processArgs(int argc, char *argv[]) {
   int i;

   /* HERE on the ones that use the next arg make sure it is there */
   for(i = 1 ; i < argc ; i++) {
      if(!strcmp(argv[i], "-f")) {
         gFileStem = argv[i+1];
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-source")) {
         gSourceDirectory = argv[i+1];
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-u")) {
         gDoTests = 1;
      } else if(!strcmp(argv[i], "-v")) {
         gMessageLevel++;
      } else if(!strcmp(argv[i], "-h")) {
         _printUsage(argv[0]);
         exit(0);
      } else if(!strcmp(argv[i], "-sc")) {
         sscanf(argv[i+1], "%f", &gSplitConfidence);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-tc")) {
         sscanf(argv[i+1], "%f", &gTieConfidence);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-gini")) {
         gUseGini = 1;
      } else if(!strcmp(argv[i], "-rescans")) {
         sscanf(argv[i+1], "%d", &gRescans);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-chunk")) {
         sscanf(argv[i+1], "%d", &gChunk);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-laplace")) {
         sscanf(argv[i+1], "%d", &gLaplace);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-growMegs")) {
         sscanf(argv[i+1], "%d", &gGrowMegs);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-stdin")) {
         gStdin = 1;
      } else if(!strcmp(argv[i], "-schedule")) {
         gUseSchedule = 1;
         sscanf(argv[i+1], "%f", &gScheduleMult);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-prePruneTau")) {
         sscanf(argv[i+1], "%f", &gPrePruneTau);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-outputTree")) {
         gOutputTree = 1;
      } else if(!strcmp(argv[i], "-initialPause")) {
         gInitialPause = 1;
      } else if(!strcmp(argv[i], "-bonferonni")) {
         gDoBonferonni = 1;
      } else if(!strcmp(argv[i], "-noCacheTestSet")) {
         gCacheTestSet = 0;
      } else if(!strcmp(argv[i], "-cachePruneExamples")) {
         gCachePruneSet = 1;
      } else if(!strcmp(argv[i], "-prune")) {
         gREPrune = 1;
      } else if(!strcmp(argv[i], "-noRestartLeaves")) {
         gRestartLeaves = 0;
      } else if(!strcmp(argv[i], "-noCacheTrainingExamples")) {
         gCacheTrainingExamples = 0;
      } else if(!strcmp(argv[i], "-batch")) {
	 gDoBatch = 1;
      } else if(!strcmp(argv[i], "-incrementalReporting")) {
         gIncrementalReporting = 1;
      //} else if(!strcmp(argv[i], "-noPrePrune")) {
      //   gPrePrune = 0;
      } else {
         printf("Unknown argument: %s.  use -h for help\n", argv[i]);
         exit(0);
      }
   }

   if(gMessageLevel >= 1) {
      printf("Stem: %s\n", gFileStem);
      printf("Source: %s\n", gSourceDirectory);

      printf("Split Confidence: %g\n", gSplitConfidence);
      printf("Tie Confidence: %g\n", gTieConfidence);

      if(gUseGini) {
         printf("Using Gini split index - remember this won't work for continuous attributes.\n");
      } else {
         printf("Using information gain split index\n");
      }

      if(gStdin) {
         printf("Reading examples from standard in.\n");
      } else {
         printf("Considering each example %d times.\n", gRescans);
      }
      printf("Checking for splits for every %d examples at a leaf\n", gChunk);

      if(gREPrune) {
         printf("Doing reduced error pruning on resulting tree.\n");
      }

      if(gDoTests) {
         printf("Running tests\n");
      }
   }
}

long _incrementalErrors = 0;
long _incrementalTests  = 0;
static void _doIncrementalTest(VFDTPtr vfdt, ExampleSpecPtr es, ExamplePtr e) {
   DecisionTreePtr dt;

   dt = VFDTGetLearnedTree(vfdt);
   if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
      _incrementalErrors++;
   }
   _incrementalTests++;

   DecisionTreeFree(dt);
}

static void _doIncrementalReport(void) {
   printf("\tTested %ld examples, made %ld mistakes that's %.4lf%%\n", 
        _incrementalTests, _incrementalErrors, 
        (float)_incrementalErrors / (float)_incrementalTests);
}

VoidAListPtr _testSet;
int          _testCacheInited = 0;
static void _doTests(ExampleSpecPtr es, DecisionTreePtr dt, long numBoundsUsed, long growingNodes, long learnCount, long learnTime, long allocation, int finalOutput) {
   int oldPool = MGetActivePool();
   char fileNames[255];
   FILE *exampleIn, *treeOut;
   ExamplePtr e;
   long i;
   long tested, errors;

   struct tms starttime;
   struct tms endtime;

   errors = tested = 0;

   /* don't track this allocation against other VFDT stuff */
   MSetActivePool(0);

   if(gREPrune) {
      if(gCachePruneSet) {
         if(!gPruneCacheInited) {
            gPruneSet = VALNew();

            sprintf(fileNames, "%s/%s.prunedata", gSourceDirectory, gFileStem);
            exampleIn = fopen(fileNames, "r");
            DebugError(exampleIn == 0, "Unable to open the .prunedata file");

            e = ExampleRead(exampleIn, es);
            while(e != 0) {
               VALAppend(gPruneSet, e);
               e = ExampleRead(exampleIn, es);
            }
            fclose(exampleIn);
            gPruneCacheInited = 1;
         }
      }

      times(&starttime);

      DebugMessage(1, 1, "Starting post pruning...\n");
      if(gCachePruneSet) {
         REPruneBatch(dt, gPruneSet);
      } else {
         sprintf(fileNames, "%s/%s.prunedata", gSourceDirectory, gFileStem);
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the .prunedata file");
         REPruneBatchFile(dt, exampleIn, 0);
         fclose(exampleIn);
      }
      DebugMessage(1, 1, "Done post pruning...\n");

      times(&endtime);
      learnTime += endtime.tms_utime - starttime.tms_utime;
      if(gMessageLevel >= 1) {
         printf("Prune Time %.2lf\n",
              ((double)endtime.tms_utime - starttime.tms_utime)/100);
      }
   }


   if(gCacheTestSet) {
      if(!_testCacheInited) {
         _testSet = VALNew();

         sprintf(fileNames, "%s/%s.test", gSourceDirectory, gFileStem);
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the .test file");

         e = ExampleRead(exampleIn, es);
         while(e != 0) {
            VALAppend(_testSet, e);
            e = ExampleRead(exampleIn, es);
         }
         fclose(exampleIn);
         _testCacheInited = 1;
      }

      for(i = 0 ; i < VALLength(_testSet) ; i++) {
         e = VALIndex(_testSet, i);
         if(!ExampleIsClassUnknown(e)) {
            tested++;
            if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
               errors++;
            }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -