⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vfdt.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 2 页
字号:
#include "vfml.h"
#include "vfdt-engine.h"
#include <stdio.h>
#include <string.h>

#include <sys/times.h>
#include <time.h>

char *gFileStem = "DF";
char *gSourceDirectory = ".";
int   gDoTests = 0;
int   gMessageLevel = 0;

float gSplitConfidence = 0.000001;
float gTieConfidence   = 0.05;
int   gUseGini         = 0;
int   gRescans         = 1;
int   gChunk           = 300;
int   gGrowMegs        = 1000;

int   gStdin           = 0;
int   gUseSchedule     = 0;
long  gScheduleCount   = 10000;
float gScheduleMult    = 1.44;

int   gOutputTree    = 0;

int   gCacheTestSet    = 1;
int   gREPrune         = 0;
int   gCachePruneSet   = 0;

int   gCacheTrainingExamples = 1;
int   gRestartScanPeriod     = 147377 * 2;
int   gRestartLeaves         = 1;

int   gDoBatch               = 0;
int   gPrePrune              = 1;
int   gAdaptiveDelta         = 0;

VoidAListPtr gPruneSet;
float        gPruneSetPercent  = .1;
int          gPruneCacheInited = 0;


static void _printUsage(char  *name) {
   printf("%s : 'Very Fast Decision Tree' induction\n", name);

   printf("-f <filestem>\t Set the name of the dataset (default DF)\n");
   printf("-source <dir>\t Set the source data directory (default '.')\n");
   printf("-u\t\t Test the learner's accuracy on data in <stem>.test\n");

   printf("-sc <prob> \t Allowed chance of error in each decision (default 0.0000001 that's .00001 percent)\n");
   printf("-tc <tie error>\t Call a tie when hoeffding e < this. (default 0.05)\n");
   printf("-gini\t\t Use gini gain instead of information gain (default information gain) (DOESN'T WORK FOR CONTINUOUS ATTRIBUTES)\n");
   printf("-rescans <#>\t Naievely consider each example '#' times with no concern for using it once per level of the induced tree (default 1)\n");
   printf("-chunk <count> \t Wait until 'count' examples accumulate at a leaf before testing for a split (default 300)\n");
   printf("-growMegs <count>\t Limit dynamic memory allocation to 'count' megabytes (default 1000)\n");
   printf("-stdin \t\t Reads training examples from stdin instead of from <stem>.data (default off) - NOTE this disables the rescans switch\n");
   printf("-schedule <mult> Run tests after 10k examples & every mult * 10k.\n");
   printf("-outputTree \t Output the binary tree at each test point.\n");
   printf("-noCacheTestSet \t By default vfdt reads test set from disk once and keeps in memory.  Use this option to conserve some memory.\n");
   printf("-prune \t Do reduced error pruning on the resulting tree\n");
   printf("-noRestartLeaves \t Do not attempt to restart leaves that were deactivated for memory reasons.\n");
   printf("-noCacheTrainingExamples \t Do not use extra RAM to cache training examples.\n");
   printf("-batch \t Run in batch mode, read all examples into ram, don't do hoeffding bounds (default off).\n");
   printf("-noPrePrune \t Make splits that have information-loss (doesn't effect batch mode)\n");
   printf("-adaptiveDelta \t Double 'sc' each level down the induced tree until it reaches 5%\n");
   printf("-v\t\t Can be used multiple times to increase the debugging output\n");
}


static void _processArgs(int argc, char *argv[]) {
   int i;

   /* HERE on the ones that use the next arg make sure it is there */
   for(i = 1 ; i < argc ; i++) {
      if(!strcmp(argv[i], "-f")) {
         gFileStem = argv[i+1];
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-source")) {
         gSourceDirectory = argv[i+1];
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-u")) {
         gDoTests = 1;
      } else if(!strcmp(argv[i], "-v")) {
         gMessageLevel++;
      } else if(!strcmp(argv[i], "-h")) {
         _printUsage(argv[0]);
         exit(0);
      } else if(!strcmp(argv[i], "-sc")) {
         sscanf(argv[i+1], "%f", &gSplitConfidence);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-tc")) {
         sscanf(argv[i+1], "%f", &gTieConfidence);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-gini")) {
         gUseGini = 1;
      } else if(!strcmp(argv[i], "-rescans")) {
         sscanf(argv[i+1], "%d", &gRescans);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-chunk")) {
         sscanf(argv[i+1], "%d", &gChunk);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-growMegs")) {
         sscanf(argv[i+1], "%d", &gGrowMegs);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-stdin")) {
         gStdin = 1;
      } else if(!strcmp(argv[i], "-schedule")) {
         gUseSchedule = 1;
         sscanf(argv[i+1], "%f", &gScheduleMult);
         /* ignore the next argument */
         i++;
      } else if(!strcmp(argv[i], "-outputTree")) {
         gOutputTree = 1;
      } else if(!strcmp(argv[i], "-noCacheTestSet")) {
         gCacheTestSet = 0;
      } else if(!strcmp(argv[i], "-prune")) {
         gREPrune = 1;
      } else if(!strcmp(argv[i], "-noRestartLeaves")) {
         gRestartLeaves = 0;
      } else if(!strcmp(argv[i], "-noCacheTrainingExamples")) {
         gCacheTrainingExamples = 0;
      } else if(!strcmp(argv[i], "-batch")) {
	 gDoBatch = 1;
      } else if(!strcmp(argv[i], "-noPrePrune")) {
         gPrePrune = 0;
      } else if(!strcmp(argv[i], "-adaptiveDelta")) {
         gAdaptiveDelta = 1;
      } else {
         printf("Unknown argument: %s.  use -h for help\n", argv[i]);
         exit(0);
      }
   }

   if(gMessageLevel >= 1) {
      printf("Stem: %s\n", gFileStem);
      printf("Source: %s\n", gSourceDirectory);

      printf("Split Confidence: %f\n", gSplitConfidence);
      printf("Tie Confidence: %f\n", gTieConfidence);

      if(gUseGini) {
         printf("Using Gini split index - remember this won't work for continuous attributes.\n");
      } else {
         printf("Using information gain split index\n");
      }

      if(gStdin) {
         printf("Reading examples from standard in.\n");
      } else {
         printf("Considering each example %d times.\n", gRescans);
      }
      printf("Checking for splits for every %d examples at a leaf\n", gChunk);

      if(gREPrune) {
         printf("Doing reduced error pruning on resulting tree.\n");
      }

      if(gDoTests) {
         printf("Running tests\n");
      }
   }
}

VoidAListPtr _testSet;
int          _testCacheInited = 0;
static void _doTests(ExampleSpecPtr es, DecisionTreePtr dt, long growingNodes, long learnCount, long learnTime, long allocation, int finalOutput) {
   int oldPool = MGetActivePool();
   char fileNames[255];
   FILE *exampleIn, *treeOut;
   ExamplePtr e;
   long i;
   long tested, errors;

   struct tms starttime;
   struct tms endtime;

   errors = tested = 0;

   /* don't track this allocation against other VFDT stuff */
   MSetActivePool(0);

   if(gREPrune) {
      if(gCachePruneSet) {
         if(!gPruneCacheInited) {
            gPruneSet = VALNew();

            sprintf(fileNames, "%s/%s.prune", gSourceDirectory, gFileStem);
            exampleIn = fopen(fileNames, "r");
            DebugError(exampleIn == 0, "Unable to open the .prune file");

            e = ExampleRead(exampleIn, es);
            while(e != 0) {
               VALAppend(gPruneSet, e);
               e = ExampleRead(exampleIn, es);
            }
            fclose(exampleIn);
            gPruneCacheInited = 1;
         }
      } else {
         sprintf(fileNames, "%s/%s.prune", gSourceDirectory, gFileStem);
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the .prune file");
      }

      times(&starttime);

      if(gCachePruneSet) {
         REPruneBatch(dt, gPruneSet);
      } else {
         REPruneBatchFile(dt, exampleIn, 0);
         fclose(exampleIn);
      }

      times(&endtime);
      learnTime += endtime.tms_utime - starttime.tms_utime;
      if(gMessageLevel >= 1) {
         printf("Prune Time %.2lf\n",
              ((double)endtime.tms_utime - starttime.tms_utime)/100);
      }
   }


   if(gCacheTestSet) {
      if(!_testCacheInited) {
         _testSet = VALNew();

         sprintf(fileNames, "%s/%s.test", gSourceDirectory, gFileStem);
         exampleIn = fopen(fileNames, "r");
         DebugError(exampleIn == 0, "Unable to open the .test file");

         e = ExampleRead(exampleIn, es);
         while(e != 0) {
            VALAppend(_testSet, e);
            e = ExampleRead(exampleIn, es);
         }
         fclose(exampleIn);
         _testCacheInited = 1;
      }

      for(i = 0 ; i < VALLength(_testSet) ; i++) {
         e = VALIndex(_testSet, i);
         if(!ExampleIsClassUnknown(e)) {
            tested++;
            if(ExampleGetClass(e) != DecisionTreeClassify(dt, e)) {
               errors++;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -