📄 c4.5gt.c
字号:
/*************************************************************************//* *//* Definitions used in C4.5 *//* ------------------------ *//* *//*************************************************************************/#include <stdio.h>#include <math.h>#define Eof EOF /*char read on end of file*/#define Nil 0 /*null pointer*/#define false 0 #define true 1 #define None -1#define Epsilon 1E-3//long random();#define Random ((rand()&2147483647) / 2147483648.0)#define Max(a,b) ((a)>(b) ? a : b) #define Min(a,b) ((a)<(b) ? a : b) #define Round(x) ((int) (x+0.5))#define Log2 0.69314718055994530942#define Log(x) ((x) <= 0 ? 0.0 : log((float)x) / Log2)#define Bit(b) (1 << (b))#define In(b,s) ((s[(b) >> 3]) & Bit((b) & 07))#define ClearBits(n,s) memset(s,0,n)#define CopyBits(n,f,t) memcpy(t,f,n)#define SetBit(b,s) (s[(b) >> 3] |= Bit((b) & 07))#define ForEach(v,f,l) for(v=f ; v<=l ; ++v) #define Verbosity(d) if(VERBOSITY >= d)#define Check(v,l,h)\ if ( v<l||v>h ) {printf("\t** illegal value **\n"); exit(1);}/*************************************************************************//* *//* Type definitions for C4.5 *//* ------------------------- *//* *//*************************************************************************/typedef char Boolean, *String, *Set;typedef int ItemNo; /* data item number */typedef float ItemCount; /* count of (partial) items */typedef short ClassNo, /* class number, 0..MaxClass */ DiscrValue; /* discrete attribute value (0 = ?) */typedef short Attribute; /* attribute number, 0..MaxAtt */typedef union _attribute_value { DiscrValue _discr_val; float _cont_val; } AttValue, *Description;#define CVal(Case,Attribute) Case[Attribute]._cont_val#define DVal(Case,Attribute) Case[Attribute]._discr_val#define Class(Case) Case[MaxAtt+1]._discr_val#define Unknown -999 /* unknown value for continuous attribute */#define BrDiscr 1 /* node types: branch */#define ThreshContin 2 /* threshold cut */#define BrSubset 3 /* subset test */typedef struct _tree_record *Tree;typedef struct _tree_record { short NodeType; /* 0=leaf 1=branch 2=cut 3=subset */ ClassNo Leaf; /* most frequent class at this node */ ItemCount Items, /* no of items at this node */ *ClassDist, /* class distribution of items */ Errors; /* no of errors at this node */ Attribute Tested; /* attribute referenced in test */ short Forks; /* number of branches at this node */ float Cut, /* threshold for continuous attribute */ Lower, /* lower limit of soft threshold */ Upper; /* upper limit ditto */ Set *Subset; /* subsets of discrete values */ Tree *Branch; /* Branch[x] = (sub)tree for outcome x */ } TreeRec;#define IGNORE 1 /* special attribute status: do not use */#define DISCRETE 2 /* ditto: collect values as data read */typedef short RuleNo; /* rule number */typedef struct TestRec *Test;struct TestRec { short NodeType; /* test type (see tree nodes) */ Attribute Tested; /* attribute tested */ short Forks; /* possible branches */ float Cut; /* threshold (if relevant) */ Set *Subset; /* subset (if relevant) */ };typedef struct CondRec *Condition;struct CondRec { Test CondTest; /* test part of condition */ short TestValue; /* specified outcome of test */ };typedef struct ProdRuleRec PR;struct ProdRuleRec { short Size; /* number of conditions */ Condition *Lhs; /* conditions themselves */ ClassNo Rhs; /* class given by rule */ float Error, /* estimated error rate */ Bits; /* bits to encode rule */ ItemNo Used, /* times rule used */ Incorrect; /* times rule incorrect */ };typedef struct RuleSetRec RuleSet;struct RuleSetRec { PR *SRule; /* rules */ RuleNo SNRules, /* number of rules */ *SRuleIndex; /* ranking of rules */ ClassNo SDefaultClass; /* default class for this ruleset */ };/*************************************************************************//* *//* Main routine, c4.5 *//* ------------------ *//* *//*************************************************************************/short MaxAtt, MaxClass, MaxDiscrVal = 2;ItemNo MaxItem;Description *Item;DiscrValue *MaxAttVal;char *SpecialStatus;String *ClassName, *AttName, **AttValName, FileName = "DF";short VERBOSITY = 0, TRIALS = 10;Boolean GAINRATIO = true, SUBSET = false, BATCH = true, UNSEENS = false, PROBTHRESH = false;ItemNo MINOBJS = 2, WINDOW = 0, INCREMENT = 0;float CF = 0.25;Tree *Pruned;Boolean AllKnown = true;int optind = 1;
char *optarg;getopt(Argc, Argv, Str)
/* ------ */
int Argc;
char **Argv, *Str;
{
int Optchar;
char *Option;
if ( optind >= Argc ) return EOF;
Option = Argv[optind++];
if ( *Option++ != '-' ) return '?';
Optchar = *Option++;
while ( *Str && *Str != Optchar ) Str++;
if ( ! *Str ) return '?';
if ( *++Str == ':' )
{
if ( *Option ) optarg = Option;
else
if ( optind < Argc ) optarg = Argv[optind++];
else
Optchar = '?';
}
return Optchar;
}
main(Argc, Argv)/* ---- */ int Argc; char *Argv[];{ int o; extern char *optarg; Boolean FirstTime=true; short Best, BestTree(); PrintHeader("decision tree generator");
scanf("%s",Argv[2]); /* Process options */ while ( (o = getopt(Argc, Argv, "f:bupv:t:w:i:gsm:c:")) != EOF ) { if ( FirstTime ) { printf("\n Options:\n"); FirstTime = false; } switch (o) { case 'f': FileName = optarg; printf("\tFile stem <%s>\n", FileName); break; case 'b': BATCH = true; printf("\tWindowing disabled (now the default)\n"); break; case 'u': UNSEENS = true; printf("\tTrees evaluated on unseen cases\n"); break; case 'p': PROBTHRESH = true; printf("\tProbability thresholds used\n"); break; case 'v': VERBOSITY = atoi(optarg); printf("\tVerbosity level %d\n", VERBOSITY); break; case 't': TRIALS = atoi(optarg); printf("\tWindowing enabled with %d trials\n", TRIALS); Check(TRIALS, 1, 10000); BATCH = false; break; case 'w': WINDOW = atoi(optarg); printf("\tInitial window size of %d items\n", WINDOW); Check(WINDOW, 1, 1000000); BATCH = false; break; case 'i': INCREMENT = atoi(optarg); printf("\tMaximum window increment of %d items\n", INCREMENT); Check(INCREMENT, 1, 1000000); BATCH = false; break; case 'g': GAINRATIO = false; printf("\tGain criterion used\n"); break; case 's': SUBSET = true; printf("\tTests on discrete attribute groups\n"); break; case 'm': MINOBJS = atoi(optarg); printf("\tSensible test requires 2 branches with >=%d cases\n", MINOBJS); Check(MINOBJS, 1, 1000000); break; case 'c': CF = atof(optarg); printf("\tPruning confidence level %g%%\n", CF); Check(CF, Epsilon, 100); CF /= 100; break; case '?': printf("unrecognised option\n"); exit(1); } } /* Initialise */ GetNames(); GetData(".data"); printf("\nRead %d cases (%d attributes) from %s.data\n", MaxItem+1, MaxAtt+1, FileName); /* Build decision trees */ if ( BATCH ) { TRIALS = 1; OneTree(); Best = 0; } else { Best = BestTree(); } /* Soften thresholds in best tree */ if ( PROBTHRESH ) { printf("Softening thresholds"); if ( ! BATCH ) printf(" for best tree from trial %d", Best); printf("\n"); SoftenThresh(Pruned[Best]); printf("\n"); PrintTree(Pruned[Best]); } /* Save best tree */ if ( BATCH || TRIALS == 1 ) { printf("\nTree saved\n"); } else { printf("\nBest tree from trial %d saved\n", Best); } SaveTree(Pruned[Best], ".tree"); /* Evaluation */ printf("\n\nEvaluation on training data (%d items):\n", MaxItem+1); Evaluate(false, Best); if ( UNSEENS ) { GetData(".test"); printf("\nEvaluation on test data (%d items):\n", MaxItem+1); Evaluate(true, Best); } exit(0);}/*************************************************************************//* *//* Routines to manage tree growth, pruning and evaluation *//* ------------------------------------------------------ *//* *//*************************************************************************/ItemNo *TargetClassFreq;Tree *Raw;extern Tree *Pruned;/*************************************************************************//* *//* Grow and prune a single tree from all data *//* *//*************************************************************************/ OneTree()/* --------- */{ Tree FormTree(), CopyTree(); Boolean Prune(); InitialiseTreeData(); InitialiseWeights(); Raw = (Tree *) calloc(1, sizeof(Tree)); Pruned = (Tree *) calloc(1, sizeof(Tree)); AllKnown = true; Raw[0] = FormTree(0, MaxItem); printf("\n"); PrintTree(Raw[0]); SaveTree(Raw[0], ".unpruned"); Pruned[0] = CopyTree(Raw[0]); if ( Prune(Pruned[0]) ) { printf("\nSimplified "); PrintTree(Pruned[0]); }}/*************************************************************************//* *//* Grow and prune TRIALS trees and select the best of them *//* *//*************************************************************************/short BestTree()/* -------- */{ Tree CopyTree(), Iterate(); Boolean Prune(); short t, Best=0; InitialiseTreeData(); TargetClassFreq = (ItemNo *) calloc(MaxClass+1, sizeof(ItemNo)); Raw = (Tree *) calloc(TRIALS, sizeof(Tree)); Pruned = (Tree *) calloc(TRIALS, sizeof(Tree)); /* If necessary, set initial size of window to 20% (or twice the sqrt, if this is larger) of the number of data items, and the maximum number of items that can be added to the window at each iteration to 20% of the initial window size */ if ( ! WINDOW ) { WINDOW = Max(2 * sqrt(MaxItem+1.0), (MaxItem+1) / 5); } if ( ! INCREMENT ) { INCREMENT = Max(WINDOW / 5, 1); } FormTarget(WINDOW); /* Form set of trees by iteration and prune */ ForEach(t, 0, TRIALS-1 ) { FormInitialWindow(); printf("\n--------\nTrial %d\n--------\n\n", t); Raw[t] = Iterate(WINDOW, INCREMENT); printf("\n"); PrintTree(Raw[t]); SaveTree(Raw[t], ".unpruned"); Pruned[t] = CopyTree(Raw[t]); if ( Prune(Pruned[t]) ) { printf("\nSimplified "); PrintTree(Pruned[t]); } if ( Pruned[t]->Errors < Pruned[Best]->Errors ) { Best = t; } } printf("\n--------\n"); return Best;}/*************************************************************************//* *//* The windowing approach seems to work best when the class *//* distribution of the initial window is as close to uniform as *//* possible. FormTarget generates this initial target distribution, *//* setting up a TargetClassFreq value for each class. *//* *//*************************************************************************/ FormTarget(Size)/* ----------- */ ItemNo Size;{ ItemNo i, *ClassFreq; ClassNo c, Smallest, ClassesLeft=0; ClassFreq = (ItemNo *) calloc(MaxClass+1, sizeof(ItemNo)); /* Generate the class frequency distribution */ ForEach(i, 0, MaxItem) { ClassFreq[ Class(Item[i]) ]++; } /* Calculate the no. of classes of which there are items */ ForEach(c, 0, MaxClass) { if ( ClassFreq[c] ) { ClassesLeft++; } else { TargetClassFreq[c] = 0; } } while ( ClassesLeft ) { /* Find least common class of which there are some items */ Smallest = -1; ForEach(c, 0, MaxClass) { if ( ClassFreq[c] && ( Smallest < 0 || ClassFreq[c] < ClassFreq[Smallest] ) ) { Smallest = c; } } /* Allocate the no. of items of this class to use in the window */ TargetClassFreq[Smallest] = Min(ClassFreq[Smallest], Round(Size/ClassesLeft)); ClassFreq[Smallest] = 0; Size -= TargetClassFreq[Smallest]; ClassesLeft--; } free(ClassFreq);}/*************************************************************************//* *//* Form initial window, attempting to obtain the target class profile *//* in TargetClassFreq. This is done by placing the targeted number *//* of items of each class at the beginning of the set of data items. *//* *//*************************************************************************/ FormInitialWindow()/* ------------------- */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -