📄 c4.5rulesgt.c
字号:
/*************************************************************************//* *//* Definitions used in C4.5 *//* ------------------------ *//* *//*************************************************************************/#include <stdio.h>#include <math.h>#define Eof EOF /*char read on end of file*/#define Nil 0 /*null pointer*/#define false 0 #define true 1 #define None -1#define Epsilon 1E-3//long random();#define Random ((rand()&2147483647) / 2147483648.0)#define Max(a,b) ((a)>(b) ? a : b) #define Min(a,b) ((a)<(b) ? a : b) #define Round(x) ((int) (x+0.5))#define Log2 0.69314718055994530942#define Log(x) ((x) <= 0 ? 0.0 : log((float)x) / Log2)#define Bit(b) (1 << (b))#define In(b,s) ((s[(b) >> 3]) & Bit((b) & 07))#define ClearBits(n,s) memset(s,0,n)#define CopyBits(n,f,t) memcpy(t,f,n)#define SetBit(b,s) (s[(b) >> 3] |= Bit((b) & 07))#define ForEach(v,f,l) for(v=f ; v<=l ; ++v) #define Verbosity(d) if(VERBOSITY >= d)#define Check(v,l,h)\ if ( v<l||v>h ) {printf("\t** illegal value **\n"); exit(1);}/*************************************************************************//* *//* Type definitions for C4.5 *//* ------------------------- *//* *//*************************************************************************/typedef char Boolean, *String, *Set;typedef int ItemNo; /* data item number */typedef float ItemCount; /* count of (partial) items */typedef short ClassNo, /* class number, 0..MaxClass */ DiscrValue; /* discrete attribute value (0 = ?) */typedef short Attribute; /* attribute number, 0..MaxAtt */typedef union _attribute_value { DiscrValue _discr_val; float _cont_val; } AttValue, *Description;#define CVal(Case,Attribute) Case[Attribute]._cont_val#define DVal(Case,Attribute) Case[Attribute]._discr_val#define Class(Case) Case[MaxAtt+1]._discr_val#define Unknown -999 /* unknown value for continuous attribute */#define BrDiscr 1 /* node types: branch */#define ThreshContin 2 /* threshold cut */#define BrSubset 3 /* subset test */typedef struct _tree_record *Tree;typedef struct _tree_record { short NodeType; /* 0=leaf 1=branch 2=cut 3=subset */ ClassNo Leaf; /* most frequent class at this node */ ItemCount Items, /* no of items at this node */ *ClassDist, /* class distribution of items */ Errors; /* no of errors at this node */ Attribute Tested; /* attribute referenced in test */ short Forks; /* number of branches at this node */ float Cut, /* threshold for continuous attribute */ Lower, /* lower limit of soft threshold */ Upper; /* upper limit ditto */ Set *Subset; /* subsets of discrete values */ Tree *Branch; /* Branch[x] = (sub)tree for outcome x */ } TreeRec;#define IGNORE 1 /* special attribute status: do not use */#define DISCRETE 2 /* ditto: collect values as data read */typedef short RuleNo; /* rule number */typedef struct TestRec *Test;struct TestRec { short NodeType; /* test type (see tree nodes) */ Attribute Tested; /* attribute tested */ short Forks; /* possible branches */ float Cut; /* threshold (if relevant) */ Set *Subset; /* subset (if relevant) */ };typedef struct CondRec *Condition;struct CondRec { Test CondTest; /* test part of condition */ short TestValue; /* specified outcome of test */ };typedef struct ProdRuleRec PR;struct ProdRuleRec { short Size; /* number of conditions */ Condition *Lhs; /* conditions themselves */ ClassNo Rhs; /* class given by rule */ float Error, /* estimated error rate */ Bits; /* bits to encode rule */ ItemNo Used, /* times rule used */ Incorrect; /* times rule incorrect */ };typedef struct RuleSetRec RuleSet;struct RuleSetRec { PR *SRule; /* rules */ RuleNo SNRules, /* number of rules */ *SRuleIndex; /* ranking of rules */ ClassNo SDefaultClass; /* default class for this ruleset */ };/*************************************************************************//* *//* Main routine for constructing sets of production rules from trees *//* ----------------------------------------------------------------- *//* *//*************************************************************************/ /* External data. Note: uncommented variables have the same meaning as for decision trees */short MaxAtt, MaxClass, MaxDiscrVal;ItemNo MaxItem;Description *Item;DiscrValue *MaxAttVal;char *SpecialStatus;String *ClassName, *AttName, **AttValName, FileName = "DF";short VERBOSITY = 0, TRIALS;Boolean UNSEENS = false, SIGTEST = false, /* use significance test in rule pruning */ SIMANNEAL = false; /* use simulated annealing */float SIGTHRESH = 0.05, CF = 0.25, REDUNDANCY = 1.0; /* factor that guesstimates the amount of redundancy and irrelevance in the attributes */PR *Rule; /* current rules */RuleNo NRules = 0, /* number of current rules */ *RuleIndex; /* rule index */short RuleSpace = 0; /* space allocated for rules */ClassNo DefaultClass; /* current default class */RuleSet *PRSet; /* sets of rulesets */float AttTestBits, /* bits to encode tested att */ *BranchBits; /* ditto attribute value */int optind = 1;
char *optarg;
getopt(Argc, Argv, Str)
/* ------ */
int Argc;
char **Argv, *Str;
{
int Optchar;
char *Option;
if ( optind >= Argc ) return EOF;
Option = Argv[optind++];
if ( *Option++ != '-' ) return '?';
Optchar = *Option++;
while ( *Str && *Str != Optchar ) Str++;
if ( ! *Str ) return '?';
if ( *++Str == ':' )
{
if ( *Option ) optarg = Option;
else
if ( optind < Argc ) optarg = Argv[optind++];
else
Optchar = '?';
}
return Optchar;
} main(Argc, Argv)/* ---- */ int Argc; char *Argv[];{ int o; extern char *optarg; Boolean FirstTime=true; PrintHeader("rule generator");
printf("%s\n",Argv[2]);
scanf("%s",Argv[2]);
printf("%s\n",Argv[1]);
printf("%s\n",Argv[2]);
printf("%s\n",Argv[3]); /* Process options */ while ( (o = getopt(Argc, Argv, "f:uv:c:r:F:a")) != EOF ) { if ( FirstTime ) { printf("\n Options:\n"); FirstTime = false; } switch (o) { case 'f': FileName = optarg; printf("\tFile stem <%s>\n", FileName); break; case 'u': UNSEENS = true; printf("\tRulesets evaluated on unseen cases\n"); break; case 'v': VERBOSITY = atoi(optarg); printf("\tVerbosity level %d\n", VERBOSITY); break; case 'c': CF = atof(optarg); printf("\tPruning confidence level %g%%\n", CF); Check(CF, 0, 100); CF /= 100; break; case 'r': REDUNDANCY = atof(optarg); printf("\tRedundancy factor %g\n", REDUNDANCY); Check(REDUNDANCY, 0, 10000); break; case 'F': SIGTHRESH = atof(optarg); printf("\tSignificance test in rule pruning, "); printf("threshold %g%%\n", SIGTHRESH); Check(SIGTHRESH, 0, 100); SIGTHRESH /= 100; SIGTEST = true; break; case 'a': SIMANNEAL = true; printf("\tSimulated annealing for selecting rules\n"); break; case '?': printf("unrecognised option\n"); exit(1); } } /* Initialise */ GetNames(); GetData(".data"); printf("\nRead %d cases (%d attributes) from %s\n", MaxItem+1, MaxAtt+1, FileName); GenerateLogs(); /* Construct rules */ GenerateRules(); /* Evaluations */ printf("\n\nEvaluation on training data (%d items):\n", MaxItem+1); EvaluateRulesets(true); /* Save current ruleset */ SaveRules(); if ( UNSEENS ) { GetData(".test"); printf("\nEvaluation on test data (%d items):\n", MaxItem+1); EvaluateRulesets(false); } exit(0);}/*************************************************************************//* *//* Miscellaneous routines for rule handling *//* ---------------------------------------- *//* *//*************************************************************************/extern FILE *TRf; /* rules file */Test *TestVec;short NTests = 0;FILE *fopen();extern char Fn[500]; /* file name *//*************************************************************************//* *//* Save the current ruleset in rules file in order of the index *//* *//*************************************************************************/ SaveRules()/* ---------- */{ short ri, d, v, Bytes; RuleNo r; Test Tst; if ( TRf ) fclose(TRf); strcpy(Fn, FileName); strcat(Fn, ".rules"); if ( ! ( TRf = fopen(Fn, "w") ) ) Error(0, Fn, " for writing"); StreamOut((char *) &NRules, sizeof(RuleNo)); StreamOut((char *) &DefaultClass, sizeof(ClassNo)); ForEach(ri, 1, NRules) { r = RuleIndex[ri]; StreamOut((char *) &Rule[r].Size, sizeof(short)); ForEach(d, 1, Rule[r].Size) { Tst = Rule[r].Lhs[d]->CondTest; StreamOut((char *) &Tst->NodeType, sizeof(short)); StreamOut((char *) &Tst->Tested, sizeof(Attribute)); StreamOut((char *) &Tst->Forks, sizeof(short)); StreamOut((char *) &Tst->Cut, sizeof(float)); if ( Tst->NodeType == BrSubset ) { Bytes = (MaxAttVal[Tst->Tested]>>3) + 1; ForEach(v, 1, Tst->Forks) { StreamOut((char *) Tst->Subset[v], Bytes); } } StreamOut((char *) &Rule[r].Lhs[d]->TestValue, sizeof(short)); } StreamOut((char *) &Rule[r].Rhs, sizeof(ClassNo)); StreamOut((char *) &Rule[r].Error, sizeof(float)); } SaveDiscreteNames();}/*************************************************************************//* *//* Get a new ruleset from rules file *//* *//*************************************************************************/ GetRules()/* --------- */{ RuleNo nr, r; short n, d, v, Bytes; Condition *Cond; Test Tst, FindTest(); ClassNo c; float e; Boolean NewRule(); if ( TRf ) fclose(TRf); strcpy(Fn, FileName); strcat(Fn, ".rules"); if ( ! ( TRf = fopen(Fn, "r") ) ) Error(0, Fn, ""); StreamIn((char *) &nr, sizeof(RuleNo)); StreamIn((char *) &DefaultClass, sizeof(ClassNo)); ForEach(r, 1, nr) { StreamIn((char *) &n, sizeof(short)); Cond = (Condition *) calloc(n+1, sizeof(Condition)); ForEach(d, 1, n) { Tst = (Test) malloc(sizeof(struct TestRec)); StreamIn((char *) &Tst->NodeType, sizeof(short)); StreamIn((char *) &Tst->Tested, sizeof(Attribute)); StreamIn((char *) &Tst->Forks, sizeof(short)); StreamIn((char *) &Tst->Cut, sizeof(float)); if ( Tst->NodeType == BrSubset ) { Tst->Subset = (Set *) calloc(Tst->Forks + 1, sizeof(Set)); Bytes = (MaxAttVal[Tst->Tested]>>3) + 1; ForEach(v, 1, Tst->Forks) { Tst->Subset[v] = (Set) malloc(Bytes); StreamIn((char *) Tst->Subset[v], Bytes); } } Cond[d] = (Condition) malloc(sizeof(struct CondRec)); Cond[d]->CondTest = FindTest(Tst); StreamIn((char *) &Cond[d]->TestValue, sizeof(short)); } StreamIn((char *) &c, sizeof(ClassNo)); StreamIn((char *) &e, sizeof(float)); NewRule(Cond, n, c, e); free(Cond); } RecoverDiscreteNames();}/*************************************************************************//* *//* Find a test in the test vector; if it's not there already, add it *//* *//*************************************************************************/Test FindTest(Newtest)/* --------- */ Test Newtest;{ static short TestSpace=0; short i; Boolean SameTest(); ForEach(i, 1, NTests) { if ( SameTest(Newtest, TestVec[i]) ) { free(Newtest); return TestVec[i]; } } NTests++; if ( NTests >= TestSpace ) { TestSpace += 1000; if ( TestSpace > 1000 ) { TestVec = (Test *) realloc(TestVec, TestSpace * sizeof(Test)); } else { TestVec = (Test *) malloc(TestSpace * sizeof(Test)); } } TestVec[NTests] = Newtest; return TestVec[NTests];}/*************************************************************************//* *//* See if test t1 is the same test as test t2 *//* *//*************************************************************************/Boolean SameTest(t1, t2)/* --------- */ Test t1, t2;{ short i; if ( t1->NodeType != t2->NodeType || t1->Tested != t2->Tested ) { return false; } switch ( t1->NodeType ) { case BrDiscr: return true; case ThreshContin: return t1->Cut == t2->Cut; case BrSubset: ForEach(i, 1, t1->Forks) { if ( t1->Subset[i] != t2->Subset[i] ) { return false; } } } return true;}/*************************************************************************//* *//* Clear for new set of rules *//* *//*************************************************************************/ InitialiseRules()/* ---------------- */{ NRules = 0; Rule = 0; RuleSpace = 0;}/*************************************************************************//* *//* Add a new rule to the current ruleset, by updating Rule[], *//* NRules and, if necessary, RuleSpace *//* *//*************************************************************************/Boolean NewRule(Cond, NConds, TargetClass, Err)/* ------- */ Condition Cond[]; short NConds; ClassNo TargetClass; float Err;{ short d, r; Boolean SameRule(); /* See if rule already exists */ ForEach(r, 1, NRules) { if ( SameRule(r, Cond, NConds, TargetClass) ) { Verbosity(1) printf("\tduplicates rule %d\n", r); /* Keep the most pessimistic error estimate */ if ( Err > Rule[r].Error ) { Rule[r].Error = Err; } return false; } } /* Make sure there is enough room for the new rule */ NRules++; if ( NRules >= RuleSpace ) { RuleSpace += 100;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -