/*  c4.5gt.c  */
/*  NOTE(review): the function header for this block was lost when the file
    was extracted — the "{" below is the body of a routine that regroups
    Item[] by class.  It appears to shuffle the items and then move
    TargetClassFreq[c] items of each class c into the next contiguous
    positions — confirm against the original C4.5 source.  */
{   ItemNo i, Start=0, More;
    ClassNo c;
    void Swap();

    /*  Randomise the order of all data items first  */

    Shuffle();

    /*  For each class in turn, sweep forward from Start and swap items
	of that class into the next TargetClassFreq[c] positions  */

    ForEach(c, 0, MaxClass)
    {
	More = TargetClassFreq[c];

	for ( i = Start ; More ; i++ )
	{
	    if ( Class(Item[i]) == c )
	    {
		Swap(Start, i);
		Start++;
		More--;
	    }
	}
    }
}



/*************************************************************************/
/*									 */
/*	Shuffle the data items randomly					 */
/*									 */
/*************************************************************************/


    Shuffle()
/*  -------  */
{
    ItemNo This, Alt, Left;
    Description Hold;

    /*  Fisher-Yates-style shuffle: at each step choose a random item
	among the Left not-yet-placed items and swap it into position
	This.  (Random is presumably a macro yielding a float in [0,1) —
	confirm; a value of exactly 1.0 would index one past the end.)  */

    This = 0;
    for( Left = MaxItem+1 ; Left ; )
    {
	Alt = This + (Left--) * Random;

	Hold = Item[This];
	Item[This++] = Item[Alt];
	Item[Alt] = Hold;
    }
}



/*************************************************************************/
/*									 */
/*	Grow a tree iteratively with initial window size Window and	 */
/*	initial window increment IncExceptions.				 */
/*									 */
/*	Construct a classifier tree using the data items in the		 */
/*	window, then test for the successful classification of other	 */
/*	data items by this tree.  If there are misclassified items,	 */
/*	put them immediately after the items in the window, increase	 */
/*	the size of the window and build another classifier tree, and	 */
/*	so on until we have a tree which successfully classifies all	 */
/*	of the test items or no improvement is apparent.		 */
/*									 */
/*	On completion, return the tree which produced the least errors.
*/
/*									 */
/*************************************************************************/


Tree Iterate(Window, IncExceptions)
/*   -------  */
    ItemNo Window, IncExceptions;
{
    Tree Classifier, BestClassifier=Nil, FormTree();
    ItemNo i, Errors, TotalErrors, BestTotalErrors=MaxItem+1,
	   Exceptions, Additions;
    ClassNo Assigned, Category();
    short Cycle=0;
    void Swap();

    /*  Header for the per-cycle report printed below.
	NOTE(review): the column spacing inside these string literals
	looks collapsed to single spaces — almost certainly mangled by
	the web page this copy was scraped from; confirm the original
	spacing before relying on the table layout.  */

    printf("Cycle Tree -----Cases----");
    printf(" -----------------Errors-----------------\n");
    printf(" size window other");
    printf(" window rate other rate total rate\n");
    printf("----- ---- ------ ------");
    printf(" ------ ---- ------ ---- ------ ----\n");

    do
    {
	/*  Build a classifier tree with the first Window items  */

	InitialiseWeights();
	AllKnown = true;
	Classifier = FormTree(0, Window-1);

	/*  Error analysis  */

	Errors = Round(Classifier->Errors);

	/*  Move all items that are incorrectly classified by the
	    classifier tree to immediately after the items in the
	    current window.  */

	Exceptions = Window;
	ForEach(i, Window, MaxItem)
	{
	    Assigned = Category(Item[i], Classifier);
	    if ( Assigned != Class(Item[i]) )
	    {
		Swap(Exceptions, i);
		Exceptions++;
	    }
	}
	Exceptions -= Window;
	TotalErrors = Errors + Exceptions;

	/*  Print error analysis.  The +1.001 and +1.0 in the
	    denominators guard against division by zero when the window
	    already covers all items.  */

	printf("%3d %7d %8d %6d %8d%5.1f%% %6d%5.1f%% %6d%5.1f%%\n",
	       ++Cycle, TreeSize(Classifier), Window, MaxItem-Window+1,
	       Errors, 100*(float)Errors/Window,
	       Exceptions, 100*Exceptions/(MaxItem-Window+1.001),
	       TotalErrors, 100*TotalErrors/(MaxItem+1.0));

	/*  Keep track of the most successful classifier tree so far;
	    release whichever tree is no longer the best  */

	if ( ! BestClassifier || TotalErrors < BestTotalErrors )
	{
	    if ( BestClassifier ) ReleaseTree(BestClassifier);
	    BestClassifier = Classifier;
	    BestTotalErrors = TotalErrors;
	}
	else
	{
	    ReleaseTree(Classifier);
	}

	/*  Increment window size: at least half the exceptions are
	    added, capped so the window never exceeds the item count  */

	Additions = Min(Exceptions, IncExceptions);
	Window = Min(Window + Max(Additions, Exceptions / 2), MaxItem + 1);
    } while ( Exceptions );

    return BestClassifier;
}



/*************************************************************************/
/*									 */
/*	Print report of errors for each of the trials			 */
/*									 */
/*************************************************************************/


    Evaluate(CMInfo, Saved)
/*  --------  */
    Boolean CMInfo;
    short Saved;
{
    ClassNo RealClass, PrunedClass, Category();
    short t;
    ItemNo *ConfusionMat, i, RawErrors, PrunedErrors;

    /*  One (MaxClass+1)^2 confusion matrix, built only for trial Saved.
	NOTE(review): the calloc result is not checked for NULL.  */

    if ( CMInfo )
    {
	ConfusionMat = (ItemNo *) calloc((MaxClass+1)*(MaxClass+1), sizeof(ItemNo));
    }

    printf("\n");

    if ( TRIALS > 1 )
    {
	printf("Trial\t Before Pruning After Pruning\n");
	printf("-----\t---------------- ---------------------------\n");
    }
    else
    {
	printf("\t Before Pruning After Pruning\n");
	printf("\t---------------- ---------------------------\n");
    }
    printf("\tSize Errors Size Errors Estimate\n\n");

    ForEach(t, 0, TRIALS-1)
    {
	RawErrors = PrunedErrors = 0;

	/*  Count resubstitution errors of the raw and pruned trees for
	    this trial, accumulating the confusion matrix for the saved
	    trial as we go  */

	ForEach(i, 0, MaxItem)
	{
	    RealClass = Class(Item[i]);

	    if ( Category(Item[i], Raw[t]) != RealClass ) RawErrors++;

	    PrunedClass = Category(Item[i], Pruned[t]);

	    if ( PrunedClass != RealClass ) PrunedErrors++;

	    if ( CMInfo && t == Saved )
	    {
		ConfusionMat[RealClass*(MaxClass+1)+PrunedClass]++;
	    }
	}

	if ( TRIALS > 1 )
	{
	    printf("%4d", t);
	}

	/*  "<<" marks the trial whose tree was saved  */

	printf("\t%4d %3d(%4.1f%%) %4d %3d(%4.1f%%) (%4.1f%%)%s\n",
	       TreeSize(Raw[t]), RawErrors, 100.0*RawErrors / (MaxItem+1.0),
	       TreeSize(Pruned[t]), PrunedErrors, 100.0*PrunedErrors / (MaxItem+1.0),
	       100 * Pruned[t]->Errors / Pruned[t]->Items,
	       ( t == Saved ? " <<" : "" ));
    }

    if ( CMInfo )
    {
	PrintConfusionMatrix(ConfusionMat);
	free(ConfusionMat);
    }
}



/*************************************************************************/
/*									 */
/*	Central tree-forming algorithm incorporating all criteria	 */
/*	---------------------------------------------------------	 */
/*									 */
/*************************************************************************/


ItemCount
	*Weight,	/* Weight[i]  = current fraction of item i */
	**Freq,		/* Freq[x][c] = no. items of class c with outcome x */
	*ValFreq,	/* ValFreq[x] = no. items with outcome x */
	*ClassFreq;	/* ClassFreq[c] = no. items of class c */

float
	*Gain,		/* Gain[a] = info gain by split on att a */
	*Info,		/* Info[a] = potential info of split on att a */
	*Bar,		/* Bar[a]  = best threshold for contin att a */
	*UnknownRate;	/* UnknownRate[a] = current unknown rate for att a */

Boolean
	*Tested,	/* Tested[a] set if att a has already been tested */
	MultiVal;	/* true when all atts have many values */


	/*  External variables initialised here  */

extern float
	*SplitGain,	/* SplitGain[i] = gain with att value of item i as threshold */
	*SplitInfo;	/* SplitInfo[i] = potential info ditto */

extern ItemCount
	*Slice1,	/* Slice1[c] = saved values of Freq[x][c] in subset.c */
	*Slice2;	/* Slice2[c] = saved values of Freq[y][c] */

extern Set
	**Subset;	/* Subset[a][s] = subset s for att a */

extern short
	*Subsets;	/* Subsets[a] = no.
subsets for att a */


/*************************************************************************/
/*									 */
/*	Allocate space for tree tables					 */
/*									 */
/*************************************************************************/


    InitialiseTreeData()
/*  ------------------  */
{
    DiscrValue v;
    Attribute a;

    /*  NOTE(review): none of the calloc/malloc results below are checked
	for NULL.  Tested is declared Boolean* but allocated as char* —
	Boolean is presumably a char typedef; confirm in the headers.  */

    Tested = (char *) calloc(MaxAtt+1, sizeof(char));

    Gain = (float *) calloc(MaxAtt+1, sizeof(float));
    Info = (float *) calloc(MaxAtt+1, sizeof(float));
    Bar = (float *) calloc(MaxAtt+1, sizeof(float));

    /*  One bitset per discrete value of each discrete attribute;
	(MaxAttVal[a]>>3)+1 bytes gives one bit per attribute value  */

    Subset = (Set **) calloc(MaxAtt+1, sizeof(Set *));
    ForEach(a, 0, MaxAtt)
    {
	if ( MaxAttVal[a] )
	{
	    Subset[a] = (Set *) calloc(MaxDiscrVal+1, sizeof(Set));
	    ForEach(v, 0, MaxAttVal[a])
	    {
		Subset[a][v] = (Set) malloc((MaxAttVal[a]>>3) + 1);
	    }
	}
    }
    Subsets = (short *) calloc(MaxAtt+1, sizeof(short));

    SplitGain = (float *) calloc(MaxItem+1, sizeof(float));
    SplitInfo = (float *) calloc(MaxItem+1, sizeof(float));

    Weight = (ItemCount *) calloc(MaxItem+1, sizeof(ItemCount));

    Freq = (ItemCount **) calloc(MaxDiscrVal+1, sizeof(ItemCount *));
    ForEach(v, 0, MaxDiscrVal)
    {
	Freq[v] = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
    }

    ValFreq = (ItemCount *) calloc(MaxDiscrVal+1, sizeof(ItemCount));
    ClassFreq = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));

    Slice1 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));
    Slice2 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));

    UnknownRate = (float *) calloc(MaxAtt+1, sizeof(float));

    /*  Check whether all attributes have many discrete values;
	the 0.3*(MaxItem+1) threshold defines "many".  Note the loop
	stops at the first attribute that fails the test (MultiVal
	guards the loop condition).  */

    MultiVal = true;
    if ( ! SUBSET )
    {
	for ( a = 0 ; MultiVal && a <= MaxAtt ; a++ )
	{
	    if ( SpecialStatus[a] != IGNORE )
	    {
		MultiVal = MaxAttVal[a] >= 0.3 * (MaxItem + 1);
	    }
	}
    }
}



/*************************************************************************/
/*									 */
/*	Initialise the weight of each item				 */
/*									 */
/*************************************************************************/


    InitialiseWeights()
/*  -----------------  */
{
    ItemNo i;

    /*  Every item starts as a whole case; fractional weights arise
	later when cases with unknown values are split across branches  */

    ForEach(i, 0, MaxItem)
    {
	Weight[i] = 1.0;
    }
}



/*************************************************************************/
/*									 */
/*	Build a decision tree for the cases Fp through Lp:		 */
/*									 */
/*	 - if all cases are of the same class, the tree is a leaf and	 */
/*	   so the leaf is returned labelled with this class		 */
/*									 */
/*	 - for each attribute, calculate the potential information	 */
/*	   provided by a test on the attribute (based on the		 */
/*	   probabilities of each case having a particular value for	 */
/*	   the attribute), and the gain in information that would	 */
/*	   result from a test on the attribute (based on the		 */
/*	   probabilities of each case with a particular value for the	 */
/*	   attribute being of a particular class)			 */
/*									 */
/*	 - on the basis of these figures, and depending on the current	 */
/*	   selection criterion, find the best attribute to branch on.	 */
/*	   Note: this version will not allow a split on an attribute	 */
/*	   unless two or more subsets have at least MINOBJS items.
*//* *//* - try branching and test whether better than forming a leaf *//* *//*************************************************************************/Tree FormTree(Fp, Lp)/* --------- */ ItemNo Fp, Lp; { ItemNo i, Kp, Ep, Group(); ItemCount Cases, NoBestClass, KnownCases, CountItems(); float Factor, BestVal, Val, AvGain=0, Worth(); Attribute Att, BestAtt, Possible=0; ClassNo c, BestClass; Tree Node, Leaf(); DiscrValue v; Boolean PrevAllKnown; Cases = CountItems(Fp, Lp); /* Generate the class frequency distribution */ ForEach(c, 0, MaxClass) { ClassFreq[c] = 0; } ForEach(i, Fp, Lp) { ClassFreq[ Class(Item[i]) ] += Weight[i]; } /* Find the most frequent class */ BestClass = 0; ForEach(c, 0, MaxClass) { if ( ClassFreq[c] > ClassFreq[BestClass] ) { BestClass = c; } } NoBestClass = ClassFreq[BestClass]; Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass); /* If all cases are of the same class or there are not enough cases to divide, the tree is a leaf */ if ( NoBestClass == Cases || Cases < 2 * MINOBJS ) { return Node; } Verbosity(1) printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases); /* For each available attribute, find the information and gain */ ForEach(Att, 0, MaxAtt) { Gain[Att] = -Epsilon; if ( SpecialStatus[Att] == IGNORE ) continue; if ( MaxAttVal[Att] ) { /* discrete valued attribute */ if ( SUBSET && MaxAttVal[Att] > 2 ) { EvalSubset(Att, Fp, Lp, Cases); } else if ( ! Tested[Att] ) { EvalDiscreteAtt(Att, Fp, Lp, Cases); } } else { /* continuous attribute */ EvalContinuousAtt(Att, Fp, Lp); } /* Update average gain, excluding attributes with very many values */ if ( Gain[Att] > -Epsilon && ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxItem + 1) ) ) { Possible++; AvGain += Gain[Att]; } } /* Find the best attribute according to the given criterion */ BestVal = -Epsilon; BestAtt = None; AvGain = ( Possible ? 
AvGain / Possible : 1E6 ); Verbosity(2) { if ( AvGain < 1E6 ) printf("\taverage gain %.3f\n", AvGain); } ForEach(Att, 0, MaxAtt) { if ( Gain[Att] > -Epsilon ) { Val = Worth(Info[Att], Gain[Att], AvGain); if ( Val > BestVal ) { BestAtt = Att; BestVal = Val; } } } /* Decide whether to branch or not */ if ( BestAtt != None ) { Verbosity(1) { printf("\tbest attribute %s", AttName[BestAtt]); if ( ! MaxAttVal[BestAtt] ) { printf(" cut %.3f", Bar[BestAtt]); } printf(" inf %.3f gain %.3f val %.3f\n", Info[BestAtt], Gain[BestAtt], BestVal); } /* Build a node of the selected test */ if ( MaxAttVal[BestAtt] ) { /* Discrete valued attribute */ if ( SUBSET && MaxAttVal[BestAtt] > 2 ) { SubsetTest(Node, BestAtt); } else { DiscreteTest(Node, BestAtt); } } else { /* Continuous attribute */ ContinTest(Node, BestAtt); } /* Remove unknown attribute values */ PrevAllKnown = AllKnown; Kp = Group(0, Fp, Lp, Node) + 1; if ( Kp != Fp ) AllKnown = false; KnownCases = Cases - CountItems(Fp, Kp-1); UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001); Verbosity(1) { if ( UnknownRate[BestAtt] > 0 ) { printf("\tunknown rate for %s = %.3f\n", AttName[BestAtt], UnknownRate[BestAtt]); } } /* Recursive divide and conquer */ ++Tested[BestAtt]; Ep = Kp - 1; Node->Errors = 0; ForEach(v, 1, Node->Forks) { Ep = Group(v, Kp, Lp, Node);
/*  NOTE: this copy is truncated here — the body of FormTree() continues
    in the original C4.5 source.  (Web-viewer UI text removed.)  */