📄 c4.5gt.c
字号:
/*************************************************************************/
/*                                                                       */
/*  Restore old class distribution figures of discrete attribute         */
/*  values x and y from Slice1 and Slice2.                               */
/*                                                                       */
/*  Inverse of a value-combining step: Slice1/Slice2 hold the saved      */
/*  per-class frequencies (indices 0..MaxClass) plus the total value     */
/*  frequency in slot MaxClass+1.                                        */
/*                                                                       */
/*  NOTE(review): ForEach(v, a, b) is a project macro, presumably an     */
/*  inclusive for-loop from a to b -- confirm against defns.i/buildex.   */
/*                                                                       */
/*************************************************************************/

    Uncombine(x, y)
/*  ---------  */
    DiscrValue x, y;
{
    ClassNo c;

    /* Restore per-class frequency counts for both values */
    ForEach(c, 0, MaxClass)
    {
	Freq[x][c] = Slice1[c];
	Freq[y][c] = Slice2[c];
    }

    /* Slot MaxClass+1 of each slice carries the saved value total */
    ValFreq[x] = Slice1[MaxClass+1];
    ValFreq[y] = Slice2[MaxClass+1];
}

/*************************************************************************/
/*                                                                       */
/*  Print the values of attribute Att which are in the subset Ss,        */
/*  comma-separated, with no trailing separator.                         */
/*                                                                       */
/*************************************************************************/

    PrintSubset(Att, Ss)
/*  -----------  */
    Attribute Att;
    Set Ss;
{
    DiscrValue V1;
    Boolean First=true;

    ForEach(V1, 1, MaxAttVal[Att])
    {
	if ( In(V1, Ss) )
	{
	    /* Emit ", " before every member except the first */
	    if ( First )
	    {
		First = false;
	    }
	    else
	    {
		printf(", ");
	    }

	    printf("%s", AttValName[Att][V1]);
	}
    }
}

/*************************************************************************/
/*                                                                       */
/*  Construct and return a node for a test on a subset of values.        */
/*                                                                       */
/*  Converts Node into a BrSubset test on attribute Att, allocating      */
/*  one bit-set per branch and copying the branch subsets from the       */
/*  global Subset[Att][] table.                                          */
/*                                                                       */
/*************************************************************************/

    SubsetTest(Node, Att)
/*  -----------  */
    Tree Node;
    Attribute Att;
{
    ItemCount CountItems();
    short S, Bytes;

    /* Give the node Subsets[Att] child branches */
    Sprout(Node, Subsets[Att]);

    Node->NodeType = BrSubset;
    Node->Tested = Att;
    Node->Errors = 0;

    /* One bit per discrete value, rounded up to whole bytes */
    Bytes = (MaxAttVal[Att]>>3) + 1;
    Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set));

    ForEach(S, 1, Node->Forks)
    {
	Node->Subset[S] = (Set) malloc(Bytes);
	CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
    }
}

/*************************************************************************/
/*                                                                       */
/*  Prune a decision tree and predict its error rate                     */
/*  ------------------------------------------------                     */
/*                                                                       */
/*************************************************************************/

extern ItemCount *Weight;       /* weight of each training item */

Set *PossibleValues=Nil;        /* per-attribute bit-set of values still
				   reachable at the current tree position */
Boolean Changed;                /* set true when pruning alters the tree */

/* Verbose output only for non-suppressed levels (Sh >= 0) */
#define LocalVerbosity(x) if (Sh >= 0 && VERBOSITY >= x)
#define Intab(x) Indent(x, "| ")

/*************************************************************************/
/*                                                                       */
/*  Prune tree T, returning true if tree has been modified.              */
/*                                                                       */
/*  Runs error-based pruning over the whole training set, then (when     */
/*  value subsets are in use) strips redundant subset tests.             */
/*                                                                       */
/*************************************************************************/

Boolean Prune(T)
/* ----- */
    Tree T;
{
    ItemNo i;
    float EstimateErrors();
    Attribute a;

    InitialiseWeights();
    AllKnown = true;

    Verbosity(1) printf("\n");

    Changed = false;

    /* Prune in place over all items; EstimateErrors sets Changed */
    EstimateErrors(T, 0, MaxItem, 0, true);

    if ( SUBSET )
    {
	/* Lazily build PossibleValues: initially every value of every
	   discrete attribute is marked possible */
	if ( ! PossibleValues )
	{
	    PossibleValues = (Set *) calloc(MaxAtt+1, sizeof(Set));
	}

	ForEach(a, 0, MaxAtt)
	{
	    if ( MaxAttVal[a] )
	    {
		PossibleValues[a] = (Set) malloc((MaxAttVal[a]>>3) + 1);
		ClearBits((MaxAttVal[a]>>3) + 1, PossibleValues[a]);
		ForEach(i, 1, MaxAttVal[a])
		{
		    SetBit(i, PossibleValues[a]);
		}
	    }
	}

	CheckPossibleValues(T);
    }

    return Changed;
}

/*************************************************************************/
/*                                                                       */
/*  Estimate the errors in a given subtree.                              */
/*                                                                       */
/*  T           subtree being assessed                                   */
/*  Fp, Lp      first and last item indices covered by this subtree      */
/*  Sh          indentation depth for verbose output; negative (e.g.     */
/*              -1000) suppresses LocalVerbosity output                  */
/*  UpdateTree  when true, rewrite T in place: replace subtrees by a     */
/*              leaf or by their largest branch when the pessimistic     */
/*              error estimate does not get worse (within 0.1)           */
/*                                                                       */
/*  Returns the (pessimistic) estimated error count for the subtree.     */
/*                                                                       */
/*************************************************************************/

float EstimateErrors(T, Fp, Lp, Sh, UpdateTree)
/* -------------- */
    Tree T;
    ItemNo Fp, Lp;
    short Sh;
    Boolean UpdateTree;
{
    ItemNo i, Kp, Ep, Group();
    ItemCount Cases, KnownCases, *LocalClassDist, TreeErrors, LeafErrors,
	ExtraLeafErrors, BranchErrors, CountItems(), Factor, MaxFactor,
	AddErrs();
    DiscrValue v, MaxBr;
    ClassNo c, BestClass;
    Boolean PrevAllKnown;

    /* Generate the class frequency distribution */

    Cases = CountItems(Fp, Lp);
    LocalClassDist = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
    ForEach(i, Fp, Lp)
    {
	LocalClassDist[ Class(Item[i]) ] += Weight[i];
    }

    /* Find the most frequent class and update the tree.
       Ties keep the current leaf class T->Leaf (strict > below). */

    BestClass = T->Leaf;
    ForEach(c, 0, MaxClass)
    {
	if ( LocalClassDist[c] > LocalClassDist[BestClass] )
	{
	    BestClass = c;
	}
    }
    LeafErrors = Cases - LocalClassDist[BestClass];
    /* Pessimistic correction: extra errors at the chosen confidence level */
    ExtraLeafErrors = AddErrs(Cases, LeafErrors);

    if ( UpdateTree )
    {
	T->Items = Cases;
	T->Leaf = BestClass;
	memcpy(T->ClassDist, LocalClassDist, (MaxClass + 1) * sizeof(ItemCount));
    }

    if ( ! T->NodeType )	/* leaf */
    {
	TreeErrors = LeafErrors + ExtraLeafErrors;

	if ( UpdateTree )
	{
	    T->Errors = TreeErrors;

	    LocalVerbosity(1)
	    {
		Intab(Sh);
		printf("%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
			T->Items, LeafErrors, T->Errors);
	    }
	}

	free(LocalClassDist);
	return TreeErrors;
    }

    /* Estimate errors for each branch.
       Group(0, ...) moves items with unknown tested value to the front;
       Kp..Lp then holds the items with known values. */

    Kp = Group(0, Fp, Lp, T) + 1;
    KnownCases = CountItems(Kp, Lp);

    PrevAllKnown = AllKnown;
    if ( Kp != Fp ) AllKnown = false;

    TreeErrors = MaxFactor = 0;

    ForEach(v, 1, T->Forks)
    {
	Ep = Group(v, Kp, Lp, T);

	if ( Kp <= Ep )
	{
	    /* Fraction of known cases taking branch v.
	       NOTE(review): assumes ItemCount is a floating type so this
	       division is not integer division -- confirm in types.i */
	    Factor = CountItems(Kp, Ep) / KnownCases;

	    /* Track the largest branch for the raise-subtree test below.
	       NOTE(review): MaxBr stays unset if no branch receives any
	       items; the >= with MaxFactor==0 makes that unlikely but
	       worth confirming. */
	    if ( Factor >= MaxFactor )
	    {
		MaxBr = v;
		MaxFactor = Factor;
	    }

	    /* Unknown-value items are passed down every branch with
	       weight scaled by the branch's share of known cases */
	    ForEach(i, Fp, Kp-1)
	    {
		Weight[i] *= Factor;
	    }

	    TreeErrors += EstimateErrors(T->Branch[v], Fp, Ep, Sh+1, UpdateTree);

	    /* Regroup unknowns so the next branch sees them again */
	    Group(0, Fp, Ep, T);

	    /* Undo the weight scaling */
	    ForEach(i, Fp, Kp-1)
	    {
		Weight[i] /= Factor;
	    }
	}
    }

    AllKnown = PrevAllKnown;

    if ( ! UpdateTree )
    {
	free(LocalClassDist);
	return TreeErrors;
    }

    /* See how the largest branch would fare (Sh = -1000 silences output,
       UpdateTree = false leaves the tree untouched) */

    BranchErrors = EstimateErrors(T->Branch[MaxBr], Fp, Lp, -1000, false);

    LocalVerbosity(1)
    {
	Intab(Sh);
	printf("%s: [%d%% N=%.2f tree=%.2f leaf=%.2f+%.2f br[%d]=%.2f]\n",
		AttName[T->Tested],
		(int) ((TreeErrors * 100) / (T->Items + 0.001)),
		T->Items, TreeErrors, LeafErrors, ExtraLeafErrors,
		MaxBr, BranchErrors);
    }

    /* See whether tree should be replaced with leaf or largest branch.
       The +0.1 margin biases towards the simpler structure. */

    if ( LeafErrors + ExtraLeafErrors <= BranchErrors + 0.1 &&
	 LeafErrors + ExtraLeafErrors <= TreeErrors + 0.1 )
    {
	LocalVerbosity(1)
	{
	    Intab(Sh);
	    printf("Replaced with leaf %s\n", ClassName[T->Leaf]);
	}

	/* Collapse this subtree to a leaf */
	T->NodeType = 0;
	T->Errors = LeafErrors + ExtraLeafErrors;
	Changed = true;
    }
    else
    if ( BranchErrors <= TreeErrors + 0.1 )
    {
	LocalVerbosity(1)
	{
	    Intab(Sh);
	    printf("Replaced with branch %d\n", MaxBr);
	}

	AllKnown = PrevAllKnown;

	/* Re-estimate the raised branch over the full item range, this
	   time updating it, then graft it over this node.
	   NOTE(review): the struct copy abandons the other branches and
	   the old node record without freeing them -- apparent leak,
	   consistent with the original C4.5 release. */
	EstimateErrors(T->Branch[MaxBr], Fp, Lp, Sh, true);
	memcpy((char *) T, (char *) T->Branch[MaxBr], sizeof(TreeRec));
	Changed = true;
    }
    else
    {
	T->Errors = TreeErrors;
    }

    AllKnown = PrevAllKnown;

    free(LocalClassDist);
    return T->Errors;
}

/*************************************************************************/
/*                                                                       */
/*  Remove unnecessary subset tests on missing values.                   */
/*                                                                       */
/*  Walks the tree keeping PossibleValues[A] = values of attribute A     */
/*  still reachable; restricts each BrSubset node's default branch to    */
/*  those values and drops it entirely when it becomes empty.            */
/*                                                                       */
/*************************************************************************/

    CheckPossibleValues(T)
/*  -------------------  */
    Tree T;
{
    Set HoldValues;
    int v, Bytes, b;
    Attribute A;
    char Any=0;

    if ( T->NodeType == BrSubset )
    {
	A = T->Tested;
	Bytes = (MaxAttVal[A]>>3) + 1;
	HoldValues = (Set) malloc(Bytes);

	/* See if last (default) branch can be simplified or omitted */

	ForEach(b, 0, Bytes-1)
	{
	    T->Subset[T->Forks][b] &= PossibleValues[A][b];
	    Any |= T->Subset[T->Forks][b];
	}

	if ( ! Any )
	{
	    /* Default branch matches nothing reachable: drop it */
	    T->Forks--;
	}

	/* Process each subtree, leaving only values in branch subset */

	CopyBits(Bytes, PossibleValues[A], HoldValues);

	ForEach(v, 1, T->Forks)
	{
	    CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
	    CheckPossibleValues(T->Branch[v]);
	}

	/* Restore the caller's view of possible values */
	CopyBits(Bytes, HoldValues, PossibleValues[A]);

	free(HoldValues);
    }
    else
    if ( T->NodeType )
    {
	/* Non-subset internal node: just recurse */
	ForEach(v, 1, T->Forks)
	{
	    CheckPossibleValues(T->Branch[v]);
	}
    }
}

/*************************************************************************/
/*                                                                       */
/*  Statistical routines for C4.5                                        */
/*  -----------------------------                                        */
/*                                                                       */
/*************************************************************************/

/*************************************************************************/
/*                                                                       */
/*  Compute the additional errors if the error rate increases to the     */
/*  upper limit of the confidence level.  The coefficient is the         */
/*  square of the number of standard deviations corresponding to the     */
/*  selected confidence level.  (Taken from Documenta Geigy Scientific   */
/*  Tables (Sixth Edition), p185 (with modifications).)                  */
/*                                                                       */
/*************************************************************************/

/* Confidence levels and matching normal deviates used for interpolation */
float Val[] = {  0,  0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
      Dev[] = {4.0,  3.09,  2.58,  2.33, 1.65, 1.28, 0.84, 0.25, 0.00};

/*  N  number of cases at the leaf, e  observed errors; returns the
    extra errors implied by the CF upper confidence bound.  */

float AddErrs(N, e)
/* ------- */
    ItemCount N, e;
{
    static float Coeff=0;	/* z^2 for confidence CF, computed once */
    float Val0, Pr;

    if ( ! Coeff )
    {
	/* Compute and retain the coefficient value, interpolating from
	   the values in Val and Dev */

	int i;

	i = 0;
	while ( CF > Val[i] ) i++;

	Coeff = Dev[i-1] +
		(Dev[i] - Dev[i-1]) * (CF - Val[i-1]) /(Val[i] - Val[i-1]);
	Coeff = Coeff * Coeff;
    }

    if ( e < 1E-6 )
    {
	/* No observed errors: base of the binomial upper bound */
	return N * (1 - exp(log(CF) / N));
    }
    else
    if ( e < 0.9999 )
    {
	/* Fractional error count: interpolate between 0 and 1 error */
	Val0 = N * (1 - exp(log(CF) / N));
	return Val0 + e * (AddErrs(N, 1.0) - Val0);
    }
    else
    if ( e + 0.5 >= N )
    {
	/* Nearly all cases in error: capped empirical correction */
	return 0.67 * (N - e);
    }
    else
    {
	/* Normal approximation to the binomial upper confidence limit */
	Pr = (e + 0.5 + Coeff/2
		+ sqrt(Coeff * ((e + 0.5) * (1 - (e + 0.5)/N) + Coeff/4)) )
	     / (N + Coeff);
	return (N * Pr - e);
    }
}

/*************************************************************************/
/*                                                                       */
/*  Soften thresholds for continuous attributes                          */
/*  -------------------------------------------                          */
/*                                                                       */
/*************************************************************************/

Boolean	*LHSErr,	/* Does a misclassification occur with this value of an att */
	*RHSErr;	/* if the below or above threshold branches are taken */

ItemNo	*ThreshErrs;	/* ThreshErrs[i] is the no. of misclassifications if thresh is i */

float	*CVals;		/* All values of a continuous attribute */

/* Value v lies at or below threshold t (small epsilon for float noise) */
#define	Below(v,t)	(v <= t + 1E-6)

/*************************************************************************/
/*                                                                       */
/*  Soften all thresholds for continuous attributes in tree T.           */
/*                                                                       */
/*  Allocates the scratch arrays used by ScanTree, runs the scan over    */
/*  all items, then releases them.                                       */
/*                                                                       */
/*************************************************************************/

    SoftenThresh(T)
/*  ------------  */
    Tree T;
{
    CVals = (float *) calloc(MaxItem+1, sizeof(float));
    LHSErr = (Boolean *) calloc(MaxItem+1, sizeof(Boolean));
    RHSErr = (Boolean *) calloc(MaxItem+1, sizeof(Boolean));
    ThreshErrs = (ItemNo *) calloc(MaxItem+1, sizeof(ItemNo));

    InitialiseWeights();

    ScanTree(T, 0, MaxItem);

    free(ThreshErrs); free(RHSErr); free(LHSErr); free(CVals);
}

/*************************************************************************/
/*                                                                       */
/*  Calculate upper and lower bounds for each test on a continuous       */
/*  attribute in tree T, using data items from Fp to Lp.                 */
/*                                                                       */
/*  NOTE(review): this function is truncated in the visible source --    */
/*  the body below is cut off mid-statement.                             */
/*                                                                       */
/*************************************************************************/

    ScanTree(T, Fp, Lp)
/*  --------  */
    Tree T;
    ItemNo Fp, Lp;
{
    short v;
    float Val, Se, Limit, Lower, Upper, GreatestValueBelow();
    ItemNo i, Kp, Ep, LastI, Errors, BaseErrors;
    ClassNo CaseClass, Class1, Class2, Category();
    Boolean LeftThresh=false;
    Description CaseDesc;
    Attribute Att;
    void Swap();

    /* Stop when get to a leaf */

    if ( ! T->NodeType ) return;

    /* Group the unknowns together */

    Kp = Group(0, Fp, Lp, T);

    /* Soften a threshold for a continuous attribute */

    Att = T->Tested;

    if ( T->NodeType == ThreshContin )
    {
	prin
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -