📄 c4.5gt.c
字号:
}/*************************************************************************//* *//* Evaluation of a test on a continuous valued attribute *//* ----------------------------------------------------- *//* *//*************************************************************************/float *SplitGain, /* SplitGain[i] = gain with att value of item i as threshold */ *SplitInfo; /* SplitInfo[i] = potential info ditto *//*************************************************************************//* *//* Continuous attributes are treated as if they have possible values *//* 0 (unknown), 1 (less than cut), 2(greater than cut) *//* This routine finds the best cut for items Fp through Lp and sets *//* Info[], Gain[] and Bar[] *//* *//*************************************************************************/ EvalContinuousAtt(Att, Fp, Lp)/* ----------------- */ Attribute Att; ItemNo Fp, Lp; { ItemNo i, BestI, Xp, Tries=0; ItemCount Items, KnownItems, LowItems, MinSplit, CountItems(); ClassNo c; float AvGain=0, Val, BestVal, BaseInfo, ThreshCost, ComputeGain(), TotalInfo(), Worth(); void Swap(); Verbosity(2) printf("\tAtt %s", AttName[Att]); Verbosity(3) printf("\n"); ResetFreq(2); /* Omit and count unknown values */ Items = CountItems(Fp, Lp); Xp = Fp; ForEach(i, Fp, Lp) { if ( CVal(Item[i],Att) == Unknown ) { Freq[ 0 ][ Class(Item[i]) ] += Weight[i]; Swap(Xp, i); Xp++; } } ValFreq[0] = 0; ForEach(c, 0, MaxClass) { ValFreq[0] += Freq[0][c]; } KnownItems = Items - ValFreq[0]; UnknownRate[Att] = 1.0 - KnownItems / Items; /* Special case when very few known values */ if ( KnownItems < 2 * MINOBJS ) { Verbosity(2) printf("\tinsufficient cases with known values\n"); Gain[Att] = -Epsilon; Info[Att] = 0.0; return; } Quicksort(Xp, Lp, Att, Swap); /* Count base values and determine base information */ ForEach(i, Xp, Lp) { Freq[ 2 ][ Class(Item[i]) ] += Weight[i]; SplitGain[i] = -Epsilon; SplitInfo[i] = 0; } BaseInfo = TotalInfo(Freq[2], 0, MaxClass) / KnownItems; /* Try possible cuts between items i and i+1, and determine the information and gain of the split in each case. We have to be wary of splitting a small number of items off one end, as we can always split off a single item, but this has little predictive power. */ MinSplit = 0.10 * KnownItems / (MaxClass + 1); if ( MinSplit <= MINOBJS ) MinSplit = MINOBJS; else if ( MinSplit > 25 ) MinSplit = 25; LowItems = 0; ForEach(i, Xp, Lp - 1) { c = Class(Item[i]); LowItems += Weight[i]; Freq[1][c] += Weight[i]; Freq[2][c] -= Weight[i]; if ( LowItems < MinSplit ) continue; else if ( LowItems > KnownItems - MinSplit ) break; if ( CVal(Item[i],Att) < CVal(Item[i+1],Att) - 1E-5 ) { ValFreq[1] = LowItems; ValFreq[2] = KnownItems - LowItems; SplitGain[i] = ComputeGain(BaseInfo, UnknownRate[Att], 2, KnownItems); SplitInfo[i] = TotalInfo(ValFreq, 0, 2) / Items; AvGain += SplitGain[i]; Tries++; Verbosity(3) { printf("\t\tCut at %.3f (gain %.3f, val %.3f):", ( CVal(Item[i],Att) + CVal(Item[i+1],Att) ) / 2, SplitGain[i], Worth(SplitInfo[i], SplitGain[i], Epsilon)); PrintDistribution(Att, 2, true); } } } /* Find the best attribute according to the given criterion */ ThreshCost = Log(Tries) / Items; BestVal = 0; BestI = None; ForEach(i, Xp, Lp - 1) { if ( (Val = SplitGain[i] - ThreshCost) > BestVal ) { BestI = i; BestVal = Val; } } /* If a test on the attribute is able to make a gain, set the best break point, gain and information */ if ( BestI == None ) { Gain[Att] = -Epsilon; Info[Att] = 0.0; Verbosity(2) printf("\tno gain\n"); } else { Bar[Att] = (CVal(Item[BestI],Att) + CVal(Item[BestI+1],Att)) / 2; Gain[Att] = BestVal; Info[Att] = SplitInfo[BestI]; Verbosity(2) printf("\tcut=%.3f, inf %.3f, gain %.3f\n", Bar[Att], Info[Att], Gain[Att]); }} /*************************************************************************//* *//* Change a leaf into a test on a continuous attribute *//* *//*************************************************************************/ ContinTest(Node, Att)/* ---------- */ Tree Node; Attribute Att;{ float Thresh, GreatestValueBelow(); ItemCount CountItems(); Sprout(Node, 2); Thresh = GreatestValueBelow(Att, Bar[Att]); Node->NodeType = ThreshContin; Node->Tested = Att; Node->Cut = Node->Lower = Node->Upper = Thresh; Node->Errors = 0;}/*************************************************************************//* *//* Return the greatest value of attribute Att below threshold t *//* *//*************************************************************************/float GreatestValueBelow(Att, t)/* ------------------ */ Attribute Att; float t;{ ItemNo i; float v, Best; Boolean NotYet=true; ForEach(i, 0, MaxItem) { v = CVal(Item[i], Att); if ( v != Unknown && v <= t && ( NotYet || v > Best ) ) { Best = v; NotYet = false; } } return Best;}/*************************************************************************//* *//* Evaluation of the subsetting of a discrete attribute *//* ---------------------------------------------------- *//* *//*************************************************************************/ItemCount *Slice1, /* Slice1[c] = saved values of Freq[x][c] in subset.c */ *Slice2; /* Slice2[c] = saved values of Freq[y][c] */Set **Subset; /* Subset[a][s] = subset s for att a */short *Subsets; /* Subsets[a] = no. subsets for att a *//*************************************************************************//* *//* Evaluate subsetting a discrete attribute and form the chosen *//* subsets Subset[Att][], setting Subsets[Att] to the number of *//* subsets, and the Info[] and Gain[] of a test on the attribute *//* *//*************************************************************************/ EvalSubset(Att, Fp, Lp, Items)/* ---------- */ Attribute Att; ItemNo Fp, Lp; ItemCount Items;{ DiscrValue V1, V2, BestV1, BestV2, Barred; ItemCount KnownItems; ClassNo c; float BaseInfo, MinGain, ThisGain, ThisInfo, Val, BestVal, BestGain, BestInfo, PrevVal, PrevGain, PrevInfo, DiscrKnownBaseInfo(), Worth(), ComputeGain(), TotalInfo(); short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b; Boolean MergedSubsets = false; int SaveMINOBJS; SaveMINOBJS = MINOBJS; MINOBJS = 1; /* First compute Freq[][], ValFreq[], base info, and the gain and total info of a split on discrete attribute Att */ ComputeFrequencies(Att, Fp, Lp); KnownItems = Items - ValFreq[0]; if ( KnownItems < Epsilon ) { Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]); Gain[Att] = -Epsilon; Info[Att] = 0; return; } BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]); PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att],KnownItems); PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items; PrevVal = Worth(PrevInfo, PrevGain, Epsilon); Verbosity(2) { printf("\tAtt %s", AttName[Att]); Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true); printf("\tinf %.3f, gain %.3f, val=%.3f\n", PrevInfo, PrevGain, PrevVal); } /* Eliminate unrepresented attribute values from Freq[] and ValFreq[] and form a separate subset for each represented attribute value */ Bytes = (MaxAttVal[Att]>>3) + 1; ClearBits(Bytes, Subset[Att][0]); ForEach(V1, 1, MaxAttVal[Att]) { if ( ValFreq[V1] > 0.5 ) { if ( ++Blocks < V1 ) { ValFreq[Blocks] = ValFreq[V1]; ForEach(c, 0, MaxClass) { Freq[Blocks][c] = Freq[V1][c]; } } ClearBits(Bytes, Subset[Att][Blocks]); SetBit(V1, Subset[Att][Blocks]); } else { SetBit(V1, Subset[Att][0]); MissingValues++; } } /* Merge any single-class subsets with others of the same class */ /* Note: have ValFreq[V] > 0 for all V */ ForEach(V1, 1, Blocks-1) { for ( c = 0 ; Freq[V1][c] < 0.1 ; c++ ) ; if ( Freq[V1][c] < ValFreq[V1] - 0.1 ) continue; /* Now have a single class -- look for others */ for ( V2 = V1+1 ; V2 <= Blocks ; ) { if ( Freq[V2][c] < ValFreq[V2] - 0.1 ) { V2++; } else { /* Merge these subsets */ Combine(V1, V2, Blocks); ForEach(b, 0, Bytes-1) { Subset[Att][V1][b] |= Subset[Att][V2][b]; Subset[Att][V2][b] = Subset[Att][Blocks][b]; } Blocks--; MergedSubsets = true; } } } if ( MergedSubsets ) { PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems); PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items; PrevVal = Worth(PrevInfo, PrevGain, Epsilon); Verbosity(2) { printf("\tAfter merging single-class subsets:"); Verbosity(3) PrintDistribution(Att, Blocks, false); printf("\tinf %.3f, gain %.3f, val=%.3f\n", PrevInfo, PrevGain, PrevVal); } } /* Examine possible pair mergers and hill-climb */ MinGain = PrevGain / 2; while ( Blocks > 2 ) { BestVal = BestV1 = 0; BestGain = -Epsilon; /* Check reasonable subsets; if less than 3, bar mergers involving the largest block */ ReasonableSubsets = 0; Barred = 1; ForEach(V1, 1, Blocks) { if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++; if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1; } if ( ReasonableSubsets >= 3 ) Barred = 0; /* For each possible pair of values, calculate the gain and total info of a split in which they are treated as one. Keep track of the pair with the best gain. */ ForEach(V1, 1, Blocks-1) { ForEach(V2, V1+1, Blocks) { if ( V1 == Barred || V2 == Barred ) continue; Combine(V1, V2, Blocks); ThisGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks-1, KnownItems); ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items; Val = Worth(ThisInfo, ThisGain, Epsilon); Verbosity(4) { printf("\tcombine %d %d info %.3f gain %.3f val %.3f", V1, V2, ThisInfo, ThisGain, Val); PrintDistribution(Att, Blocks-1, false); } /* Force a split if less than two reasonable subsets, or using GAIN criterion Prefer this split to the previous one if gain >= MinGain (and previous < MinGain), or val >= previous best val */ if ( ThisGain >= MinGain && BestGain < MinGain || Val >= BestVal || ! BestV1 && ( ! GAINRATIO || ReasonableSubsets < 2 ) ) { BestVal = Val; BestGain = ThisGain; BestInfo = ThisInfo; BestV1 = V1; BestV2 = V2; } Uncombine(V1, V2); } } if ( GAINRATIO && ReasonableSubsets >= 2 && ( ! BestV1 || BestVal < PrevVal + 1E-5 || BestVal == PrevVal && BestGain < PrevGain ) ) break; PrevGain = BestGain; PrevInfo = BestInfo; PrevVal = BestVal; Combine(BestV1, BestV2, Blocks); ForEach(b, 0, Bytes-1) { Subset[Att][BestV1][b] |= Subset[Att][BestV2][b]; Subset[Att][BestV2][b] = Subset[Att][Blocks][b]; } Blocks--; Verbosity(2) { printf("\t\tform subset "); PrintSubset(Att, Subset[Att][BestV1]); printf(": %d subsets, inf %.3f, gain %.3f, val %.3f\n", Blocks, BestInfo, BestGain, BestVal); Verbosity(3) { printf("\t\tcombine %d, %d", BestV1, BestV2); PrintDistribution(Att, Blocks, false); } } } MINOBJS = SaveMINOBJS; if ( PrevVal <= 0 ) { Gain[Att] = -Epsilon; Info[Att] = 0; } else { Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems); Info[Att] = PrevInfo; if ( MissingValues ) { Blocks++; CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]); } Subsets[Att] = Blocks; Verbosity(2) printf("\tFinal subsets:"); Verbosity(3) PrintDistribution(Att, Blocks, false); Verbosity(2) printf("\tinf %.3f gain %.3f val %.3f\n", Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon)); }}/*************************************************************************//* *//* Combine the distribution figures of discrete attribute values *//* x and y, putting the combined figures in Freq[x][] and *//* ValFreq[x][], and saving old values in Slice1 and Slice2 *//* *//*************************************************************************/ Combine(x, y, Last)/* ------- */ DiscrValue x, y, Last;{ ClassNo c; ForEach(c, 0, MaxClass) { Slice1[c] = Freq[x][c]; Slice2[c] = Freq[y][c]; Freq[x][c] += Freq[y][c]; Freq[y][c] = Freq[Last][c]; } Slice1[MaxClass+1] = ValFreq[x]; Slice2[MaxClass+1] = ValFreq[y]; ValFreq[x] += ValFreq[y]; ValFreq[y] = ValFreq[Last];}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -