📄 subset.c
字号:
/*************************************************************************/
/* */
/* Evaluation of the subsetting of a discrete attribute */
/* ---------------------------------------------------- */
/* */
/*************************************************************************/
#include "buildex.i"
ItemCount
*Slice1, /* Slice1[c] = saved values of Freq[x][c] in subset.c */
*Slice2; /* Slice2[c] = saved values of Freq[y][c] */
Set
**Subset; /* Subset[a][s] = subset s for att a */
short
*Subsets; /* Subsets[a] = no. subsets for att a */
/*************************************************************************/
/* */
/* Evaluate subsetting a discrete attribute and form the chosen */
/* subsets Subset[Att][], setting Subsets[Att] to the number of */
/* subsets, and the Info[] and Gain[] of a test on the attribute */
/* */
/*************************************************************************/
EvalSubset(Att, Fp, Lp, Items)
/* ---------- */
Attribute Att;
ItemNo Fp, Lp;
ItemCount Items;
{
DiscrValue V1, V2, BestV1, BestV2, Barred;
ItemCount KnownItems;
ClassNo c;
float BaseInfo, MinGain, ThisGain, ThisInfo,
Val, BestVal, BestGain, BestInfo,
PrevVal, PrevGain, PrevInfo,
DiscrKnownBaseInfo(), Worth(), ComputeGain(), TotalInfo();
short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b;
Boolean MergedSubsets = false;
int SaveMINOBJS;
SaveMINOBJS = MINOBJS;
MINOBJS = 1;
/* First compute Freq[][], ValFreq[], base info, and the gain
and total info of a split on discrete attribute Att */
ComputeFrequencies(Att, Fp, Lp);
KnownItems = Items - ValFreq[0];
if ( KnownItems < Epsilon )
{
Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]);
Gain[Att] = -Epsilon;
Info[Att] = 0;
return;
}
BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]);
PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att],KnownItems);
PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;
PrevVal = Worth(PrevInfo, PrevGain, Epsilon);
Verbosity(2)
{
printf("\tAtt %s", AttName[Att]);
Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true);
printf("\tinf %.3f, gain %.3f, val=%.3f\n",
PrevInfo, PrevGain, PrevVal);
}
/* Eliminate unrepresented attribute values from Freq[] and ValFreq[]
and form a separate subset for each represented attribute value */
Bytes = (MaxAttVal[Att]>>3) + 1;
ClearBits(Bytes, Subset[Att][0]);
ForEach(V1, 1, MaxAttVal[Att])
{
if ( ValFreq[V1] > 0.5 )
{
if ( ++Blocks < V1 )
{
ValFreq[Blocks] = ValFreq[V1];
ForEach(c, 0, MaxClass)
{
Freq[Blocks][c] = Freq[V1][c];
}
}
ClearBits(Bytes, Subset[Att][Blocks]);
SetBit(V1, Subset[Att][Blocks]);
}
else
{
SetBit(V1, Subset[Att][0]);
MissingValues++;
}
}
/* Merge any single-class subsets with others of the same class */
/* Note: have ValFreq[V] > 0 for all V */
ForEach(V1, 1, Blocks-1)
{
for ( c = 0 ; Freq[V1][c] < 0.1 ; c++ )
;
if ( Freq[V1][c] < ValFreq[V1] - 0.1 ) continue;
/* Now have a single class -- look for others */
for ( V2 = V1+1 ; V2 <= Blocks ; )
{
if ( Freq[V2][c] < ValFreq[V2] - 0.1 )
{
V2++;
}
else
{
/* Merge these subsets */
Combine(V1, V2, Blocks);
ForEach(b, 0, Bytes-1)
{
Subset[Att][V1][b] |= Subset[Att][V2][b];
Subset[Att][V2][b] = Subset[Att][Blocks][b];
}
Blocks--;
MergedSubsets = true;
}
}
}
if ( MergedSubsets )
{
PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items;
PrevVal = Worth(PrevInfo, PrevGain, Epsilon);
Verbosity(2)
{
printf("\tAfter merging single-class subsets:");
Verbosity(3) PrintDistribution(Att, Blocks, false);
printf("\tinf %.3f, gain %.3f, val=%.3f\n",
PrevInfo, PrevGain, PrevVal);
}
}
/* Examine possible pair mergers and hill-climb */
MinGain = PrevGain / 2;
while ( Blocks > 2 )
{
BestVal = BestV1 = 0;
BestGain = -Epsilon;
/* Check reasonable subsets; if less than 3, bar mergers
involving the largest block */
ReasonableSubsets = 0;
Barred = 1;
ForEach(V1, 1, Blocks)
{
if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++;
if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1;
}
if ( ReasonableSubsets >= 3 ) Barred = 0;
/* For each possible pair of values, calculate the gain and
total info of a split in which they are treated as one.
Keep track of the pair with the best gain. */
ForEach(V1, 1, Blocks-1)
{
ForEach(V2, V1+1, Blocks)
{
if ( V1 == Barred || V2 == Barred ) continue;
Combine(V1, V2, Blocks);
ThisGain = ComputeGain(BaseInfo, UnknownRate[Att],
Blocks-1, KnownItems);
ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items;
Val = Worth(ThisInfo, ThisGain, Epsilon);
Verbosity(4)
{
printf("\tcombine %d %d info %.3f gain %.3f val %.3f",
V1, V2, ThisInfo, ThisGain, Val);
PrintDistribution(Att, Blocks-1, false);
}
/* Force a split if
less than two reasonable subsets, or
using GAIN criterion
Prefer this split to the previous one if
gain >= MinGain (and previous < MinGain), or
val >= previous best val */
if ( ThisGain >= MinGain && BestGain < MinGain ||
Val >= BestVal ||
! BestV1 && ( ! GAINRATIO || ReasonableSubsets < 2 ) )
{
BestVal = Val;
BestGain = ThisGain;
BestInfo = ThisInfo;
BestV1 = V1;
BestV2 = V2;
}
Uncombine(V1, V2);
}
}
if ( GAINRATIO &&
ReasonableSubsets >= 2 &&
( ! BestV1 ||
BestVal < PrevVal + 1E-5 ||
BestVal == PrevVal && BestGain < PrevGain ) ) break;
PrevGain = BestGain;
PrevInfo = BestInfo;
PrevVal = BestVal;
Combine(BestV1, BestV2, Blocks);
ForEach(b, 0, Bytes-1)
{
Subset[Att][BestV1][b] |= Subset[Att][BestV2][b];
Subset[Att][BestV2][b] = Subset[Att][Blocks][b];
}
Blocks--;
Verbosity(2)
{
printf("\t\tform subset ");
PrintSubset(Att, Subset[Att][BestV1]);
printf(": %d subsets, inf %.3f, gain %.3f, val %.3f\n",
Blocks, BestInfo, BestGain, BestVal);
Verbosity(3)
{
printf("\t\tcombine %d, %d", BestV1, BestV2);
PrintDistribution(Att, Blocks, false);
}
}
}
MINOBJS = SaveMINOBJS;
if ( PrevVal <= 0 )
{
Gain[Att] = -Epsilon;
Info[Att] = 0;
}
else
{
Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
Info[Att] = PrevInfo;
if ( MissingValues )
{
Blocks++;
CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]);
}
Subsets[Att] = Blocks;
Verbosity(2) printf("\tFinal subsets:");
Verbosity(3) PrintDistribution(Att, Blocks, false);
Verbosity(2)
printf("\tinf %.3f gain %.3f val %.3f\n",
Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon));
}
}
/*************************************************************************/
/* */
/* Combine the distribution figures of discrete attribute values */
/* x and y, putting the combined figures in Freq[x][] and */
/* ValFreq[x][], and saving old values in Slice1 and Slice2 */
/* */
/*************************************************************************/
Combine(x, y, Last)
/* ------- */
DiscrValue x, y, Last;
{
ClassNo c;
ForEach(c, 0, MaxClass)
{
Slice1[c] = Freq[x][c];
Slice2[c] = Freq[y][c];
Freq[x][c] += Freq[y][c];
Freq[y][c] = Freq[Last][c];
}
Slice1[MaxClass+1] = ValFreq[x];
Slice2[MaxClass+1] = ValFreq[y];
ValFreq[x] += ValFreq[y];
ValFreq[y] = ValFreq[Last];
}
/*************************************************************************/
/* */
/* Restore old class distribution figures of discrete attribute */
/* values x and y from Slice1 and Slice2 */
/* */
/*************************************************************************/
Uncombine(x, y)
/* --------- */
DiscrValue x, y;
{
ClassNo c;
ForEach(c, 0, MaxClass)
{
Freq[x][c] = Slice1[c];
Freq[y][c] = Slice2[c];
}
ValFreq[x] = Slice1[MaxClass+1];
ValFreq[y] = Slice2[MaxClass+1];
}
/*************************************************************************/
/* */
/* Print the values of attribute Att which are in the subset Ss */
/* */
/*************************************************************************/
PrintSubset(Att, Ss)
/* ----------- */
Attribute Att;
Set Ss;
{
DiscrValue V1;
Boolean First=true;
ForEach(V1, 1, MaxAttVal[Att])
{
if ( In(V1, Ss) )
{
if ( First )
{
First = false;
}
else
{
printf(", ");
}
printf("%s", AttValName[Att][V1]);
}
}
}
/*************************************************************************/
/* */
/* Construct and return a node for a test on a subset of values */
/* */
/*************************************************************************/
SubsetTest(Node, Att)
/* ----------- */
Tree Node;
Attribute Att;
{
ItemCount CountItems();
short S, Bytes;
Sprout(Node, Subsets[Att]);
Node->NodeType = BrSubset;
Node->Tested = Att;
Node->Errors = 0;
Bytes = (MaxAttVal[Att]>>3) + 1;
Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set));
ForEach(S, 1, Node->Forks)
{
Node->Subset[S] = (Set) malloc(Bytes);
CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -