📄 build.c
字号:
/*************************************************************************/
/* */
/* Central tree-forming algorithm incorporating all criteria */
/* --------------------------------------------------------- */
/* */
/*************************************************************************/
#include "defns.i"
#include "types.i"
#include "extern.i"
ItemCount
*Weight, /* Weight[i] = current fraction of item i */
**Freq, /* Freq[x][c] = no. items of class c with outcome x */
*ValFreq, /* ValFreq[x] = no. items with outcome x */
*ClassFreq; /* ClassFreq[c] = no. items of class c */
float
*Gain, /* Gain[a] = info gain by split on att a */
*Info, /* Info[a] = potential info of split on att a */
*Bar, /* Bar[a] = best threshold for contin att a */
*UnknownRate; /* UnknownRate[a] = current unknown rate for att a */
Boolean
*Tested, /* Tested[a] set if att a has already been tested */
MultiVal; /* true when all atts have many values */
/* External variables initialised here */
extern float
*SplitGain, /* SplitGain[i] = gain with att value of item i as threshold */
*SplitInfo; /* SplitInfo[i] = potential info ditto */
extern ItemCount
*Slice1, /* Slice1[c] = saved values of Freq[x][c] in subset.c */
*Slice2; /* Slice2[c] = saved values of Freq[y][c] */
extern Set
**Subset; /* Subset[a][s] = subset s for att a */
extern short
*Subsets; /* Subsets[a] = no. subsets for att a */
/*************************************************************************/
/* */
/* Allocate space for tree tables */
/* */
/*************************************************************************/
InitialiseTreeData()
/* ------------------ */
{
DiscrValue v;
Attribute a;
Tested = (char *) calloc(MaxAtt+1, sizeof(char));
Gain = (float *) calloc(MaxAtt+1, sizeof(float));
Info = (float *) calloc(MaxAtt+1, sizeof(float));
Bar = (float *) calloc(MaxAtt+1, sizeof(float));
Subset = (Set **) calloc(MaxAtt+1, sizeof(Set *));
ForEach(a, 0, MaxAtt)
{
if ( MaxAttVal[a] )
{
Subset[a] = (Set *) calloc(MaxDiscrVal+1, sizeof(Set));
ForEach(v, 0, MaxAttVal[a])
{
Subset[a][v] = (Set) malloc((MaxAttVal[a]>>3) + 1);
}
}
}
Subsets = (short *) calloc(MaxAtt+1, sizeof(short));
SplitGain = (float *) calloc(MaxItem+1, sizeof(float));
SplitInfo = (float *) calloc(MaxItem+1, sizeof(float));
Weight = (ItemCount *) calloc(MaxItem+1, sizeof(ItemCount));
Freq = (ItemCount **) calloc(MaxDiscrVal+1, sizeof(ItemCount *));
ForEach(v, 0, MaxDiscrVal)
{
Freq[v] = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
}
ValFreq = (ItemCount *) calloc(MaxDiscrVal+1, sizeof(ItemCount));
ClassFreq = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
Slice1 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));
Slice2 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));
UnknownRate = (float *) calloc(MaxAtt+1, sizeof(float));
/* Check whether all attributes have many discrete values */
MultiVal = true;
if ( ! SUBSET )
{
for ( a = 0 ; MultiVal && a <= MaxAtt ; a++ )
{
if ( SpecialStatus[a] != IGNORE )
{
MultiVal = MaxAttVal[a] >= 0.3 * (MaxItem + 1);
}
}
}
}
/*************************************************************************/
/* */
/* Initialise the weight of each item */
/* */
/*************************************************************************/
InitialiseWeights()
/* ----------------- */
{
ItemNo i;
ForEach(i, 0, MaxItem)
{
Weight[i] = 1.0;
}
}
/*************************************************************************/
/* */
/* Build a decision tree for the cases Fp through Lp: */
/* */
/* - if all cases are of the same class, the tree is a leaf and so */
/* the leaf is returned labelled with this class */
/* */
/* - for each attribute, calculate the potential information provided */
/* by a test on the attribute (based on the probabilities of each */
/* case having a particular value for the attribute), and the gain */
/* in information that would result from a test on the attribute */
/* (based on the probabilities of each case with a particular */
/* value for the attribute being of a particular class) */
/* */
/* - on the basis of these figures, and depending on the current */
/* selection criterion, find the best attribute to branch on. */
/* Note: this version will not allow a split on an attribute */
/* unless two or more subsets have at least MINOBJS items. */
/* */
/* - try branching and test whether better than forming a leaf */
/* */
/*************************************************************************/
Tree FormTree(Fp, Lp)
/* --------- */
ItemNo Fp, Lp;
{
ItemNo i, Kp, Ep, Group();
ItemCount Cases, NoBestClass, KnownCases, CountItems();
float Factor, BestVal, Val, AvGain=0, Worth();
Attribute Att, BestAtt, Possible=0;
ClassNo c, BestClass;
Tree Node, Leaf();
DiscrValue v;
Boolean PrevAllKnown;
Cases = CountItems(Fp, Lp);
/* Generate the class frequency distribution */
ForEach(c, 0, MaxClass)
{
ClassFreq[c] = 0;
}
ForEach(i, Fp, Lp)
{
ClassFreq[ Class(Item[i]) ] += Weight[i];
}
/* Find the most frequent class */
BestClass = 0;
ForEach(c, 0, MaxClass)
{
if ( ClassFreq[c] > ClassFreq[BestClass] )
{
BestClass = c;
}
}
NoBestClass = ClassFreq[BestClass];
Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass);
/* If all cases are of the same class or there are not enough
cases to divide, the tree is a leaf */
if ( NoBestClass == Cases || Cases < 2 * MINOBJS )
{
return Node;
}
Verbosity(1)
printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases);
/* For each available attribute, find the information and gain */
ForEach(Att, 0, MaxAtt)
{
Gain[Att] = -Epsilon;
if ( SpecialStatus[Att] == IGNORE ) continue;
if ( MaxAttVal[Att] )
{
/* discrete valued attribute */
if ( SUBSET && MaxAttVal[Att] > 2 )
{
EvalSubset(Att, Fp, Lp, Cases);
}
else
if ( ! Tested[Att] )
{
EvalDiscreteAtt(Att, Fp, Lp, Cases);
}
}
else
{
/* continuous attribute */
EvalContinuousAtt(Att, Fp, Lp);
}
/* Update average gain, excluding attributes with very many values */
if ( Gain[Att] > -Epsilon &&
( MultiVal || MaxAttVal[Att] < 0.3 * (MaxItem + 1) ) )
{
Possible++;
AvGain += Gain[Att];
}
}
/* Find the best attribute according to the given criterion */
BestVal = -Epsilon;
BestAtt = None;
AvGain = ( Possible ? AvGain / Possible : 1E6 );
Verbosity(2)
{
if ( AvGain < 1E6 ) printf("\taverage gain %.3f\n", AvGain);
}
ForEach(Att, 0, MaxAtt)
{
if ( Gain[Att] > -Epsilon )
{
Val = Worth(Info[Att], Gain[Att], AvGain);
if ( Val > BestVal )
{
BestAtt = Att;
BestVal = Val;
}
}
}
/* Decide whether to branch or not */
if ( BestAtt != None )
{
Verbosity(1)
{
printf("\tbest attribute %s", AttName[BestAtt]);
if ( ! MaxAttVal[BestAtt] )
{
printf(" cut %.3f", Bar[BestAtt]);
}
printf(" inf %.3f gain %.3f val %.3f\n",
Info[BestAtt], Gain[BestAtt], BestVal);
}
/* Build a node of the selected test */
if ( MaxAttVal[BestAtt] )
{
/* Discrete valued attribute */
if ( SUBSET && MaxAttVal[BestAtt] > 2 )
{
SubsetTest(Node, BestAtt);
}
else
{
DiscreteTest(Node, BestAtt);
}
}
else
{
/* Continuous attribute */
ContinTest(Node, BestAtt);
}
/* Remove unknown attribute values */
PrevAllKnown = AllKnown;
Kp = Group(0, Fp, Lp, Node) + 1;
if ( Kp != Fp ) AllKnown = false;
KnownCases = Cases - CountItems(Fp, Kp-1);
UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001);
Verbosity(1)
{
if ( UnknownRate[BestAtt] > 0 )
{
printf("\tunknown rate for %s = %.3f\n",
AttName[BestAtt], UnknownRate[BestAtt]);
}
}
/* Recursive divide and conquer */
++Tested[BestAtt];
Ep = Kp - 1;
Node->Errors = 0;
ForEach(v, 1, Node->Forks)
{
Ep = Group(v, Kp, Lp, Node);
if ( Kp <= Ep )
{
Factor = CountItems(Kp, Ep) / KnownCases;
ForEach(i, Fp, Kp-1)
{
Weight[i] *= Factor;
}
Node->Branch[v] = FormTree(Fp, Ep);
Node->Errors += Node->Branch[v]->Errors;
Group(0, Fp, Ep, Node);
ForEach(i, Fp, Kp-1)
{
Weight[i] /= Factor;
}
}
else
{
Node->Branch[v] = Leaf(Node->ClassDist, BestClass, 0.0, 0.0);
}
}
--Tested[BestAtt];
AllKnown = PrevAllKnown;
/* See whether we would have been no worse off with a leaf */
if ( Node->Errors >= Cases - NoBestClass - Epsilon )
{
Verbosity(1)
printf("Collapse tree for %d items to leaf %s\n",
Lp - Fp + 1, ClassName[BestClass]);
Node->NodeType = 0;
}
}
else
{
Verbosity(1)
printf("\tno sensible splits %.1f/%.1f\n",
Cases, Cases - NoBestClass);
}
return Node;
}
/*************************************************************************/
/* */
/* Group together the items corresponding to branch V of a test */
/* and return the index of the last such */
/* */
/* Note: if V equals zero, group the unknown values */
/* */
/*************************************************************************/
ItemNo Group(V, Fp, Lp, TestNode)
/* ----- */
DiscrValue V;
ItemNo Fp, Lp;
Tree TestNode;
{
ItemNo i;
Attribute Att;
float Thresh;
Set SS;
void Swap();
Att = TestNode->Tested;
if ( V )
{
/* Group items on the value of attribute Att, and depending
on the type of branch */
switch ( TestNode->NodeType )
{
case BrDiscr:
ForEach(i, Fp, Lp)
{
if ( DVal(Item[i], Att) == V ) Swap(Fp++, i);
}
break;
case ThreshContin:
Thresh = TestNode->Cut;
ForEach(i, Fp, Lp)
{
if ( (CVal(Item[i], Att) <= Thresh) == (V == 1) ) Swap(Fp++, i);
}
break;
case BrSubset:
SS = TestNode->Subset[V];
ForEach(i, Fp, Lp)
{
if ( In(DVal(Item[i], Att), SS) ) Swap(Fp++, i);
}
break;
}
}
else
{
/* Group together unknown values */
switch ( TestNode->NodeType )
{
case BrDiscr:
case BrSubset:
ForEach(i, Fp, Lp)
{
if ( ! DVal(Item[i], Att) ) Swap(Fp++, i);
}
break;
case ThreshContin:
ForEach(i, Fp, Lp)
{
if ( CVal(Item[i], Att) == Unknown ) Swap(Fp++, i);
}
break;
}
}
return Fp - 1;
}
/*************************************************************************/
/* */
/* Return the total weight of items from Fp to Lp */
/* */
/*************************************************************************/
ItemCount CountItems(Fp, Lp)
/* ---------- */
ItemNo Fp, Lp;
{
register ItemCount Sum=0.0, *Wt, *LWt;
if ( AllKnown ) return Lp - Fp + 1;
for ( Wt = Weight + Fp, LWt = Weight + Lp ; Wt <= LWt ; )
{
Sum += *Wt++;
}
return Sum;
}
/*************************************************************************/
/* */
/* Exchange items at a and b */
/* */
/*************************************************************************/
void Swap(a,b)
/* ---- */
ItemNo a, b;
{
register Description Hold;
register ItemCount HoldW;
Hold = Item[a];
Item[a] = Item[b];
Item[b] = Hold;
HoldW = Weight[a];
Weight[a] = Weight[b];
Weight[b] = HoldW;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -