📄 prune.c
字号:
/*************************************************************************/
/* */
/* Prune a decision tree and predict its error rate */
/* ------------------------------------------------ */
/* */
/*************************************************************************/
#include "defns.i"
#include "types.i"
#include "extern.i"
extern ItemCount *Weight;
Set *PossibleValues=Nil;
Boolean Changed;
#define LocalVerbosity(x) if (Sh >= 0 && VERBOSITY >= x)
#define Intab(x) Indent(x, "| ")
/*************************************************************************/
/* */
/* Prune tree T, returning true if tree has been modified */
/* */
/*************************************************************************/
Boolean Prune(T)
/* ----- */
Tree T;
{
ItemNo i;
float EstimateErrors();
Attribute a;
InitialiseWeights();
AllKnown = true;
Verbosity(1) printf("\n");
Changed = false;
EstimateErrors(T, 0, MaxItem, 0, true);
if ( SUBSET )
{
if ( ! PossibleValues )
{
PossibleValues = (Set *) calloc(MaxAtt+1, sizeof(Set));
}
ForEach(a, 0, MaxAtt)
{
if ( MaxAttVal[a] )
{
PossibleValues[a] = (Set) malloc((MaxAttVal[a]>>3) + 1);
ClearBits((MaxAttVal[a]>>3) + 1, PossibleValues[a]);
ForEach(i, 1, MaxAttVal[a])
{
SetBit(i, PossibleValues[a]);
}
}
}
CheckPossibleValues(T);
}
return Changed;
}
/*************************************************************************/
/* */
/* Estimate the errors in a given subtree */
/* */
/*************************************************************************/
float EstimateErrors(T, Fp, Lp, Sh, UpdateTree)
/* -------------- */
Tree T;
ItemNo Fp, Lp;
short Sh;
Boolean UpdateTree;
{
ItemNo i, Kp, Ep, Group();
ItemCount Cases, KnownCases, *LocalClassDist, TreeErrors, LeafErrors,
ExtraLeafErrors, BranchErrors, CountItems(), Factor, MaxFactor, AddErrs();
DiscrValue v, MaxBr;
ClassNo c, BestClass;
Boolean PrevAllKnown;
/* Generate the class frequency distribution */
Cases = CountItems(Fp, Lp);
LocalClassDist = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
ForEach(i, Fp, Lp)
{
LocalClassDist[ Class(Item[i]) ] += Weight[i];
}
/* Find the most frequent class and update the tree */
BestClass = T->Leaf;
ForEach(c, 0, MaxClass)
{
if ( LocalClassDist[c] > LocalClassDist[BestClass] )
{
BestClass = c;
}
}
LeafErrors = Cases - LocalClassDist[BestClass];
ExtraLeafErrors = AddErrs(Cases, LeafErrors);
if ( UpdateTree )
{
T->Items = Cases;
T->Leaf = BestClass;
memcpy(T->ClassDist, LocalClassDist, (MaxClass + 1) * sizeof(ItemCount));
}
if ( ! T->NodeType ) /* leaf */
{
TreeErrors = LeafErrors + ExtraLeafErrors;
if ( UpdateTree )
{
T->Errors = TreeErrors;
LocalVerbosity(1)
{
Intab(Sh);
printf("%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
T->Items, LeafErrors, T->Errors);
}
}
free(LocalClassDist);
return TreeErrors;
}
/* Estimate errors for each branch */
Kp = Group(0, Fp, Lp, T) + 1;
KnownCases = CountItems(Kp, Lp);
PrevAllKnown = AllKnown;
if ( Kp != Fp ) AllKnown = false;
TreeErrors = MaxFactor = 0;
ForEach(v, 1, T->Forks)
{
Ep = Group(v, Kp, Lp, T);
if ( Kp <= Ep )
{
Factor = CountItems(Kp, Ep) / KnownCases;
if ( Factor >= MaxFactor )
{
MaxBr = v;
MaxFactor = Factor;
}
ForEach(i, Fp, Kp-1)
{
Weight[i] *= Factor;
}
TreeErrors += EstimateErrors(T->Branch[v], Fp, Ep, Sh+1, UpdateTree);
Group(0, Fp, Ep, T);
ForEach(i, Fp, Kp-1)
{
Weight[i] /= Factor;
}
}
}
AllKnown = PrevAllKnown;
if ( ! UpdateTree )
{
free(LocalClassDist);
return TreeErrors;
}
/* See how the largest branch would fare */
BranchErrors = EstimateErrors(T->Branch[MaxBr], Fp, Lp, -1000, false);
LocalVerbosity(1)
{
Intab(Sh);
printf("%s: [%d%% N=%.2f tree=%.2f leaf=%.2f+%.2f br[%d]=%.2f]\n",
AttName[T->Tested],
(int) ((TreeErrors * 100) / (T->Items + 0.001)),
T->Items, TreeErrors, LeafErrors, ExtraLeafErrors,
MaxBr, BranchErrors);
}
/* See whether tree should be replaced with leaf or largest branch */
if ( LeafErrors + ExtraLeafErrors <= BranchErrors + 0.1 &&
LeafErrors + ExtraLeafErrors <= TreeErrors + 0.1 )
{
LocalVerbosity(1)
{
Intab(Sh);
printf("Replaced with leaf %s\n", ClassName[T->Leaf]);
}
T->NodeType = 0;
T->Errors = LeafErrors + ExtraLeafErrors;
Changed = true;
}
else
if ( BranchErrors <= TreeErrors + 0.1 )
{
LocalVerbosity(1)
{
Intab(Sh);
printf("Replaced with branch %d\n", MaxBr);
}
AllKnown = PrevAllKnown;
EstimateErrors(T->Branch[MaxBr], Fp, Lp, Sh, true);
memcpy((char *) T, (char *) T->Branch[MaxBr], sizeof(TreeRec));
Changed = true;
}
else
{
T->Errors = TreeErrors;
}
AllKnown = PrevAllKnown;
free(LocalClassDist);
return T->Errors;
}
/*************************************************************************/
/* */
/* Remove unnecessary subset tests on missing values */
/* */
/*************************************************************************/
CheckPossibleValues(T)
/* ------------------- */
Tree T;
{
Set HoldValues;
int v, Bytes, b;
Attribute A;
char Any=0;
if ( T->NodeType == BrSubset )
{
A = T->Tested;
Bytes = (MaxAttVal[A]>>3) + 1;
HoldValues = (Set) malloc(Bytes);
/* See if last (default) branch can be simplified or omitted */
ForEach(b, 0, Bytes-1)
{
T->Subset[T->Forks][b] &= PossibleValues[A][b];
Any |= T->Subset[T->Forks][b];
}
if ( ! Any )
{
T->Forks--;
}
/* Process each subtree, leaving only values in branch subset */
CopyBits(Bytes, PossibleValues[A], HoldValues);
ForEach(v, 1, T->Forks)
{
CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
CheckPossibleValues(T->Branch[v]);
}
CopyBits(Bytes, HoldValues, PossibleValues[A]);
free(HoldValues);
}
else
if ( T->NodeType )
{
ForEach(v, 1, T->Forks)
{
CheckPossibleValues(T->Branch[v]);
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -