📄 prunerule.c
字号:
/*************************************************************************/
/* */
/* Pruning single rules */
/* -------------------- */
/* */
/*************************************************************************/
#include "defns.i"
#include "types.i"
#include "extern.i"
#include "rulex.i"
/* External data structures used in building rules */
extern ItemNo *TargetClassFreq, /* [Boolean] */
*Errors, /* [Condition] */
*Total; /* [Condition] */
extern float *Pessimistic, /* [Condition] */
*Actual, /* [Condition] */
*CondSigLevel; /* [Condition] */
extern Boolean **CondSatisfiedBy, /* [Condition][ItemNo] */
*Deleted;
#define Before(n1,n2) (n1->Tested < n2->Tested ||\
n1->NodeType < n2->NodeType ||\
n1->Tested == n2->Tested && n1->Cut < n2->Cut)
#define IsTarget(case) (Class(case) == TargetClass ? 1 : 0)
/*************************************************************************/
/* */
/* Prune the rule given by the conditions Cond, and the number of */
/* conditions NCond, and add the resulting rule to the current */
/* ruleset if it is sufficiently accurate */
/* */
/*************************************************************************/
PruneRule(Cond, NCond, TargetClass)
/* --------- */
Condition Cond[];
short NCond;
ClassNo TargetClass;
{
short d, dd, id, Bestd, Bestid, Remaining=NCond;
float DefaultError, Extra, AddErrs(), TableProb();
Boolean Alter, Satisfies(), NewRule(), Redundant();
Condition Hold;
ItemNo i;
ForEach(d, 0, NCond)
{
Deleted[d] = false;
}
/* Evaluate the satisfaction matrix */
TargetClassFreq[0] = TargetClassFreq[1] = 0;
ForEach(i, 0, MaxItem)
{
ForEach(d, 1, NCond)
{
CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]);
}
TargetClassFreq[IsTarget(Item[i])]++;
}
DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0);
/* Find conditions to delete */
Verbosity(1) printf("\n pruning rule for %s", ClassName[TargetClass]);
do
{
Alter = false;
FindTables(NCond, TargetClass);
/* Find the condition, deleting which would most improve
the accuracy of the rule.
Notes: The pessimistic error rate, and not the actual
error rate, is currently used.
When d is 0, we are dealing with all conditions. */
Bestd = id = 0;
Verbosity(1)
printf("\n Err Used Pess\tAbsent condition\n");
ForEach(d, 0, NCond)
{
if ( Deleted[d] ) continue;
if ( Total[d] )
{
Actual[d] = Errors[d] / (float) Total[d];
Extra = AddErrs((float) Total[d], (float) Errors[d]);
Pessimistic[d] = (Errors[d] + Extra) / Total[d];
}
else
{
Actual[d] = 0;
Pessimistic[d] = DefaultError;
}
Verbosity(1)
printf(" %5d%5d %4.1f%%",
Errors[d], Total[d], 100 * Pessimistic[d]);
if ( ! d )
{
Verbosity(1) printf("\t<base rule>\n");
}
else
{
id++;
/* If significance testing option used, invoke Fisher's
exact test here to assess probability that division
by d arises from chance. */
if ( SIGTEST )
{
CondSigLevel[d] =
TableProb(Errors[0],
Errors[d]-Errors[0],
Total[0]-Errors[0],
Total[d]-Total[0]-Errors[d]+Errors[0]);
Verbosity(1) printf(" Sig=%.3f", CondSigLevel[d]);
}
Verbosity(1) PrintCondition(Cond[d]);
/* Bestd identifies the condition with lowest pessimistic
error estimate */
if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] )
{
Bestd = d;
Bestid = id;
}
/* Alter is set true if we are going to drop a condition
(either because we get lower pessimistic est, or
because one of the conditions fails a significance test) */
if ( Pessimistic[d] <= Pessimistic[0] ||
Actual[d] <= Actual[0] ||
SIGTEST && CondSigLevel[d] > SIGTHRESH )
{
Alter = true;
}
}
}
if ( Alter )
{
Verbosity(1) printf("\teliminate test %d\n", Bestid);
Deleted[Bestd] = true;
Remaining--;
}
} while ( Alter && Remaining );
if ( ! Remaining || ! Total[0] )
{
return;
}
if ( Pessimistic[0] >= DefaultError )
{
Verbosity(1) printf("\ttoo inaccurate\n");
return;
}
/* Sort the conditions */
ForEach(d, 1, Remaining)
{
dd = 0;
ForEach(id, d, NCond)
{
if ( ! Deleted[id] &&
( ! dd ||
Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) )
{
dd = id;
}
}
if ( dd != d )
{
Hold = Cond[d];
Cond[d] = Cond[dd];
Cond[dd] = Hold;
Deleted[dd] = Deleted[d];
}
Deleted[d] = true;
}
NewRule(Cond, Remaining, TargetClass, Pessimistic[0]);
}
/*************************************************************************/
/* */
/* See whether condition R is redundant */
/* */
/*************************************************************************/
Boolean Redundant(R, Cond, NCond)
/* --------- */
Condition Cond[];
short R, NCond;
{
short d, v, vv;
Test t, Rt;
Boolean IsSubset();
Rt = Cond[R]->CondTest;
v = Cond[R]->TestValue;
ForEach(d, 1, NCond)
{
if ( Deleted[d] || d == R ) continue;
t = Cond[d]->CondTest;
vv = Cond[d]->TestValue;
if ( t->Tested != Rt->Tested ) continue;
switch ( t->NodeType )
{
case BrDiscr: /* test of discrete attribute */
return false;
case ThreshContin: /* test of continuous attribute */
if ( vv == v &&
( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) )
{
return true;
}
break;
case BrSubset: /* subset test on discrete attribute */
if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) )
{
return true;
}
}
}
return false;
}
/*************************************************************************/
/* */
/* Decide whether subset S1 of values is contained in subset S2 */
/* */
/*************************************************************************/
Boolean IsSubset(S1, S2, Att)
/* -------- */
Set S1, S2;
Attribute Att;
{
DiscrValue v;
ForEach(v, 1, MaxAttVal[Att])
{
if ( In(v, S1) && ! In(v, S2) ) return false;
}
return true;
}
/*************************************************************************/
/* */
/* Find the frequency distribution tables for the current conditions: */
/* */
/* Total[0] = items matching all conditions */
/* Total[d] = items matching all except condition d */
/* */
/* Errors[0] = wrong-class items matching all conditions */
/* Errors[d] = wrong-class items matching all but cond d */
/* */
/* This routine is critical to the efficiency of rule pruning. It */
/* computes the information above in one pass through the data, */
/* looking at cases that fail to satisfy at most one of the */
/* non-deleted conditions */
/* */
/*************************************************************************/
FindTables(NCond, TargetClass)
/* ----------- */
short NCond;
ClassNo TargetClass;
{
ItemNo i;
short Misses, Missed[2], d;
Boolean CorrectClass;
/* Clear distributions */
ForEach(d, 0, NCond)
{
Total[d] = Errors[d] = 0;
}
/* Set distributions */
ForEach(i, 0, MaxItem)
{
Misses = 0;
CorrectClass = IsTarget(Item[i]);
for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ )
{
if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] )
{
Missed[Misses++] = d;
}
}
if ( ! Misses )
{
UpdateCount(Total, Errors, 0, CorrectClass);
}
else
if ( Misses == 1 )
{
UpdateCount(Total, Errors, Missed[0], CorrectClass);
}
}
/* Adjust counts to reflect cases that met all conditions */
ForEach(d, 1, NCond)
{
if ( ! Deleted[d] )
{
Total[d] += Total[0];
Errors[d] += Errors[0];
}
}
}
/*************************************************************************/
/* */
/* Increment the counts Total[d] and Errors[d] */
/* */
/*************************************************************************/
UpdateCount(T, E, d, OK)
/* ----------- */
ItemNo T[], E[];
short d;
Boolean OK;
{
T[d]++;
if ( ! OK ) E[d]++;
}
/*************************************************************************/
/* */
/* Determine whether the given case description satisfies the given */
/* condition. */
/* */
/*************************************************************************/
Boolean Satisfies(CaseDesc, OneCond)
/* --------- */
Description CaseDesc;
Condition OneCond;
{
DiscrValue v;
float cv;
Test t;
short s;
Boolean Outcome;
t = OneCond->CondTest;
/* Determine the outcome of this test on this item */
switch ( t->NodeType )
{
case BrDiscr: /* test of discrete attribute */
v = DVal(CaseDesc, t->Tested);
Outcome = ( v == 0 ? -1 : v );
break;
case ThreshContin: /* test of continuous attribute */
cv = CVal(CaseDesc, t->Tested);
Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 );
break;
case BrSubset: /* subset test on discrete attribute */
v = DVal(CaseDesc, t->Tested);
Outcome = -1;
ForEach(s, 1, t->Forks)
{
if ( In(v, t->Subset[s]) )
{
Outcome = s;
break;
}
}
}
return ( Outcome == OneCond->TestValue );
}
/*************************************************************************/
/* */
/* Hypergeometric distribution (uses tabulated log factorials) */
/* */
/*************************************************************************/
double Hypergeom(a, r, A, B)
/* --------- */
int a, r, A, B;
{
return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] -
( LogFact[a] + LogFact[r-a] + LogFact[A-a]
+ LogFact[B-(r-a)] + LogFact[A+B]) );
}
/*************************************************************************/
/* */
/* TableProb examines the 2x2 contingency table t and computes the */
/* probability that a random division could have produced a split at */
/* least as extreme as this. Also known as "Fisher's Exact Test", */
/* after its inventor, R.A. Fisher. */
/* */
/*************************************************************************/
float TableProb(t11, t12, t21, t22)
/* --------- */
int t11, t12, t21, t22;
{
double Sum=0.0;
int A, B, r, a, k, a0;
/* First, rearrange the rows and columns of the table to get it into
canonical form */
if ( t11 + t12 > t21 + t22 )
{
A = t11 + t12;
B = t21 + t22;
if ( t11 * (t21 + t22) > t21 * (t11 + t12) )
{
a0 = t11;
r = t11 + t21;
}
else
{
a0 = t12;
r = t12 + t22;
}
}
else
{
A = t21 + t22;
B = t11 + t12;
if ( t21 * (t11 + t12) > t11 * (t21 + t22) )
{
a0 = t21;
r = t21 + t11;
}
else
{
a0 = t22;
r = t22 + t12;
}
}
/* Now compute the probability */
k = Min(r, A);
ForEach(a, a0, k)
{
Sum += Hypergeom(a, r, A, B);
}
return Sum;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -