📄 prunerule.c
字号:
/*************************************************************************//* *//* Pruning single rules *//* -------------------- *//* *//*************************************************************************/#include "defns.i"#include "types.i"#include "extern.i"#include "rulex.i"/* External data structures used in building rules */extern ItemNo *TargetClassFreq, /* [Boolean] */ *Errors, /* [Condition] */ *Total; /* [Condition] */extern float *Pessimistic, /* [Condition] */ *Actual, /* [Condition] */ *CondSigLevel; /* [Condition] */extern Boolean **CondSatisfiedBy, /* [Condition][ItemNo] */ *Deleted;#define Before(n1,n2) (n1->Tested < n2->Tested ||\ n1->NodeType < n2->NodeType ||\ n1->Tested == n2->Tested && n1->Cut < n2->Cut)#define IsTarget(case) (Class(case) == TargetClass ? 1 : 0)/*************************************************************************//* *//* Prune the rule given by the conditions Cond, and the number of *//* conditions NCond, and add the resulting rule to the current *//* ruleset if it is sufficiently accurate *//* *//*************************************************************************/ PruneRule(Cond, NCond, TargetClass)/* --------- */ Condition Cond[]; short NCond; ClassNo TargetClass;{ short d, dd, id, Bestd, Bestid, Remaining=NCond; float DefaultError, Extra, AddErrs(), TableProb(); Boolean Alter, Satisfies(), NewRule(), Redundant(); Condition Hold; ItemNo i; ForEach(d, 0, NCond) { Deleted[d] = false; } /* Evaluate the satisfaction matrix */ TargetClassFreq[0] = TargetClassFreq[1] = 0; ForEach(i, 0, MaxItem) { ForEach(d, 1, NCond) { CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]); } TargetClassFreq[IsTarget(Item[i])]++; } DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0); /* Find conditions to delete */ Verbosity(1) printf("\n pruning rule for %s", ClassName[TargetClass]); do { Alter = false; FindTables(NCond, TargetClass); /* Find the condition, deleting which would most improve the accuracy of the rule. Notes: The pessimistic error rate, and not the actual error rate, is currently used. When d is 0, we are dealing with all conditions. */ Bestd = id = 0; Verbosity(1) printf("\n Err Used Pess\tAbsent condition\n"); ForEach(d, 0, NCond) { if ( Deleted[d] ) continue; if ( Total[d] ) { Actual[d] = Errors[d] / (float) Total[d]; Extra = AddErrs((float) Total[d], (float) Errors[d]); Pessimistic[d] = (Errors[d] + Extra) / Total[d]; } else { Actual[d] = 0; Pessimistic[d] = DefaultError; } Verbosity(1) printf(" %5d%5d %4.1f%%", Errors[d], Total[d], 100 * Pessimistic[d]); if ( ! d ) { Verbosity(1) printf("\t<base rule>\n"); } else { id++; /* If significance testing option used, invoke Fisher's exact test here to assess probability that division by d arises from chance. */ if ( SIGTEST ) { CondSigLevel[d] = TableProb(Errors[0], Errors[d]-Errors[0], Total[0]-Errors[0], Total[d]-Total[0]-Errors[d]+Errors[0]); Verbosity(1) printf(" Sig=%.3f", CondSigLevel[d]); } Verbosity(1) PrintCondition(Cond[d]); /* Bestd identifies the condition with lowest pessimistic error estimate */ if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] ) { Bestd = d; Bestid = id; } /* Alter is set true if we are going to drop a condition (either because we get lower pessimistic est, or because one of the conditions fails a significance test) */ if ( Pessimistic[d] <= Pessimistic[0] || Actual[d] <= Actual[0] || SIGTEST && CondSigLevel[d] > SIGTHRESH ) { Alter = true; } } } if ( Alter ) { Verbosity(1) printf("\teliminate test %d\n", Bestid); Deleted[Bestd] = true; Remaining--; } } while ( Alter && Remaining ); if ( ! Remaining || ! Total[0] ) { return; } if ( Pessimistic[0] >= DefaultError ) { Verbosity(1) printf("\ttoo inaccurate\n"); return; } /* Sort the conditions */ ForEach(d, 1, Remaining) { dd = 0; ForEach(id, d, NCond) { if ( ! Deleted[id] && ( ! dd || Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) ) { dd = id; } } if ( dd != d ) { Hold = Cond[d]; Cond[d] = Cond[dd]; Cond[dd] = Hold; Deleted[dd] = Deleted[d]; } Deleted[d] = true; } NewRule(Cond, Remaining, TargetClass, Pessimistic[0]);}/*************************************************************************//* *//* See whether condition R is redundant *//* *//*************************************************************************/Boolean Redundant(R, Cond, NCond)/* --------- */ Condition Cond[]; short R, NCond;{ short d, v, vv; Test t, Rt; Boolean IsSubset(); Rt = Cond[R]->CondTest; v = Cond[R]->TestValue; ForEach(d, 1, NCond) { if ( Deleted[d] || d == R ) continue; t = Cond[d]->CondTest; vv = Cond[d]->TestValue; if ( t->Tested != Rt->Tested ) continue; switch ( t->NodeType ) { case BrDiscr: /* test of discrete attribute */ return false; case ThreshContin: /* test of continuous attribute */ if ( vv == v && ( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) ) { return true; } break; case BrSubset: /* subset test on discrete attribute */ if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) ) { return true; } } } return false;}/*************************************************************************//* *//* Decide whether subset S1 of values is contained in subset S2 *//* *//*************************************************************************/Boolean IsSubset(S1, S2, Att)/* -------- */ Set S1, S2; Attribute Att;{ DiscrValue v; ForEach(v, 1, MaxAttVal[Att]) { if ( In(v, S1) && ! In(v, S2) ) return false; } return true;}/*************************************************************************//* *//* Find the frequency distribution tables for the current conditions: *//* *//* Total[0] = items matching all conditions *//* Total[d] = items matching all except condition d *//* *//* Errors[0] = wrong-class items matching all conditions *//* Errors[d] = wrong-class items matching all but cond d *//* *//* This routine is critical to the efficiency of rule pruning. It *//* computes the information above in one pass through the data, *//* looking at cases that fail to satisfy at most one of the *//* non-deleted conditions *//* *//*************************************************************************/ FindTables(NCond, TargetClass)/* ----------- */ short NCond; ClassNo TargetClass;{ ItemNo i; short Misses, Missed[2], d; Boolean CorrectClass; /* Clear distributions */ ForEach(d, 0, NCond) { Total[d] = Errors[d] = 0; } /* Set distributions */ ForEach(i, 0, MaxItem) { Misses = 0; CorrectClass = IsTarget(Item[i]); for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ ) { if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] ) { Missed[Misses++] = d; } } if ( ! Misses ) { UpdateCount(Total, Errors, 0, CorrectClass); } else if ( Misses == 1 ) { UpdateCount(Total, Errors, Missed[0], CorrectClass); } } /* Adjust counts to reflect cases that met all conditions */ ForEach(d, 1, NCond) { if ( ! Deleted[d] ) { Total[d] += Total[0]; Errors[d] += Errors[0]; } }}/*************************************************************************//* *//* Increment the counts Total[d] and Errors[d] *//* *//*************************************************************************/ UpdateCount(T, E, d, OK)/* ----------- */ ItemNo T[], E[]; short d; Boolean OK;{ T[d]++; if ( ! OK ) E[d]++;}/*************************************************************************//* *//* Determine whether the given case description satisfies the given *//* condition. *//* *//*************************************************************************/Boolean Satisfies(CaseDesc, OneCond)/* --------- */ Description CaseDesc; Condition OneCond;{ DiscrValue v; float cv; Test t; short s; Boolean Outcome; t = OneCond->CondTest; /* Determine the outcome of this test on this item */ switch ( t->NodeType ) { case BrDiscr: /* test of discrete attribute */ v = DVal(CaseDesc, t->Tested); Outcome = ( v == 0 ? -1 : v ); break; case ThreshContin: /* test of continuous attribute */ cv = CVal(CaseDesc, t->Tested); Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 ); break; case BrSubset: /* subset test on discrete attribute */ v = DVal(CaseDesc, t->Tested); Outcome = -1; ForEach(s, 1, t->Forks) { if ( In(v, t->Subset[s]) ) { Outcome = s; break; } } } return ( Outcome == OneCond->TestValue );}/*************************************************************************//* *//* Hypergeometric distribution (uses tabulated log factorials) *//* *//*************************************************************************/double Hypergeom(a, r, A, B)/* --------- */ int a, r, A, B;{ return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] - ( LogFact[a] + LogFact[r-a] + LogFact[A-a] + LogFact[B-(r-a)] + LogFact[A+B]) );}/*************************************************************************//* *//* TableProb examines the 2x2 contingency table t and computes the *//* probability that a random division could have produced a split at *//* least as extreme as this. Also known as "Fisher's Exact Test", *//* after its inventor, R.A. Fisher. *//* *//*************************************************************************/float TableProb(t11, t12, t21, t22)/* --------- */ int t11, t12, t21, t22;{ double Sum=0.0; int A, B, r, a, k, a0; /* First, rearrange the rows and columns of the table to get it into canonical form */ if ( t11 + t12 > t21 + t22 ) { A = t11 + t12; B = t21 + t22; if ( t11 * (t21 + t22) > t21 * (t11 + t12) ) { a0 = t11; r = t11 + t21; } else { a0 = t12; r = t12 + t22; } } else { A = t21 + t22; B = t11 + t12; if ( t21 * (t11 + t12) > t11 * (t21 + t22) ) { a0 = t21; r = t21 + t11; } else { a0 = t22; r = t22 + t12; } } /* Now compute the probability */ k = Min(r, A); ForEach(a, a0, k) { Sum += Hypergeom(a, r, A, B); } return Sum;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -