⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 prunerule.c

📁 决策树是用二叉树形图来表示处理逻辑的一种工具。可以直观、清晰地表达加工的逻辑要求。特别适合于判断因素比较少、逻辑组合关系不复杂的情况。
💻 C
字号:
/*************************************************************************//*								   	 *//*	Pruning single rules					   	 *//*	--------------------				   	      	 *//*								   	 *//*************************************************************************/#include "defns.i"#include "types.i"#include "extern.i"#include "rulex.i"/*  External data structures used in building rules  */extern ItemNo	*TargetClassFreq,	/* [Boolean] */		*Errors,		/* [Condition] */		*Total;			/* [Condition] */extern float	*Pessimistic,		/* [Condition] */		*Actual,		/* [Condition] */		*CondSigLevel;		/* [Condition] */extern Boolean	**CondSatisfiedBy,	/* [Condition][ItemNo] */		*Deleted;#define Before(n1,n2)  (n1->Tested < n2->Tested ||\			n1->NodeType < n2->NodeType ||\		        n1->Tested == n2->Tested && n1->Cut < n2->Cut)#define IsTarget(case)  (Class(case) == TargetClass ? 1 : 0)/*************************************************************************//*									 *//*  Prune the rule given by the conditions Cond, and the number of	 *//*  conditions NCond, and add the resulting rule to the current		 *//*  ruleset if it is sufficiently accurate				 *//*									 *//*************************************************************************/    PruneRule(Cond, NCond, TargetClass)/*  ---------  */    Condition Cond[];    short NCond;    ClassNo TargetClass;{    short d, dd, id, Bestd, Bestid, Remaining=NCond;    float DefaultError, Extra, AddErrs(), TableProb();    Boolean Alter, Satisfies(), NewRule(), Redundant();    Condition Hold;    ItemNo i;    ForEach(d, 0, NCond)    {	Deleted[d] = false;    }    /*  Evaluate the satisfaction matrix  */    TargetClassFreq[0] = TargetClassFreq[1] = 0;    ForEach(i, 0, MaxItem)    {	ForEach(d, 1, NCond)	{	    CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]);	}	TargetClassFreq[IsTarget(Item[i])]++;    }    DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0);    /*  Find conditions to delete  */    Verbosity(1) printf("\n  pruning rule for %s", ClassName[TargetClass]);    do    {	Alter = false;	FindTables(NCond, TargetClass);	/*  Find the condition, deleting which would most improve	    the accuracy of the rule.	    Notes: The pessimistic error rate, and not the actual		   error rate, is currently used.	    	   When d is 0, we are dealing with all conditions.  */	Bestd = id = 0;	Verbosity(1)	    printf("\n     Err Used   Pess\tAbsent condition\n");	ForEach(d, 0, NCond)	{	    if ( Deleted[d] ) continue;	    if ( Total[d] )	    {		Actual[d] = Errors[d] / (float) Total[d];		Extra = AddErrs((float) Total[d], (float) Errors[d]);		Pessimistic[d] = (Errors[d] + Extra) / Total[d];	    }	    else	    {		Actual[d] = 0;		Pessimistic[d] = DefaultError;	    }	    Verbosity(1)		printf("   %5d%5d  %4.1f%%",		       Errors[d], Total[d], 100 * Pessimistic[d]);	    if ( ! d )	    {		Verbosity(1) printf("\t<base rule>\n");	    }	    else	    {		id++;		/*  If significance testing option used, invoke Fisher's		    exact test here to assess probability that division		    by d arises from chance.  */		if ( SIGTEST )		{		    CondSigLevel[d] =			TableProb(Errors[0],				   Errors[d]-Errors[0],				   Total[0]-Errors[0],			           Total[d]-Total[0]-Errors[d]+Errors[0]);		    Verbosity(1) printf("  Sig=%.3f", CondSigLevel[d]);		}		Verbosity(1) PrintCondition(Cond[d]);		/*  Bestd identifies the condition with lowest pessimistic		    error  estimate  */		if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] )		{		    Bestd = d;		    Bestid = id;		}		/*  Alter is set true if we are going to drop a condition		    (either because we get lower pessimistic est, or		    because one of the conditions fails a significance test)  */		if ( Pessimistic[d] <= Pessimistic[0] ||		     Actual[d] <= Actual[0]  ||		     SIGTEST && CondSigLevel[d] > SIGTHRESH )		{		    Alter = true;		}	    }	}	if ( Alter )	{	    Verbosity(1) printf("\teliminate test %d\n", Bestid);	    Deleted[Bestd] = true;	    Remaining--;	}    } while ( Alter && Remaining );    if ( ! Remaining || ! Total[0] )    {	return;    }    if ( Pessimistic[0] >= DefaultError )    {	Verbosity(1) printf("\ttoo inaccurate\n");	return;    }    /*  Sort the conditions  */    ForEach(d, 1, Remaining)    {	dd =  0;	ForEach(id, d, NCond)	{	    if ( ! Deleted[id] &&		 ( ! dd ||		   Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) )	    {		dd = id;	    }	}	if ( dd != d )	{	    Hold    = Cond[d];	    Cond[d] = Cond[dd];	    Cond[dd] = Hold;	    Deleted[dd] = Deleted[d];	}	Deleted[d] = true;    }    NewRule(Cond, Remaining, TargetClass, Pessimistic[0]);}/*************************************************************************//*									 *//*  See whether condition R is redundant                           	 *//*									 *//*************************************************************************/Boolean Redundant(R, Cond, NCond)/*      ---------  */    Condition Cond[];    short R, NCond;{    short d, v, vv;    Test t, Rt;    Boolean IsSubset();    Rt = Cond[R]->CondTest;    v =  Cond[R]->TestValue;    ForEach(d, 1, NCond)    {	if ( Deleted[d] || d == R ) continue;	t = Cond[d]->CondTest;	vv = Cond[d]->TestValue;	if ( t->Tested != Rt->Tested ) continue;	switch ( t->NodeType )	{	    case BrDiscr:  /* test of discrete attribute */		return false;	    case ThreshContin:  /* test of continuous attribute */		if ( vv == v &&		     ( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) )		{		    return true;		}		break;	    case BrSubset:  /* subset test on discrete attribute  */		if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) )		{		    return true;		}	}    }    return false;}/*************************************************************************//*									 *//*  Decide whether subset S1 of values is contained in subset S2	 *//*									 *//*************************************************************************/Boolean IsSubset(S1, S2, Att)/*      --------  */    Set S1, S2;    Attribute Att;{    DiscrValue v;    ForEach(v, 1, MaxAttVal[Att])    {	if ( In(v, S1) && ! In(v, S2) ) return false;    }    return true;}/*************************************************************************//*									 *//*  Find the frequency distribution tables for the current conditions: 	 *//*									 *//*	Total[0] = items matching all conditions		   	 *//*	Total[d] = items matching all except condition d	   	 *//*									 *//*	Errors[0] = wrong-class items matching all conditions	   	 *//*	Errors[d] = wrong-class items matching all but cond d	   	 *//*									 *//*  This routine is critical to the efficiency of rule pruning. It	 *//*  computes the information above in one pass through the data,	 *//*  looking at cases that fail to satisfy at most one of the		 *//*  non-deleted conditions						 *//*									 *//*************************************************************************/    FindTables(NCond, TargetClass)/*  -----------  */    short NCond;    ClassNo TargetClass;{    ItemNo i;    short Misses, Missed[2], d;    Boolean CorrectClass;    /*  Clear distributions  */    ForEach(d, 0, NCond)    {	Total[d] = Errors[d] = 0;    }    /*  Set distributions  */    ForEach(i, 0, MaxItem)    {	Misses = 0;	CorrectClass = IsTarget(Item[i]);	for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ )	{	    if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] )	    {		Missed[Misses++] = d;	    }	}	if ( ! Misses )	{	    UpdateCount(Total, Errors, 0, CorrectClass);	}	else	if ( Misses == 1 )	{	    UpdateCount(Total, Errors, Missed[0], CorrectClass);	}    }    /*  Adjust counts to reflect cases that met all conditions  */    ForEach(d, 1, NCond)    {	if ( ! Deleted[d] )	{	    Total[d] += Total[0];	    Errors[d] += Errors[0];	}    }}/*************************************************************************//*									 *//*  Increment the counts Total[d] and Errors[d]				 *//*									 *//*************************************************************************/    UpdateCount(T, E, d, OK)/*  -----------  */    ItemNo T[], E[];    short d;    Boolean OK;{    T[d]++;    if ( ! OK ) E[d]++;}/*************************************************************************//*									 *//*  Determine whether the given case description satisfies the given	 *//*  condition.								 *//*									 *//*************************************************************************/Boolean Satisfies(CaseDesc, OneCond)/*      ---------  */    Description CaseDesc;     Condition OneCond;{    DiscrValue v;    float cv;    Test t;    short s;    Boolean Outcome;    t = OneCond->CondTest;    /*  Determine the outcome of this test on this item  */    switch ( t->NodeType )    {	case BrDiscr:  /* test of discrete attribute */	    v = DVal(CaseDesc, t->Tested);	    Outcome = ( v == 0 ? -1 : v );	    break;	case ThreshContin:  /* test of continuous attribute */	    cv = CVal(CaseDesc, t->Tested);	    Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 );	    break;	case BrSubset:  /* subset test on discrete attribute  */	    v = DVal(CaseDesc, t->Tested);	    Outcome = -1;	    ForEach(s, 1, t->Forks)	    {		if ( In(v, t->Subset[s]) )		{		    Outcome = s;		    break;		}	    }    }    return ( Outcome == OneCond->TestValue );}/*************************************************************************//*									 *//*  Hypergeometric distribution	(uses tabulated log factorials)		 *//*									 *//*************************************************************************/double Hypergeom(a, r, A, B)/*               ---------  */    int a, r, A, B;{    return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] -	        ( LogFact[a] + LogFact[r-a] + LogFact[A-a]		  + LogFact[B-(r-a)] + LogFact[A+B]) );}/*************************************************************************//*									 *//*  TableProb examines the 2x2 contingency table t and computes the      *//*  probability that a random division could have produced a split at	 *//*  least as extreme as this.  Also known as "Fisher's Exact Test",	 *//*  after its inventor, R.A. Fisher.					 *//*									 *//*************************************************************************/float TableProb(t11, t12, t21, t22)/*    ---------  */    int t11, t12, t21, t22;{    double Sum=0.0;    int A, B, r, a, k, a0;    /*  First, rearrange the rows and columns of the table to get it into	canonical form  */    if ( t11 + t12 > t21 + t22 )    {	A = t11 + t12;	B = t21 + t22;	if ( t11 * (t21 + t22) > t21 * (t11 + t12) )	{	    a0 = t11;	    r  = t11 + t21;	}	else	{	    a0 = t12;	    r  = t12 + t22;	}    }    else    {	A = t21 + t22;	B = t11 + t12;	if ( t21 * (t11 + t12) > t11 * (t21 + t22) )	{	    a0 = t21;	    r  = t21 + t11;	}	else	{	    a0 = t22;	    r  = t22 + t12;	}    }    /*  Now compute the probability  */    k = Min(r, A);    ForEach(a, a0, k)    {	Sum += Hypergeom(a, r, A, B);    }    return Sum;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -