
prunerule.cpp

Decision-tree classification training experiment; rule-pruning code derived from C4.5.
Language: C++
/*************************************************************************/
/*								   	 */
/*	Pruning single rules					   	 */
/*	--------------------				   	      	 */
/*								   	 */
/*************************************************************************/
#include "stdafx.h"
#include "MyBase.h"

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

extern FILE *fLog;
/*********************************************/
/*  This is from Ruleinex.i  */

extern PR	*Rule;		/* production rules */

extern RuleNo	NRules,		/* number of production rules */
		*RuleIndex;	/* index to production rules */

extern short	RuleSpace;	/* space currently allocated for rules */

extern RuleSet	*PRSet;		/* set of rulesets */

extern ClassNo	DefaultClass;	/* default class associated with ruleset */

extern BOOL	SIGTEST,	/* use Fisher's test in rule pruning */
		SIMANNEAL;	/* use simulated annealing */

extern float	SIGTHRESH,	/* sig level used in rule pruning */
		REDUNDANCY,	/* factor governing encoding tradeoff
				   between rules and exceptions */
		AttTestBits,	/* average bits to encode tested attribute */
		*BranchBits;	/* ditto attribute value */

extern float	*LogItemNo;	/* LogItemNo[i] = log2(i) */
extern double	*LogFact;	/* LogFact[i] = log2(i!) */
/*************************************************/

/*  External data structures used in building rules  */

extern ItemNo	*TargetClassFreq,	/* [Boolean] */
		*Errors,		/* [Condition] */
		*Total;			/* [Condition] */

extern float	*Pessimistic,		/* [Condition] */
		*Actual,		/* [Condition] */
		*CondSigLevel;		/* [Condition] */

extern Boolean	**CondSatisfiedBy,	/* [Condition][ItemNo] */
		*Deleted;

#define Before(n1,n2)  (n1->Tested < n2->Tested ||\
			n1->NodeType < n2->NodeType ||\
		        n1->Tested == n2->Tested && n1->Cut < n2->Cut)

#define IsTarget(case)  (Class(case) == TargetClass ? 1 : 0)



/*************************************************************************/
/*									 */
/*  Prune the rule given by the conditions Cond, and the number of	 */
/*  conditions NCond, and add the resulting rule to the current		 */
/*  ruleset if it is sufficiently accurate				 */
/*									 */
/*************************************************************************/
void PruneRule(Condition Cond[], short NCond, ClassNo TargetClass)
{
    short d, dd, id, Bestd, Bestid, Remaining=NCond;
    float DefaultError, Extra;
    BOOL Alter;
    Condition Hold;
    ItemNo i;

    ForEach(d, 0, NCond)
    {
		Deleted[d] = false;
    }

    /*  Evaluate the satisfaction matrix  */

    TargetClassFreq[0] = TargetClassFreq[1] = 0;

    ForEach(i, 0, MaxItem)
    {
		ForEach(d, 1, NCond)
		{
			CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]);
		}
		TargetClassFreq[IsTarget(Item[i])]++;
    }

    DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0);

    /*  Find conditions to delete  */

    Verbosity(1) fprintf(fLog,"\n  pruning rule for %s", ClassName[TargetClass]);

    do
	{
		Alter = false;

		FindTables(NCond, TargetClass);

		/*  Find the condition whose deletion would most improve
		    the accuracy of the rule.
		    Notes: The pessimistic error rate, not the actual error
			   rate, is used here (see the illustrative sketch of
			   the upper confidence limit after this function).
			   When d is 0, we are dealing with all conditions.  */

		Bestd = id = 0;

		Verbosity(1)
			fprintf(fLog,"\n     Err Used   Pess\tAbsent condition\n");

		ForEach(d, 0, NCond)
		{
			if ( Deleted[d] ) continue;

			if ( Total[d] )
			{
				Actual[d] = Errors[d] / (float) Total[d];
				Extra = AddErrs((float) Total[d], (float) Errors[d]);
				Pessimistic[d] = (Errors[d] + Extra) / Total[d];
			}
			else
			{
				Actual[d] = 0;
				Pessimistic[d] = DefaultError;
			}

			Verbosity(1)
			fprintf(fLog,"   %5d%5d  %4.1f%%",
				   Errors[d], Total[d], 100 * Pessimistic[d]);

			if ( ! d )
			{
				Verbosity(1) fprintf(fLog,"\t<base rule>\n");
			}
			else
			{
				id++;

				/*  If the significance testing option is used, invoke
					Fisher's exact test here to assess the probability
					that the effect of condition d arises by chance.  */

				if ( SIGTEST )
				{
					CondSigLevel[d] =
					TableProb(Errors[0],
						   Errors[d]-Errors[0],
						   Total[0]-Errors[0],
							   Total[d]-Total[0]-Errors[d]+Errors[0]);

					Verbosity(1) fprintf(fLog,"  Sig=%.3f", CondSigLevel[d]);
				}

				Verbosity(1) PrintCondition(Cond[d]);

				/*  Bestd identifies the condition with lowest pessimistic
					error  estimate  */

				if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] )
				{
					Bestd = d;
					Bestid = id;
				}

				/*  Alter is set true if we are going to drop a condition
					(either because we get lower pessimistic est, or
					because one of the conditions fails a significance test)  */

				if ( Pessimistic[d] <= Pessimistic[0] ||
					 Actual[d] <= Actual[0]  ||
					 SIGTEST && CondSigLevel[d] > SIGTHRESH )
				{
					Alter = true;
				}
			}
		}

		if ( Alter )
		{
			Verbosity(1) fprintf(fLog,"\teliminate test %d\n", Bestid);

			Deleted[Bestd] = true;
			Remaining--;
		}
	} while ( Alter && Remaining );

    if ( ! Remaining || ! Total[0] )
    {
		return;
    }

    if ( Pessimistic[0] >= DefaultError )
    {
		Verbosity(1) fprintf(fLog,"\ttoo inaccurate\n");
		return;
    }

    /*  Sort the conditions  */

    ForEach(d, 1, Remaining)
    {
		dd =  0;
		ForEach(id, d, NCond)
		{
			if ( ! Deleted[id] && ( ! dd ||
			   Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) )
			{
				dd = id;
			}
		}
		if ( dd != d )
		{
			Hold    = Cond[d];
			Cond[d] = Cond[dd];
			Cond[dd] = Hold;
			Deleted[dd] = Deleted[d];
		}
		Deleted[d] = true;
    }

    NewRule(Cond, Remaining, TargetClass, Pessimistic[0]);
}
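

/*************************************************************************/
/*									 */
/*  Note: the pessimistic estimates above rely on AddErrs(N, E), which	 */
/*  is defined elsewhere in the C4.5 sources and returns the extra	 */
/*  errors implied by the upper confidence limit of the binomial error	 */
/*  rate.  The sketch below is NOT that implementation (C4.5 uses a	 */
/*  small table of normal deviates); it only illustrates the idea,	 */
/*  assuming a one-sided Wilson upper bound at the usual 25% confidence	 */
/*  level.  The name ApproxAddErrs is hypothetical.			 */
/*									 */
/*************************************************************************/

#include <math.h>

static float ApproxAddErrs(float N, float E)
{
    const double z = 0.6745;	/* 75th percentile of N(0,1), i.e. CF = 25% */
    double f, upper;

    if ( N <= 0 ) return 0;

    f = E / N;			/* observed error rate */

    /*  One-sided Wilson score upper bound on the true error rate  */

    upper = ( f + z*z/(2*N) + z * sqrt( f*(1-f)/N + z*z/(4*N*N) ) )
	    / ( 1 + z*z/N );

    return (float) (upper * N - E);	/* errors beyond those observed */
}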



/*************************************************************************/
/*									 */
/*  See whether condition R is redundant                           	 */
/*									 */
/*************************************************************************/
BOOL Redundant(short R, Condition Cond[], short NCond)
{
    short d, v, vv;
    Test t, Rt;

    Rt = Cond[R]->CondTest;
    v =  Cond[R]->TestValue;

    ForEach(d, 1, NCond)
	{
		if ( Deleted[d] || d == R ) continue;

		t = Cond[d]->CondTest;
		vv = Cond[d]->TestValue;

		if ( t->Tested != Rt->Tested ) continue;

		switch ( t->NodeType )
		{
			case BrDiscr:  /* test of discrete attribute */

			return false;

			case ThreshContin:  /* test of continuous attribute */

			if ( vv == v &&
				 ( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) )
			{
				return true;
			}

			break;

			case BrSubset:  /* subset test on discrete attribute  */

			if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) )
			{
				return true;
			}
		}
	}

    return false;
}
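
/*  Worked example (illustrative): for continuous thresholds, TestValue 1
    selects the "<= Cut" branch and 2 the "> Cut" branch (see Satisfies
    below).  So if a rule contains both "X <= 3" and "X <= 5", the weaker
    condition "X <= 5" is reported as redundant; likewise "X > 4" is
    redundant in the presence of "X > 7".  For subset tests, condition R is
    redundant when another condition on the same attribute restricts the
    value to a subset of R's value set.  Single-branch tests on a discrete
    attribute are never treated as redundant here.  */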



/*************************************************************************/
/*									 */
/*  Decide whether subset S1 of values is contained in subset S2	 */
/*									 */
/*************************************************************************/
BOOL IsSubset(Set S1, Set S2, Attribute Att)
{
    DiscrValue v;

    ForEach(v, 1, MaxAttVal[Att])
    {
		if ( In(v, S1) && ! In(v, S2) ) return false;
    }

    return true;
}
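
/*  Example: for a colour attribute, IsSubset({red}, {red,green}, colour)
    is true, while IsSubset({red,blue}, {red,green}, colour) is false.
    This assumes the usual C4.5 representation of Set as a bit string, with
    In(v, S) testing membership of value v; the actual definitions live in
    the shared headers.  */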

/*************************************************************************/
/*									 */
/*  Find the frequency distribution tables for the current conditions: 	 */
/*									 */
/*	Total[0] = items matching all conditions		   	 */
/*	Total[d] = items matching all except condition d	   	 */
/*									 */
/*	Errors[0] = wrong-class items matching all conditions	   	 */
/*	Errors[d] = wrong-class items matching all but cond d	   	 */
/*									 */
/*  This routine is critical to the efficiency of rule pruning. It	 */
/*  computes the information above in one pass through the data,	 */
/*  looking at cases that fail to satisfy at most one of the		 */
/*  non-deleted conditions						 */
/*									 */
/*************************************************************************/
void  FindTables(short NCond, ClassNo TargetClass)
{
    ItemNo i;
    short Misses, Missed[2], d;
    Boolean CorrectClass;

    /*  Clear distributions  */

    ForEach(d, 0, NCond)
    {
		Total[d] = Errors[d] = 0;
    }

    /*  Set distributions  */

    ForEach(i, 0, MaxItem)
	{
		Misses = 0;
		CorrectClass = IsTarget(Item[i]);

		for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ )
		{
			if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] )
			{
				Missed[Misses++] = d;
			}
		}

		if ( ! Misses )
		{
			UpdateCount(Total, Errors, 0, CorrectClass);
		}
		else if ( Misses == 1 )
		{
			UpdateCount(Total, Errors, Missed[0], CorrectClass);
		}
	}

    /*  Adjust counts to reflect cases that met all conditions  */

    ForEach(d, 1, NCond)
    {
		if ( ! Deleted[d] )
		{
			Total[d] += Total[0];
			Errors[d] += Errors[0];
		}
    }
}
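
/*  Illustration of the counting scheme above: suppose two conditions C1
    and C2 remain.  During the pass, a case satisfying both contributes to
    Total[0]; a case failing only C1 contributes to Total[1]; a case
    failing only C2 to Total[2]; a case failing both is ignored.  After the
    final adjustment, Total[1] becomes (cases failing only C1) + (cases
    satisfying both), i.e. every case satisfying C2 regardless of C1, which
    is exactly the coverage PruneRule needs for the rule with C1 deleted.
    Errors[] is maintained in the same way for wrong-class cases.  */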



/*************************************************************************/
/*									 */
/*  Increment the counts Total[d] and Errors[d]				 */
/*									 */
/*************************************************************************/
void UpdateCount(ItemNo T[], ItemNo E[], short d, Boolean OK)
{
    T[d]++;
    if ( ! OK ) E[d]++;
}


/*************************************************************************/
/*									 */
/*  Determine whether the given case description satisfies the given	 */
/*  condition.								 */
/*									 */
/*************************************************************************/

BOOL Satisfies(Description CaseDesc, Condition OneCond)
{
    DiscrValue v;
    float cv;
    Test t;
    short s;
    Boolean Outcome;

    t = OneCond->CondTest;

    /*  Determine the outcome of this test on this item  */

    switch ( t->NodeType )
    {
	case BrDiscr:  /* test of discrete attribute */

	    v = DVal(CaseDesc, t->Tested);
	    Outcome = ( v == 0 ? -1 : v );
	    break;

	case ThreshContin:  /* test of continuous attribute */

	    cv = CVal(CaseDesc, t->Tested);
	    Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 );
	    break;

	case BrSubset:  /* subset test on discrete attribute  */

	    v = DVal(CaseDesc, t->Tested);
	    Outcome = -1;
	    ForEach(s, 1, t->Forks)
	    {
			if ( In(v, t->Subset[s]) )
			{
				Outcome = s;
				break;
			}
	    }
    }

    return ( Outcome == OneCond->TestValue );
}
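
/*  Example (hypothetical attribute): for a continuous test "Age <= 30",
    the outcome is encoded as 1 when Age <= 30, 2 when Age > 30, and -1
    when Age is unknown.  A condition with TestValue 2 is therefore
    satisfied by a case with Age = 45, but not by one with Age = 20 or
    with Age unknown.  Discrete and subset tests return the matching
    branch number, or -1 when the value is unknown or matches no subset.  */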



/*************************************************************************/
/*									 */
/*  Hypergeometric distribution	(uses tabulated log factorials)		 */
/*									 */
/*************************************************************************/


double Hypergeom(int a,int  r, int A, int B)
{
    return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] -
	        ( LogFact[a] + LogFact[r-a] + LogFact[A-a]
		  + LogFact[B-(r-a)] + LogFact[A+B]) );
}
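
/*  In closed form this is the hypergeometric probability

	P(a | r, A, B) = C(A,a) C(B,r-a) / C(A+B,r)

    i.e.  A! B! r! (A+B-r)! / ( a! (r-a)! (A-a)! (B-r+a)! (A+B)! ).

    Caveat: LogFact[] is declared above as holding log2(i!).  If it is in
    fact initialised with base-2 logarithms elsewhere in the program, a
    true probability would require a base-2 exponential (e.g.
    pow(2.0, ...)) rather than exp(); which applies depends on how LogFact
    is actually set up.  */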

/*************************************************************************/
/*									 */
/*  TableProb examines the 2x2 table (t11, t12, t21, t22) and computes the */
/*  probability that a random division could have produced a split at	 */
/*  least as extreme as this.  Also known as "Fisher's Exact Test",	 */
/*  after its inventor, R.A. Fisher.					 */
/*									 */
/*************************************************************************/


float TableProb(int t11, int t12, int t21, int t22)
{
    double Sum=0.0;
    int A, B, r, a, k, a0;

    /*  First, rearrange the rows and columns of the table to get it into
	canonical form  */

    if ( t11 + t12 > t21 + t22 )
    {
		A = t11 + t12;
		B = t21 + t22;

		if ( t11 * (t21 + t22) > t21 * (t11 + t12) )
		{
			a0 = t11;
			r  = t11 + t21;
		}
		else
		{
			a0 = t12;
			r  = t12 + t22;
		}
    }
    else
    {
		A = t21 + t22;
		B = t11 + t12;
		if ( t21 * (t11 + t12) > t11 * (t21 + t22) )
		{
			a0 = t21;
			r  = t21 + t11;
		}
		else
		{
			a0 = t22;
			r  = t22 + t12;
		}
    }

    /*  Now compute the probability  */

    k = Min(r, A);
    ForEach(a, a0, k)
    {
		Sum += Hypergeom(a, r, A, B);
    }

    return Sum;
}
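

/*************************************************************************/
/*									 */
/*  Standalone cross-check (not part of the original module): a minimal	 */
/*  sketch of the same right-tail summation using natural-log factorials */
/*  from lgamma(), so it needs no global LogFact table and sidesteps the */
/*  base-2 question noted at Hypergeom above.  It assumes a C99/C++11	 */
/*  maths library; the names HG and FisherRightTail are illustrative,	 */
/*  and the canonicalisation of the 2x2 table done by TableProb is not	 */
/*  repeated here.							 */
/*									 */
/*************************************************************************/

#include <math.h>

static double HG(int a, int r, int A, int B)
{
    /*  C(A,a) * C(B,r-a) / C(A+B,r), using lgamma(n+1) = ln(n!)  */

    return exp(  lgamma(A+1.0) + lgamma(B+1.0) + lgamma(r+1.0)
	       + lgamma(A+B-r+1.0)
	       - ( lgamma(a+1.0) + lgamma(r-a+1.0) + lgamma(A-a+1.0)
		 + lgamma(B-(r-a)+1.0) + lgamma(A+B+1.0) ) );
}

static double FisherRightTail(int a0, int r, int A, int B)
{
    double Sum = 0.0;
    int a, k;

    k = ( r < A ? r : A );

    for ( a = a0 ; a <= k ; a++ )
    {
	Sum += HG(a, r, A, B);
    }

    return Sum;
}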
