⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 prune.c

📁 C4.5决策树算法C语言实现
💻 C
字号:
/*************************************************************************//*									 *//*	Prune a decision tree and predict its error rate		 *//*	------------------------------------------------		 *//*									 *//*************************************************************************/#include "defns.i"#include "types.i"#include "extern.i"
#include <malloc.h>#include <memory.h>
extern	ItemCount	*Weight;Set	*PossibleValues=Nil;Boolean	Changed;#define	LocalVerbosity(x)	if (Sh >= 0 && VERBOSITY >= x)#define	Intab(x)		Indent(x, "| ") void   CheckPossibleValues(Tree T);void    InitialiseWeights();
float EstimateErrors(Tree T,ItemNo Fp,ItemNo Lp,short Sh,Boolean UpdateTree);void    Indent(short Sh,char * Mark);

ItemCount CountItems(ItemNo Fp,ItemNo Lp);
float AddErrs(ItemCount N,ItemCount e);
ItemNo Group(DiscrValue V,ItemNo Fp,ItemNo Lp,Tree TestNode);/*************************************************************************//*									 *//*  Prune tree T, returning true if tree has been modified		 *//*									 *//*************************************************************************/Boolean Prune(Tree T)/*      -----  */    {    ItemNo i;
    Attribute a;
	
    InitialiseWeights();
    AllKnown = true;
	
    Verbosity(1) printf("\n");
	
    Changed = false;
	
    EstimateErrors(T, 0, MaxItem, 0, true);
	
    if ( SUBSET )
    {
		if ( ! PossibleValues )
		{
			PossibleValues = (Set *) calloc(MaxAtt+1, sizeof(Set));
		}
		
		ForEach(a, 0, MaxAtt)
		{
			if ( MaxAttVal[a] )
			{
				PossibleValues[a] = (Set) malloc((MaxAttVal[a]>>3) + 1);
				ClearBits((MaxAttVal[a]>>3) + 1, PossibleValues[a]);
				ForEach(i, 1, MaxAttVal[a])
				{
					SetBit(i, PossibleValues[a]);
				}
			}
		}
		
		CheckPossibleValues(T);
    }
	
    return Changed;
}/*************************************************************************//*									 *//*	Estimate the errors in a given subtree				 *//*									 *//*************************************************************************/float EstimateErrors(Tree T,ItemNo Fp,ItemNo Lp,short Sh,Boolean UpdateTree)/*    --------------  */{     ItemNo i, Kp, Ep ;
    ItemCount Cases, KnownCases, *LocalClassDist, TreeErrors, LeafErrors,
		ExtraLeafErrors, BranchErrors, Factor, MaxFactor;
    DiscrValue v, MaxBr;
    ClassNo c, BestClass;
    Boolean PrevAllKnown;
	
    /*  Generate the class frequency distribution  */
	
    Cases = CountItems(Fp, Lp);
    LocalClassDist = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
	
    ForEach(i, Fp, Lp)
    { 
		LocalClassDist[ Class(Item[i]) ] += Weight[i];
    } 
	
    /*  Find the most frequent class and update the tree  */
	
    BestClass = T->Leaf;
    ForEach(c, 0, MaxClass)
    {
		if ( LocalClassDist[c] > LocalClassDist[BestClass] )
		{
			BestClass = c;
		}
    }
    LeafErrors = Cases - LocalClassDist[BestClass];
    ExtraLeafErrors = AddErrs(Cases, LeafErrors);
	
    if ( UpdateTree )
    {
		T->Items = Cases;
		T->Leaf  = BestClass;
		memcpy(T->ClassDist, LocalClassDist, (MaxClass + 1) * sizeof(ItemCount));
    }
	
    if ( ! T->NodeType )	/*  leaf  */
    {
		TreeErrors = LeafErrors + ExtraLeafErrors;
		
		if ( UpdateTree )
		{
			T->Errors = TreeErrors;
			
			LocalVerbosity(1)
			{
				Intab(Sh);
				printf("%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
					T->Items, LeafErrors, T->Errors);
			}
		}
		
		free(LocalClassDist);
		
		return TreeErrors;
    }
	
    /*  Estimate errors for each branch  */
	
    Kp = Group(0, Fp, Lp, T) + 1;
    KnownCases = CountItems(Kp, Lp);
	
    PrevAllKnown = AllKnown;
    if ( Kp != Fp ) AllKnown = false;
	
    TreeErrors = MaxFactor = 0;
	
    ForEach(v, 1, T->Forks)
    {
		Ep = Group(v, Kp, Lp, T);
		
		if ( Kp <= Ep )
		{
			Factor = CountItems(Kp, Ep) / KnownCases;
			
			if ( Factor >= MaxFactor )
			{
				MaxBr = v;
				MaxFactor = Factor;
			}
			
			ForEach(i, Fp, Kp-1)
			{
				Weight[i] *= Factor;
			}
			
			TreeErrors += EstimateErrors(T->Branch[v], Fp, Ep,(short)( Sh+1), UpdateTree);
			
			Group(0, Fp, Ep, T);
			ForEach(i, Fp, Kp-1)
			{
				Weight[i] /= Factor;
			}
		}
    }
	
    AllKnown = PrevAllKnown;
	
    if ( ! UpdateTree )
    {
		free(LocalClassDist);
		
		return TreeErrors;
    }
	
    /*  See how the largest branch would fare  */
	
    BranchErrors = EstimateErrors(T->Branch[MaxBr], Fp, Lp, -1000, false);
	
    LocalVerbosity(1)
    {
        Intab(Sh);
        printf("%s:  [%d%%  N=%.2f  tree=%.2f  leaf=%.2f+%.2f  br[%d]=%.2f]\n",
			AttName[T->Tested],
			(int) ((TreeErrors * 100) / (T->Items + 0.001)),
			T->Items, TreeErrors, LeafErrors, ExtraLeafErrors,
			MaxBr, BranchErrors);
    }
	
    /*  See whether tree should be replaced with leaf or largest branch  */
	
    if ( LeafErrors + ExtraLeafErrors <= BranchErrors + 0.1 &&
		LeafErrors + ExtraLeafErrors <= TreeErrors + 0.1 )
    {
		LocalVerbosity(1)
		{
			Intab(Sh);
			printf("Replaced with leaf %s\n", ClassName[T->Leaf]);
		}
		
		T->NodeType = 0;
		T->Errors = LeafErrors + ExtraLeafErrors;
		Changed = true;
    }
    else
		if ( BranchErrors <= TreeErrors + 0.1 )
		{
			LocalVerbosity(1)
			{
				Intab(Sh);
				printf("Replaced with branch %d\n", MaxBr);
			}
			
			AllKnown = PrevAllKnown;
			EstimateErrors(T->Branch[MaxBr], Fp, Lp, Sh, true);
			memcpy((char *) T, (char *) T->Branch[MaxBr], sizeof(TreeRec));
			Changed = true;
		}
		else
		{
			T->Errors = TreeErrors;
		}
		
		AllKnown = PrevAllKnown;
		free(LocalClassDist);
		
		return T->Errors;
}/*************************************************************************//*									 *//*	Remove unnecessary subset tests on missing values		 *//*									 *//*************************************************************************/ void   CheckPossibleValues(Tree T)/*  -------------------  */{    Set HoldValues;    int v, Bytes, b;    Attribute A;    char Any=0;    if ( T->NodeType == BrSubset )    {		A = T->Tested;
		
		Bytes = (MaxAttVal[A]>>3) + 1;
		HoldValues = (Set) malloc(Bytes);
		
		/*  See if last (default) branch can be simplified or omitted  */
		
		ForEach(b, 0, Bytes-1)
		{
			T->Subset[T->Forks][b] &= PossibleValues[A][b];
			Any |= T->Subset[T->Forks][b];
		}
		
		if ( ! Any )
		{
			T->Forks--;
		}
		
		/*  Process each subtree, leaving only values in branch subset  */
		
		CopyBits(Bytes, PossibleValues[A], HoldValues);
		
		ForEach(v, 1, T->Forks)
		{
			CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
			
			CheckPossibleValues(T->Branch[v]);
		}
		
		CopyBits(Bytes, HoldValues, PossibleValues[A]);
		
		free(HoldValues);
    }
    else
		if ( T->NodeType )
		{
			ForEach(v, 1, T->Forks)
			{
				CheckPossibleValues(T->Branch[v]);
			}
		}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -