
c4.5gt.c

An implementation of the C4.5 decision-tree algorithm in C. Hope it is useful to everyone.
Language: C
Page 1 of 5
/*  (This page begins mid-file: the block below is the tail of a function
    whose opening is on the preceding page; in the C4.5 sources it appears
    to be FormInitialWindow(), which shuffles the items and then moves
    TargetClassFreq[c] items of each class to the front of the window.)  */

{
    ItemNo i, Start=0, More;
    ClassNo c;
    void Swap();

    Shuffle();

    ForEach(c, 0, MaxClass)
    {
        More = TargetClassFreq[c];

        for ( i = Start ; More ; i++ )
        {
            if ( Class(Item[i]) == c )
            {
                Swap(Start, i);
                Start++;
                More--;
            }
        }
    }
}

/*************************************************************************/
/*                                                                       */
/*              Shuffle the data items randomly                          */
/*                                                                       */
/*************************************************************************/

    Shuffle()
/*  -------  */
{
    ItemNo This, Alt, Left;
    Description Hold;

    This = 0;
    for ( Left = MaxItem+1 ; Left ; )
    {
        /*  Random is a macro from the C4.5 headers (defns.i) yielding
            a uniform value in [0, 1)  */
        Alt = This + (Left--) * Random;

        Hold = Item[This];
        Item[This++] = Item[Alt];
        Item[Alt] = Hold;
    }
}

/*************************************************************************/
/*                                                                       */
/*  Grow a tree iteratively with initial window size Window and          */
/*  initial window increment IncExceptions.                              */
/*                                                                       */
/*  Construct a classifier tree using the data items in the              */
/*  window, then test for the successful classification of other         */
/*  data items by this tree.  If there are misclassified items,          */
/*  put them immediately after the items in the window, increase         */
/*  the size of the window and build another classifier tree, and        */
/*  so on until we have a tree which successfully classifies all         */
/*  of the test items or no improvement is apparent.                     */
/*                                                                       */
/*  On completion, return the tree which produced the least errors.     */
/*                                                                       */
/*************************************************************************/

Tree Iterate(Window, IncExceptions)
/*   -------  */
    ItemNo Window, IncExceptions;
{
    Tree Classifier, BestClassifier=Nil, FormTree();
    ItemNo i, Errors, TotalErrors, BestTotalErrors=MaxItem+1,
           Exceptions, Additions;
    ClassNo Assigned, Category();
    short Cycle=0;
    void Swap();

    printf("Cycle   Tree    -----Cases----");
    printf("    -----------------Errors-----------------\n");
    printf("        size    window   other");
    printf("    window  rate   other  rate   total  rate\n");
    printf("-----   ----    ------  ------");
    printf("    ------  ----  ------  ----  ------  ----\n");

    do
    {
        /*  Build a classifier tree with the first Window items  */

        InitialiseWeights();
        AllKnown = true;
        Classifier = FormTree(0, Window-1);

        /*  Error analysis  */

        Errors = Round(Classifier->Errors);

        /*  Move all items that are incorrectly classified by the
            classifier tree to immediately after the items in the
            current window.  */

        Exceptions = Window;
        ForEach(i, Window, MaxItem)
        {
            Assigned = Category(Item[i], Classifier);
            if ( Assigned != Class(Item[i]) )
            {
                Swap(Exceptions, i);
                Exceptions++;
            }
        }
        Exceptions -= Window;
        TotalErrors = Errors + Exceptions;

        /*  Print error analysis  */

        printf("%3d  %7d  %8d  %6d  %8d%5.1f%%  %6d%5.1f%%  %6d%5.1f%%\n",
               ++Cycle, TreeSize(Classifier), Window, MaxItem-Window+1,
               Errors, 100*(float)Errors/Window,
               Exceptions, 100*Exceptions/(MaxItem-Window+1.001),
               TotalErrors, 100*TotalErrors/(MaxItem+1.0));

        /*  Keep track of the most successful classifier tree so far  */

        if ( ! BestClassifier || TotalErrors < BestTotalErrors )
        {
            if ( BestClassifier ) ReleaseTree(BestClassifier);
            BestClassifier = Classifier;
            BestTotalErrors = TotalErrors;
        }
        else
        {
            ReleaseTree(Classifier);
        }

        /*  Increment window size  */

        Additions = Min(Exceptions, IncExceptions);
        Window = Min(Window + Max(Additions, Exceptions / 2), MaxItem + 1);
    }
    while ( Exceptions );

    return BestClassifier;
}

/*************************************************************************/
/*                                                                       */
/*      Print report of errors for each of the trials                    */
/*                                                                       */
/*************************************************************************/

    Evaluate(CMInfo, Saved)
/*  --------  */
    Boolean CMInfo;
    short Saved;
{
    ClassNo RealClass, PrunedClass, Category();
    short t;
    ItemNo *ConfusionMat, i, RawErrors, PrunedErrors;

    if ( CMInfo )
    {
        ConfusionMat = (ItemNo *) calloc((MaxClass+1)*(MaxClass+1), sizeof(ItemNo));
    }

    printf("\n");

    if ( TRIALS > 1 )
    {
        printf("Trial\t Before Pruning           After Pruning\n");
        printf("-----\t----------------   ---------------------------\n");
    }
    else
    {
        printf("\t Before Pruning           After Pruning\n");
        printf("\t----------------   ---------------------------\n");
    }
    printf("\tSize      Errors   Size      Errors   Estimate\n\n");

    ForEach(t, 0, TRIALS-1)
    {
        RawErrors = PrunedErrors = 0;

        ForEach(i, 0, MaxItem)
        {
            RealClass = Class(Item[i]);

            if ( Category(Item[i], Raw[t]) != RealClass ) RawErrors++;

            PrunedClass = Category(Item[i], Pruned[t]);

            if ( PrunedClass != RealClass ) PrunedErrors++;

            if ( CMInfo && t == Saved )
            {
                ConfusionMat[RealClass*(MaxClass+1)+PrunedClass]++;
            }
        }

        if ( TRIALS > 1 )
        {
            printf("%4d", t);
        }

        printf("\t%4d  %3d(%4.1f%%)   %4d  %3d(%4.1f%%)    (%4.1f%%)%s\n",
               TreeSize(Raw[t]), RawErrors, 100.0*RawErrors / (MaxItem+1.0),
               TreeSize(Pruned[t]), PrunedErrors, 100.0*PrunedErrors / (MaxItem+1.0),
               100 * Pruned[t]->Errors / Pruned[t]->Items,
               ( t == Saved ? "   <<" : "" ));
    }

    if ( CMInfo )
    {
        PrintConfusionMatrix(ConfusionMat);
        free(ConfusionMat);
    }
}
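The comment block above Iterate() describes C4.5's windowing scheme: train on a window of cases, move the misclassified remainder to just after the window, grow the window, and repeat until no exceptions remain. Below is a minimal standalone sketch of that loop with the tree learner replaced by a trivial majority-class stub; the dataset, constants, and helper names are invented for illustration and are not part of this file.

/*  windowing_sketch.c: a toy version of Iterate()'s windowing loop.
    The "classifier" is just the majority class of the current window.  */

#include <stdio.h>

#define NITEMS         12
#define NCLASS         3
#define INC_EXCEPTIONS 2              /* stands in for IncExceptions */

static int ClassOf[NITEMS] = {0,1,0,2,1,0,0,2,1,0,1,0};

/*  "Train": return the majority class of items 0 .. Window-1  */
static int Majority(int Window)
{
    int Freq[NCLASS] = {0}, Best = 0, i;

    for ( i = 0 ; i < Window ; i++ ) Freq[ClassOf[i]]++;
    for ( i = 1 ; i < NCLASS ; i++ ) if ( Freq[i] > Freq[Best] ) Best = i;
    return Best;
}

int main(void)
{
    int Window = 4, Exceptions, Additions, Grow, Model, i, Hold;

    do
    {
        Model = Majority(Window);     /* stands in for FormTree() */

        /*  Move misclassified items to just after the window,
            exactly as Iterate() does with Swap()  */
        Exceptions = Window;
        for ( i = Window ; i < NITEMS ; i++ )
        {
            if ( ClassOf[i] != Model )
            {
                Hold = ClassOf[Exceptions];
                ClassOf[Exceptions] = ClassOf[i];
                ClassOf[i] = Hold;
                Exceptions++;
            }
        }
        Exceptions -= Window;

        printf("window %2d  model %d  exceptions %d\n",
               Window, Model, Exceptions);

        /*  Same growth rule as Iterate():
            Window += Max(Min(Exceptions, IncExceptions), Exceptions/2)  */
        Additions = Exceptions < INC_EXCEPTIONS ? Exceptions : INC_EXCEPTIONS;
        Grow = Additions > Exceptions / 2 ? Additions : Exceptions / 2;
        Window = Window + Grow > NITEMS ? NITEMS : Window + Grow;
    }
    while ( Exceptions );

    return 0;
}

Compiled with any C compiler, this prints the window size and exception count per cycle, mirroring the progress report that Iterate() formats above.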
/*************************************************************************/
/*                                                                       */
/*    Central tree-forming algorithm incorporating all criteria          */
/*    ---------------------------------------------------------          */
/*                                                                       */
/*************************************************************************/

ItemCount
        *Weight,        /* Weight[i]  = current fraction of item i */
        **Freq,         /* Freq[x][c] = no. items of class c with outcome x */
        *ValFreq,       /* ValFreq[x]   = no. items with outcome x */
        *ClassFreq;     /* ClassFreq[c] = no. items of class c */

float
        *Gain,          /* Gain[a] = info gain by split on att a */
        *Info,          /* Info[a] = potential info of split on att a */
        *Bar,           /* Bar[a]  = best threshold for contin att a */
        *UnknownRate;   /* UnknownRate[a] = current unknown rate for att a */

Boolean
        *Tested,        /* Tested[a] set if att a has already been tested */
        MultiVal;       /* true when all atts have many values */

        /*  External variables initialised here  */

extern float
        *SplitGain,     /* SplitGain[i] = gain with att value of item i as threshold */
        *SplitInfo;     /* SplitInfo[i] = potential info ditto */

extern ItemCount
        *Slice1,        /* Slice1[c]    = saved values of Freq[x][c] in subset.c */
        *Slice2;        /* Slice2[c]    = saved values of Freq[y][c] */

extern Set
        **Subset;       /* Subset[a][s] = subset s for att a */

extern short
        *Subsets;       /* Subsets[a] = no. subsets for att a */

/*************************************************************************/
/*                                                                       */
/*              Allocate space for tree tables                           */
/*                                                                       */
/*************************************************************************/

    InitialiseTreeData()
/*  ------------------  */
{
    DiscrValue v;
    Attribute a;

    Tested      = (char *) calloc(MaxAtt+1, sizeof(char));

    Gain        = (float *) calloc(MaxAtt+1, sizeof(float));
    Info        = (float *) calloc(MaxAtt+1, sizeof(float));
    Bar         = (float *) calloc(MaxAtt+1, sizeof(float));

    Subset = (Set **) calloc(MaxAtt+1, sizeof(Set *));
    ForEach(a, 0, MaxAtt)
    {
        if ( MaxAttVal[a] )
        {
            Subset[a]  = (Set *) calloc(MaxDiscrVal+1, sizeof(Set));
            ForEach(v, 0, MaxAttVal[a])
            {
                Subset[a][v] = (Set) malloc((MaxAttVal[a]>>3) + 1);
            }
        }
    }
    Subsets = (short *) calloc(MaxAtt+1, sizeof(short));

    SplitGain = (float *) calloc(MaxItem+1, sizeof(float));
    SplitInfo = (float *) calloc(MaxItem+1, sizeof(float));

    Weight = (ItemCount *) calloc(MaxItem+1, sizeof(ItemCount));

    Freq  = (ItemCount **) calloc(MaxDiscrVal+1, sizeof(ItemCount *));
    ForEach(v, 0, MaxDiscrVal)
    {
        Freq[v]  = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
    }

    ValFreq = (ItemCount *) calloc(MaxDiscrVal+1, sizeof(ItemCount));
    ClassFreq = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));

    Slice1 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));
    Slice2 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));

    UnknownRate = (float *) calloc(MaxAtt+1, sizeof(float));

    /*  Check whether all attributes have many discrete values  */

    MultiVal = true;
    if ( ! SUBSET )
    {
        for ( a = 0 ; MultiVal && a <= MaxAtt ; a++ )
        {
            if ( SpecialStatus[a] != IGNORE )
            {
                MultiVal = MaxAttVal[a] >= 0.3 * (MaxItem + 1);
            }
        }
    }
}

/*************************************************************************/
/*                                                                       */
/*              Initialise the weight of each item                       */
/*                                                                       */
/*************************************************************************/

    InitialiseWeights()
/*  -----------------  */
{
    ItemNo i;

    ForEach(i, 0, MaxItem)
    {
        Weight[i] = 1.0;
    }
}

/*************************************************************************/
/*                                                                       */
/*  Build a decision tree for the cases Fp through Lp:                   */
/*                                                                       */
/*  - if all cases are of the same class, the tree is a leaf and so      */
/*      the leaf is returned labelled with this class                    */
/*                                                                       */
/*  - for each attribute, calculate the potential information provided   */
/*      by a test on the attribute (based on the probabilities of each   */
/*      case having a particular value for the attribute), and the gain  */
/*      in information that would result from a test on the attribute    */
/*      (based on the probabilities of each case with a particular       */
/*      value for the attribute being of a particular class)             */
/*                                                                       */
/*  - on the basis of these figures, and depending on the current        */
/*      selection criterion, find the best attribute to branch on.       */
/*      Note:  this version will not allow a split on an attribute       */
/*      unless two or more subsets have at least MINOBJS items.          */
/*                                                                       */
/*  - try branching and test whether better than forming a leaf          */
/*                                                                       */
/*************************************************************************/

Tree FormTree(Fp, Lp)
/*   ---------  */
    ItemNo Fp, Lp;
{
    ItemNo i, Kp, Ep, Group();
    ItemCount Cases, NoBestClass, KnownCases, CountItems();
    float Factor, BestVal, Val, AvGain=0, Worth();
    Attribute Att, BestAtt, Possible=0;
    ClassNo c, BestClass;
    Tree Node, Leaf();
    DiscrValue v;
    Boolean PrevAllKnown;

    Cases = CountItems(Fp, Lp);

    /*  Generate the class frequency distribution  */

    ForEach(c, 0, MaxClass)
    {
        ClassFreq[c] = 0;
    }
    ForEach(i, Fp, Lp)
    {
        ClassFreq[ Class(Item[i]) ] += Weight[i];
    }

    /*  Find the most frequent class  */

    BestClass = 0;
    ForEach(c, 0, MaxClass)
    {
        if ( ClassFreq[c] > ClassFreq[BestClass] )
        {
            BestClass = c;
        }
    }
    NoBestClass = ClassFreq[BestClass];

    Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass);

    /*  If all cases are of the same class or there are not enough
        cases to divide, the tree is a leaf  */

    if ( NoBestClass == Cases  || Cases < 2 * MINOBJS )
    {
        return Node;
    }

    Verbosity(1)
        printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases);

    /*  For each available attribute, find the information and gain  */

    ForEach(Att, 0, MaxAtt)
    {
        Gain[Att] = -Epsilon;

        if ( SpecialStatus[Att] == IGNORE ) continue;

        if ( MaxAttVal[Att] )
        {
            /*  discrete valued attribute  */

            if ( SUBSET && MaxAttVal[Att] > 2 )
            {
                EvalSubset(Att, Fp, Lp, Cases);
            }
            else
            if ( ! Tested[Att] )
            {
                EvalDiscreteAtt(Att, Fp, Lp, Cases);
            }
        }
        else
        {
            /*  continuous attribute  */

            EvalContinuousAtt(Att, Fp, Lp);
        }

        /*  Update average gain, excluding attributes with very many values  */

        if ( Gain[Att] > -Epsilon &&
             ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxItem + 1) ) )
        {
            Possible++;
            AvGain += Gain[Att];
        }
    }

    /*  Find the best attribute according to the given criterion  */

    BestVal = -Epsilon;
    BestAtt = None;
    AvGain  = ( Possible ? AvGain / Possible : 1E6 );

    Verbosity(2)
    {
        if ( AvGain < 1E6 ) printf("\taverage gain %.3f\n", AvGain);
    }

    ForEach(Att, 0, MaxAtt)
    {
        if ( Gain[Att] > -Epsilon )
        {
            Val = Worth(Info[Att], Gain[Att], AvGain);
            if ( Val > BestVal )
            {
                BestAtt  = Att;
                BestVal = Val;
            }
        }
    }

    /*  Decide whether to branch or not  */

    if ( BestAtt != None )
    {
        Verbosity(1)
        {
            printf("\tbest attribute %s", AttName[BestAtt]);
            if ( ! MaxAttVal[BestAtt] )
            {
                printf(" cut %.3f", Bar[BestAtt]);
            }
            printf(" inf %.3f gain %.3f val %.3f\n",
                   Info[BestAtt], Gain[BestAtt], BestVal);
        }

        /*  Build a node of the selected test  */

        if ( MaxAttVal[BestAtt] )
        {
            /*  Discrete valued attribute  */

            if ( SUBSET && MaxAttVal[BestAtt] > 2 )
            {
                SubsetTest(Node, BestAtt);
            }
            else
            {
                DiscreteTest(Node, BestAtt);
            }
        }
        else
        {
            /*  Continuous attribute  */

            ContinTest(Node, BestAtt);
        }

        /*  Remove unknown attribute values  */

        PrevAllKnown = AllKnown;

        Kp = Group(0, Fp, Lp, Node) + 1;
        if ( Kp != Fp ) AllKnown = false;

        KnownCases = Cases - CountItems(Fp, Kp-1);
        UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001);

        Verbosity(1)
        {
            if ( UnknownRate[BestAtt] > 0 )
            {
                printf("\tunknown rate for %s = %.3f\n",
                       AttName[BestAtt], UnknownRate[BestAtt]);
            }
        }

        /*  Recursive divide and conquer  */

        ++Tested[BestAtt];

        Ep = Kp - 1;
        Node->Errors = 0;

        ForEach(v, 1, Node->Forks)
        {
            Ep = Group(v, Kp, Lp, Node);

/*  (Listing truncated here: page 1 of 5 ends inside this ForEach loop;
    the recursive subtree construction continues on the next page.)  */
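The block comment above spells out the quantities FormTree() needs for each attribute: the information gain of a split (Gain[a]), the potential or split information (Info[a]), and the criterion Worth() that combines them (with C4.5's default gain-ratio criterion this is essentially Gain/Info for attributes with at least average gain; Worth() itself is not on this page). The following self-contained sketch computes those measures on a toy two-class, two-value frequency table; the dataset and names are invented for illustration.

/*  gain_sketch.c: the information measures behind FormTree()'s
    attribute selection, computed on a hard-coded toy table.  */

#include <stdio.h>
#include <math.h>

#define LOG2(x)  ( (x) > 0 ? log((double)(x)) / log(2.0) : 0.0 )

/*  Entropy of a class frequency distribution  */
static double Entropy(const double *Freq, int Classes, double Total)
{
    double E = 0.0, P;
    int c;

    for ( c = 0 ; c < Classes ; c++ )
    {
        if ( Freq[c] > 0 )
        {
            P = Freq[c] / Total;
            E -= P * LOG2(P);
        }
    }
    return E;
}

int main(void)
{
    /*  8 cases, 2 classes, one attribute with 2 values:
        value 0 covers 3 cases of class 0 and 1 of class 1,
        value 1 covers 1 case of class 0 and 3 of class 1  */
    double ClassFreq[2] = {4, 4};              /* like ClassFreq[c]  */
    double ValFreq[2]   = {4, 4};              /* like ValFreq[x]    */
    double Freq[2][2]   = { {3, 1}, {1, 3} };  /* like Freq[x][c]    */
    double Total = 8.0, Base, PostSplit = 0.0, SplitInfo = 0.0, Gain;
    int v;

    Base = Entropy(ClassFreq, 2, Total);       /* info before the split */

    for ( v = 0 ; v < 2 ; v++ )
    {
        PostSplit += (ValFreq[v] / Total) * Entropy(Freq[v], 2, ValFreq[v]);
        SplitInfo -= (ValFreq[v] / Total) * LOG2(ValFreq[v] / Total);
    }
    Gain = Base - PostSplit;                   /* plays the role of Gain[a] */

    /*  SplitInfo plays the role of Info[a]; the gain ratio is their quotient  */
    printf("gain %.3f  split info %.3f  gain ratio %.3f\n",
           Gain, SplitInfo, Gain / SplitInfo);
    return 0;
}

For this table it prints gain 0.189, split info 1.000, gain ratio 0.189; the split information term is what penalises attributes with many values, which is why FormTree() tracks both Gain[a] and Info[a] separately.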
