⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 build.cpp

📁 计算机人工智能方面的决策树方法 c4.5
💻 CPP
字号:
/*************************************************************************/
/*                                                                       */
/*    Central tree-forming algorithm incorporating all criteria          */
/*    ---------------------------------------------------------          */
/*                                                                       */
/*************************************************************************/


#include "defns.h"
#include "c45types.h"
#include "extern.h"


ItemCount
	*Weight,        /* Weight[i]  = current fraction of item i */
	**Freq,         /* Freq[x][c] = no. items of class c with outcome x */
	*ValFreq,       /* ValFreq[x]   = no. items with outcome x */
	*ClassFreq;     /* ClassFreq[c] = no. items of class c */

double
	*Gain,          /* Gain[a] = info gain by split on att a */
	*Info,          /* Info[a] = potential info of split on att a */
	*Bar,           /* Bar[a]  = best threshold for contin att a */
	*UnknownRate;   /* UnknownRate[a] = current unknown rate for att a */

char
	*Tested;        /* Tested[a] set if att a has already been tested */


	/*  External variables initialised here  */

extern double
	*SplitGain,     /* SplitGain[i] = gain with att value of item i as threshold */
	*SplitInfo;     /* SplitInfo[i] = potential info ditto */

extern ItemCount
	*Slice1,        /* Slice1[c]    = saved values of Freq[x][c] in subset.c */
	*Slice2;        /* Slice2[c]    = saved values of Freq[y][c] */

extern Set
	**Subset;       /* Subset[a][s] = subset s for att a */

extern int
	*Subsets;       /* Subsets[a] = no. subsets for att a */

extern FILE *fpScreen;

/*************************************************************************/
/*                                                                       */
/*              Allocate space for tree tables                           */
/*                                                                       */
/*************************************************************************/


void InitialiseTreeData()
/*  ------------------  */
{ 
    DiscrValue v;
    Attribute a;

    Tested      = (char *) calloc(MaxAtt+1, sizeof(char));

    Gain        = (double *) calloc(MaxAtt+1, sizeof(double));
    Info        = (double *) calloc(MaxAtt+1, sizeof(double));
    Bar         = (double *) calloc(MaxAtt+1, sizeof(double));

    Subset = (Set **) calloc(MaxAtt+1, sizeof(Set *));
    ForEach(a, 0, MaxAtt)
    {
	if ( MaxAttVal[a] )
	{
	    Subset[a]  = (Set *) calloc(MaxDiscrVal+1, sizeof(Set));
	    ForEach(v, 0, MaxAttVal[a])
	    {
		Subset[a][v] = (Set) malloc((MaxAttVal[a]>>3) + 1);
	    }
	}
    }
    Subsets = (int *) calloc(MaxAtt+1, sizeof(int));

    SplitGain = (double *) calloc(MaxItem+1, sizeof(double));
    SplitInfo = (double *) calloc(MaxItem+1, sizeof(double));

    Weight = (ItemCount *) calloc(MaxItem+1, sizeof(ItemCount));

    Freq  = (ItemCount **) calloc(MaxDiscrVal+1, sizeof(ItemCount *));
    ForEach(v, 0, MaxDiscrVal)
    {
	Freq[v]  = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
    }

    ValFreq = (ItemCount *) calloc(MaxDiscrVal+1, sizeof(ItemCount));
    ClassFreq = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));

    Slice1 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));
    Slice2 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));

    UnknownRate = (double *) calloc(MaxAtt+1, sizeof(double));
}



/*************************************************************************/
/*                                                                       */
/*              Initialise the weight of each item                       */
/*                                                                       */
/*************************************************************************/


void InitialiseWeights()
/*  -----------------  */
{
    ItemNo i;

    ForEach(i, 0, MaxItem)
    {
	Weight[i] = 1.0;
    }
}



/*************************************************************************/
/*                                                                       */
/*  Build a decision tree for the cases Fp through Lp:                   */
/*                                                                       */
/*  - if all cases are of the same class, the tree is a leaf and so      */
/*      the leaf is returned labelled with this class                    */
/*                                                                       */
/*  - for each attribute, calculate the potential information provided   */
/*      by a test on the attribute (based on the probabilities of each   */
/*      case having a particular value for the attribute), and the gain  */
/*      in information that would result from a test on the attribute    */
/*      (based on the probabilities of each case with a particular       */
/*      value for the attribute being of a particular class)             */
/*                                                                       */
/*  - on the basis of these figures, and depending on the current        */
/*      selection criterion, find the best attribute to branch on.       */
/*      Note:  this version will not allow a split on an attribute       */
/*      unless two or more subsets have at least MINOBJS items.          */
/*                                                                       */
/*  - try branching and test whether better than forming a leaf          */
/*                                                                       */
/*************************************************************************/


Tree FormTree(ItemNo Fp, ItemNo Lp)
/*   ---------  */
{ 
    ItemNo i, Kp, Ep;
    ItemCount Cases, NoBestClass, KnownCases;
    double Factor, BestVal, Val, AvGain=0;
    Attribute Att, BestAtt, Possible=0;
    ClassNo c, BestClass;
    Tree Node;
    DiscrValue v;
    Boolean PrevAllKnown;

    Cases = CountItems(Fp, Lp);

    /*  Generate the class frequency distribution  */

    ForEach(c, 0, MaxClass)
    {
	ClassFreq[c] = 0;
    }
    ForEach(i, Fp, Lp)
    { 
	ClassFreq[ Class(Item[i]) ] += Weight[i];
    } 

    /*  Find the most frequent class  */

    BestClass = 0;
    ForEach(c, 0, MaxClass)
    {
	if ( ClassFreq[c] > ClassFreq[BestClass] )
	{
	    BestClass = c;
	}
    }
    NoBestClass = ClassFreq[BestClass];

    Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass);

    /*  If all cases are of the same class or there are not enough
	cases to divide, the tree is a leaf  */

    if ( NoBestClass == Cases  || Cases < 2 * MINOBJS )
    { 
	return Node;
    } 

    Verbosity(1)
	{
		fprintf(fpScreen,"\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases);
		printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases);
	}

    /*  For each available attribute, find the information and gain  */

    ForEach(Att, 0, MaxAtt) 
    { 
	Gain[Att] = -Epsilon;

	if ( SpecialStatus[Att] == IGNORE ) continue;

	if ( MaxAttVal[Att] )
	{
	    /*  discrete valued attribute  */

	    if ( SUBSET && MaxAttVal[Att] > 2 )
	    {
		EvalSubset(Att, Fp, Lp, Cases);
	    }
	    else
	    if ( ! Tested[Att] )
	    {
		EvalDiscreteAtt(Att, Fp, Lp, Cases);
	    }
	}
	else
	{ 
	    /*  continuous attribute  */

	    EvalContinuousAtt(Att, Fp, Lp);
	} 

	/*  Update average gain, excluding attributes with very many values  */

	if ( Gain[Att] >= 0 && ( SUBSET || MaxAttVal[Att] < 0.3 * MaxItem ) )
	{
	    Possible++;
	    AvGain += Gain[Att];
	}
    } 

    /*  Find the best attribute according to the given criterion  */

    BestVal = -Epsilon;
    BestAtt = None;
    AvGain  = ( Possible ? AvGain / Possible : 1E6 );

    Verbosity(2) 
	{
		fprintf(fpScreen,"\taverage gain %.3f\n", AvGain);
		printf("\taverage gain %.3f\n", AvGain);
	}

    ForEach(Att, 0, MaxAtt) 
    { 
	if ( Gain[Att] >= 0 )
	{ 
	    Val = Worth(Info[Att], Gain[Att], AvGain);
	    if ( Val > BestVal ) 
	    { 
		BestAtt  = Att; 
		BestVal = Val;
	    } 
	} 
    } 

    /*  Decide whether to branch or not  */ 

    if ( BestAtt != None )
    { 
	Verbosity(1)
	{
	    fprintf(fpScreen,"\tbest attribute %s", AttName[BestAtt]);
		printf("\tbest attribute %s", AttName[BestAtt]);
	    if ( ! MaxAttVal[BestAtt] )
	    {
			fprintf(fpScreen," cut %.3f", Bar[BestAtt]);
			printf(" cut %.3f", Bar[BestAtt]);
	    }
	    fprintf(fpScreen," inf %.3f gain %.3f val %.3f\n",
		   Info[BestAtt], Gain[BestAtt], BestVal);
		printf(" inf %.3f gain %.3f val %.3f\n",
		   Info[BestAtt], Gain[BestAtt], BestVal);
	}       

	/*  Build a node of the selected test  */

	if ( MaxAttVal[BestAtt] )
	{
	    /*  Discrete valued attribute  */

	    if ( SUBSET && MaxAttVal[BestAtt] > 2 )
	    {
		SubsetTest(Node, BestAtt);
	    }
	    else
	    {
		DiscreteTest(Node, BestAtt);
	    }
	}
	else
	{ 
	    /*  Continuous attribute  */

	    ContinTest(Node, BestAtt);
	} 

	/*  Remove unknown attribute values  */

	PrevAllKnown = AllKnown;

	Kp = Group(0, Fp, Lp, Node) + 1;
	if ( Kp != Fp ) AllKnown = false;
	KnownCases = Cases - CountItems(Fp, Kp-1);
	UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001);

	Verbosity(1)
	{
	    fprintf(fpScreen,"\tunknown rate for %s = %.3f\n",
		   AttName[BestAtt], UnknownRate[BestAtt]);
		printf("\tunknown rate for %s = %.3f\n",
		   AttName[BestAtt], UnknownRate[BestAtt]);
	}

	/*  Recursive divide and conquer  */

	++Tested[BestAtt];

	Ep = Kp - 1;
	Node->Errors = 0;

	ForEach(v, 1, Node->Forks)
	{
	    Ep = Group(v, Kp, Lp, Node);

	    if ( Kp <= Ep )
	    {
		Factor = CountItems(Kp, Ep) / KnownCases;

		ForEach(i, Fp, Kp-1)
		{
		    Weight[i] *= Factor;
		}

		Node->Branch[v] = FormTree(Fp, Ep);
		Node->Errors += Node->Branch[v]->Errors;

		Group(0, Fp, Ep, Node);
		ForEach(i, Fp, Kp-1)
		{
		    Weight[i] /= Factor;
		}
	    }
	    else
	    {
		Node->Branch[v] = Leaf(Node->ClassDist, BestClass, 0.0, 0.0);
	    }
	}

	--Tested[BestAtt];
	AllKnown = PrevAllKnown;

	/*  See whether we would have been no worse off with a leaf  */

	if ( Node->Errors >= Cases - NoBestClass - Epsilon )
	{ 
	    Verbosity(1)
		{
			fprintf(fpScreen,"Collapse tree for %d items to leaf %s\n",
				Lp - Fp + 1, ClassName[BestClass]);
			printf("Collapse tree for %d items to leaf %s\n",
				Lp - Fp + 1, ClassName[BestClass]);
		}

	    Node->NodeType = 0;
	} 
    }
    else
    { 
		Verbosity(1)
		{
			fprintf(fpScreen,"\tno sensible splits  %.1f/%.1f\n",
				Cases, Cases - NoBestClass);
			printf("\tno sensible splits  %.1f/%.1f\n",
				Cases, Cases - NoBestClass);
		}
    } 

    return Node; 
} 



/*************************************************************************/
/*                                                                       */
/*  Group together the items corresponding to branch V of a test         */
/*  and return the index of the last such                                */
/*                                                                       */
/*  Note: if V equals zero, group the unknown values                     */
/*                                                                       */
/*************************************************************************/


ItemNo Group(DiscrValue V, ItemNo Fp, ItemNo Lp, Tree TestNode)
/*     -----  */
{
    ItemNo i;
    Attribute Att;
    double Thresh;
    Set SS;


    Att = TestNode->Tested;

    if ( V )
    {
	/*  Group items on the value of attribute Att, and depending
	    on the type of branch  */

	switch ( TestNode->NodeType )
	{
	    case BrDiscr:

		ForEach(i, Fp, Lp)
		{
		    if ( DVal(Item[i], Att) == V ) Swap(Fp++, i);
		}
		break;

	    case ThreshContin:

		Thresh = TestNode->Cut;
		ForEach(i, Fp, Lp)
		{
		    if ( (CVal(Item[i], Att) <= Thresh) == (V == 1) ) Swap(Fp++, i);
		}
		break;

	    case BrSubset:

		SS = TestNode->Subset[V];
		ForEach(i, Fp, Lp)
		{
		    if ( In(DVal(Item[i], Att), SS) ) Swap(Fp++, i);
		}
		break;
	}
    }
    else
    {
	/*  Group together unknown values  */

	switch ( TestNode->NodeType )
	{
	    case BrDiscr:
	    case BrSubset:

		ForEach(i, Fp, Lp)
		{
		    if ( ! DVal(Item[i], Att) ) Swap(Fp++, i);
		}
		break;

	    case ThreshContin:

		ForEach(i, Fp, Lp)
		{
		    if ( CVal(Item[i], Att) == Unknown ) Swap(Fp++, i);
		}
		break;
	}
    }

    return Fp - 1;
}



/*************************************************************************/
/*                                                                       */
/*      Return the total weight of items from Fp to Lp                   */
/*                                                                       */
/*************************************************************************/


ItemCount CountItems(ItemNo Fp,ItemNo Lp)
/*        ----------  */
{
    register ItemCount Sum=0.0, *Wt, *LWt;

    if ( AllKnown ) return Lp - Fp + 1;

    for ( Wt = Weight + Fp, LWt = Weight + Lp ; Wt <= LWt ; )
    {
	Sum += *Wt++;
    }

    return Sum;
}



/*************************************************************************/
/*                                                                       */
/*              Exchange items at a and b                                */
/*                                                                       */
/*************************************************************************/


void Swap(ItemNo a,ItemNo b)
{
    register Description Hold;
    register ItemCount HoldW;

    Hold = Item[a];
    Item[a] = Item[b];
    Item[b] = Hold;

    HoldW = Weight[a];
    Weight[a] = Weight[b];
    Weight[b] = HoldW;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -