
📄 c4.5gt.c

📁 An implementation of the C4.5 decision tree algorithm in C. Hope it is useful.
💻 C
📖 Page 1 of 5
/*************************************************************************/
/*                                                                       */
/*  Restore old class distribution figures of discrete attribute        */
/*  values x and y from Slice1 and Slice2                                */
/*                                                                       */
/*************************************************************************/

    Uncombine(x, y)
/*  ---------  */
    DiscrValue x, y;
{
    ClassNo c;

    ForEach(c, 0, MaxClass)
    {
        Freq[x][c] = Slice1[c];
        Freq[y][c] = Slice2[c];
    }

    ValFreq[x] = Slice1[MaxClass+1];
    ValFreq[y] = Slice2[MaxClass+1];
}

/*************************************************************************/
/*                                                                       */
/*  Print the values of attribute Att which are in the subset Ss        */
/*                                                                       */
/*************************************************************************/

    PrintSubset(Att, Ss)
/*  -----------  */
    Attribute Att;
    Set Ss;
{
    DiscrValue V1;
    Boolean First=true;

    ForEach(V1, 1, MaxAttVal[Att])
    {
        if ( In(V1, Ss) )
        {
            if ( First )
            {
                First = false;
            }
            else
            {
                printf(", ");
            }

            printf("%s", AttValName[Att][V1]);
        }
    }
}

/*************************************************************************/
/*                                                                       */
/*  Construct and return a node for a test on a subset of values        */
/*                                                                       */
/*************************************************************************/

    SubsetTest(Node, Att)
/*  -----------  */
    Tree Node;
    Attribute Att;
{
    ItemCount CountItems();
    short S, Bytes;

    Sprout(Node, Subsets[Att]);

    Node->NodeType = BrSubset;
    Node->Tested   = Att;
    Node->Errors   = 0;

    Bytes = (MaxAttVal[Att]>>3) + 1;
    Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set));

    ForEach(S, 1, Node->Forks)
    {
        Node->Subset[S] = (Set) malloc(Bytes);
        CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
    }
}

/*************************************************************************/
/*                                                                       */
/*      Prune a decision tree and predict its error rate                */
/*      ------------------------------------------------                */
/*                                                                       */
/*************************************************************************/

extern  ItemCount       *Weight;

Set     *PossibleValues=Nil;
Boolean Changed;

#define LocalVerbosity(x)       if (Sh >= 0 && VERBOSITY >= x)
#define Intab(x)                Indent(x, "| ")

/*************************************************************************/
/*                                                                       */
/*  Prune tree T, returning true if tree has been modified              */
/*                                                                       */
/*************************************************************************/

Boolean Prune(T)
/*      -----  */
    Tree T;
{
    ItemNo i;
    float EstimateErrors();
    Attribute a;

    InitialiseWeights();
    AllKnown = true;

    Verbosity(1) printf("\n");

    Changed = false;

    EstimateErrors(T, 0, MaxItem, 0, true);

    if ( SUBSET )
    {
        if ( ! PossibleValues )
        {
            PossibleValues = (Set *) calloc(MaxAtt+1, sizeof(Set));
        }

        ForEach(a, 0, MaxAtt)
        {
            if ( MaxAttVal[a] )
            {
                PossibleValues[a] = (Set) malloc((MaxAttVal[a]>>3) + 1);
                ClearBits((MaxAttVal[a]>>3) + 1, PossibleValues[a]);

                ForEach(i, 1, MaxAttVal[a])
                {
                    SetBit(i, PossibleValues[a]);
                }
            }
        }

        CheckPossibleValues(T);
    }

    return Changed;
}

/*************************************************************************/
/*                                                                       */
/*      Estimate the errors in a given subtree                          */
/*                                                                       */
/*************************************************************************/

float EstimateErrors(T, Fp, Lp, Sh, UpdateTree)
/*    --------------  */
    Tree T;
    ItemNo Fp, Lp;
    short Sh;
    Boolean UpdateTree;
{
    ItemNo i, Kp, Ep, Group();
    ItemCount Cases, KnownCases, *LocalClassDist, TreeErrors, LeafErrors,
        ExtraLeafErrors, BranchErrors, CountItems(), Factor, MaxFactor,
        AddErrs();
    DiscrValue v, MaxBr;
    ClassNo c, BestClass;
    Boolean PrevAllKnown;

    /*  Generate the class frequency distribution  */

    Cases = CountItems(Fp, Lp);
    LocalClassDist = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));

    ForEach(i, Fp, Lp)
    {
        LocalClassDist[ Class(Item[i]) ] += Weight[i];
    }

    /*  Find the most frequent class and update the tree  */

    BestClass = T->Leaf;

    ForEach(c, 0, MaxClass)
    {
        if ( LocalClassDist[c] > LocalClassDist[BestClass] )
        {
            BestClass = c;
        }
    }

    LeafErrors = Cases - LocalClassDist[BestClass];
    ExtraLeafErrors = AddErrs(Cases, LeafErrors);

    if ( UpdateTree )
    {
        T->Items = Cases;
        T->Leaf  = BestClass;
        memcpy(T->ClassDist, LocalClassDist,
               (MaxClass + 1) * sizeof(ItemCount));
    }

    if ( ! T->NodeType )        /*  leaf  */
    {
        TreeErrors = LeafErrors + ExtraLeafErrors;

        if ( UpdateTree )
        {
            T->Errors = TreeErrors;

            LocalVerbosity(1)
            {
                Intab(Sh);
                printf("%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
                        T->Items, LeafErrors, T->Errors);
            }
        }

        free(LocalClassDist);

        return TreeErrors;
    }

    /*  Estimate errors for each branch  */

    Kp = Group(0, Fp, Lp, T) + 1;
    KnownCases = CountItems(Kp, Lp);

    PrevAllKnown = AllKnown;
    if ( Kp != Fp ) AllKnown = false;

    TreeErrors = MaxFactor = 0;

    ForEach(v, 1, T->Forks)
    {
        Ep = Group(v, Kp, Lp, T);

        if ( Kp <= Ep )
        {
            Factor = CountItems(Kp, Ep) / KnownCases;

            if ( Factor >= MaxFactor )
            {
                MaxBr = v;
                MaxFactor = Factor;
            }

            ForEach(i, Fp, Kp-1)
            {
                Weight[i] *= Factor;
            }

            TreeErrors += EstimateErrors(T->Branch[v], Fp, Ep, Sh+1,
                                         UpdateTree);

            Group(0, Fp, Ep, T);

            ForEach(i, Fp, Kp-1)
            {
                Weight[i] /= Factor;
            }
        }
    }

    AllKnown = PrevAllKnown;

    if ( ! UpdateTree )
    {
        free(LocalClassDist);

        return TreeErrors;
    }

    /*  See how the largest branch would fare  */

    BranchErrors = EstimateErrors(T->Branch[MaxBr], Fp, Lp, -1000, false);

    LocalVerbosity(1)
    {
        Intab(Sh);
        printf("%s:  [%d%%  N=%.2f  tree=%.2f  leaf=%.2f+%.2f  br[%d]=%.2f]\n",
                AttName[T->Tested],
                (int) ((TreeErrors * 100) / (T->Items + 0.001)),
                T->Items, TreeErrors, LeafErrors, ExtraLeafErrors,
                MaxBr, BranchErrors);
    }

    /*  See whether tree should be replaced with leaf or largest branch  */

    if ( LeafErrors + ExtraLeafErrors <= BranchErrors + 0.1 &&
         LeafErrors + ExtraLeafErrors <= TreeErrors + 0.1 )
    {
        LocalVerbosity(1)
        {
            Intab(Sh);
            printf("Replaced with leaf %s\n", ClassName[T->Leaf]);
        }

        T->NodeType = 0;
        T->Errors = LeafErrors + ExtraLeafErrors;
        Changed = true;
    }
    else
    if ( BranchErrors <= TreeErrors + 0.1 )
    {
        LocalVerbosity(1)
        {
            Intab(Sh);
            printf("Replaced with branch %d\n", MaxBr);
        }

        AllKnown = PrevAllKnown;
        EstimateErrors(T->Branch[MaxBr], Fp, Lp, Sh, true);
        memcpy((char *) T, (char *) T->Branch[MaxBr], sizeof(TreeRec));
        Changed = true;
    }
    else
    {
        T->Errors = TreeErrors;
    }

    AllKnown = PrevAllKnown;
    free(LocalClassDist);

    return T->Errors;
}

/*************************************************************************/
/*                                                                       */
/*      Remove unnecessary subset tests on missing values               */
/*                                                                       */
/*************************************************************************/

    CheckPossibleValues(T)
/*  -------------------  */
    Tree T;
{
    Set HoldValues;
    int v, Bytes, b;
    Attribute A;
    char Any=0;

    if ( T->NodeType == BrSubset )
    {
        A = T->Tested;
        Bytes = (MaxAttVal[A]>>3) + 1;
        HoldValues = (Set) malloc(Bytes);

        /*  See if last (default) branch can be simplified or omitted  */

        ForEach(b, 0, Bytes-1)
        {
            T->Subset[T->Forks][b] &= PossibleValues[A][b];
            Any |= T->Subset[T->Forks][b];
        }

        if ( ! Any )
        {
            T->Forks--;
        }

        /*  Process each subtree, leaving only values in branch subset  */

        CopyBits(Bytes, PossibleValues[A], HoldValues);

        ForEach(v, 1, T->Forks)
        {
            CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
            CheckPossibleValues(T->Branch[v]);
        }

        CopyBits(Bytes, HoldValues, PossibleValues[A]);

        free(HoldValues);
    }
    else
    if ( T->NodeType )
    {
        ForEach(v, 1, T->Forks)
        {
            CheckPossibleValues(T->Branch[v]);
        }
    }
}

/*************************************************************************/
/*                                                                       */
/*  Statistical routines for C4.5                                        */
/*  -----------------------------                                        */
/*                                                                       */
/*************************************************************************/

/*************************************************************************/
/*                                                                       */
/*  Compute the additional errors if the error rate increases to the    */
/*  upper limit of the confidence level.  The coefficient is the        */
/*  square of the number of standard deviations corresponding to the    */
/*  selected confidence level.  (Taken from Documenta Geigy Scientific  */
/*  Tables (Sixth Edition), p185 (with modifications).)                  */
/*                                                                       */
/*************************************************************************/

float Val[] = {  0,  0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
      Dev[] = {4.0,  3.09,  2.58,  2.33, 1.65, 1.28, 0.84, 0.25, 0.00};

float AddErrs(N, e)
/*    -------  */
    ItemCount N, e;
{
    static float Coeff=0;
    float Val0, Pr;

    if ( ! Coeff )
    {
        /*  Compute and retain the coefficient value, interpolating from
            the values in Val and Dev  */

        int i;

        i = 0;
        while ( CF > Val[i] ) i++;

        Coeff = Dev[i-1] +
                  (Dev[i] - Dev[i-1]) * (CF - Val[i-1]) / (Val[i] - Val[i-1]);
        Coeff = Coeff * Coeff;
    }

    if ( e < 1E-6 )
    {
        return N * (1 - exp(log(CF) / N));
    }
    else
    if ( e < 0.9999 )
    {
        Val0 = N * (1 - exp(log(CF) / N));

        return Val0 + e * (AddErrs(N, 1.0) - Val0);
    }
    else
    if ( e + 0.5 >= N )
    {
        return 0.67 * (N - e);
    }
    else
    {
        Pr = (e + 0.5 + Coeff/2
                + sqrt(Coeff * ((e + 0.5) * (1 - (e + 0.5)/N) + Coeff/4)) )
             / (N + Coeff);

        return (N * Pr - e);
    }
}

/*************************************************************************/
/*                                                                       */
/*      Soften thresholds for continuous attributes                     */
/*      -------------------------------------------                     */
/*                                                                       */
/*************************************************************************/

Boolean *LHSErr,        /*  Does a misclassification occur with this value of an att  */
        *RHSErr;        /*  if the below or above threshold branches are taken  */

ItemNo  *ThreshErrs;    /*  ThreshErrs[i] is the no. of misclassifications if thresh is i  */

float   *CVals;         /*  All values of a continuous attribute  */

#define Below(v,t)      (v <= t + 1E-6)

/*************************************************************************/
/*                                                                       */
/*  Soften all thresholds for continuous attributes in tree T           */
/*                                                                       */
/*************************************************************************/

    SoftenThresh(T)
/*  ------------  */
    Tree T;
{
    CVals = (float *) calloc(MaxItem+1, sizeof(float));
    LHSErr = (Boolean *) calloc(MaxItem+1, sizeof(Boolean));
    RHSErr = (Boolean *) calloc(MaxItem+1, sizeof(Boolean));
    ThreshErrs = (ItemNo *) calloc(MaxItem+1, sizeof(ItemNo));

    InitialiseWeights();

    ScanTree(T, 0, MaxItem);

    free(ThreshErrs);
    free(RHSErr);
    free(LHSErr);
    free(CVals);
}

/*************************************************************************/
/*                                                                       */
/*  Calculate upper and lower bounds for each test on a continuous      */
/*  attribute in tree T, using data items from Fp to Lp                 */
/*                                                                       */
/*************************************************************************/

    ScanTree(T, Fp, Lp)
/*  --------  */
    Tree T;
    ItemNo Fp, Lp;
{
    short v;
    float Val, Se, Limit, Lower, Upper, GreatestValueBelow();
    ItemNo i, Kp, Ep, LastI, Errors, BaseErrors;
    ClassNo CaseClass, Class1, Class2, Category();
    Boolean LeftThresh=false;
    Description CaseDesc;
    Attribute Att;
    void Swap();

    /*  Stop when get to a leaf  */

    if ( ! T->NodeType ) return;

    /*  Group the unknowns together  */

    Kp = Group(0, Fp, Lp, T);

    /*  Soften a threshold for a continuous attribute  */

    Att = T->Tested;

    if ( T->NodeType == ThreshContin )
    {
        prin
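The listing leans on a small bit-set toolkit (In, SetBit, ClearBits, CopyBits) defined elsewhere in the C4.5 sources. The sketch below is not Quinlan's code, only a minimal reconstruction of the semantics this page relies on: a Set is a byte array carrying one bit per discrete attribute value, allocated with (MaxAttVal[Att]>>3) + 1 bytes as in SubsetTest above.

/* Hypothetical stand-ins for the Set helpers used by SubsetTest,
   Prune and CheckPossibleValues.  Bit b of set s marks discrete
   value b as present. */

#include <stdio.h>
#include <string.h>

typedef char *Set;

#define In(b, s)      ((s)[(b) >> 3] & (1 << ((b) & 7)))   /* test bit b  */
#define SetBit(b, s)  ((s)[(b) >> 3] |= (1 << ((b) & 7)))  /* set bit b   */

static void ClearBits(int bytes, Set s)            /* zero a whole set    */
{
    memset(s, 0, bytes);
}

static void CopyBits(int bytes, Set src, Set dst)  /* duplicate a set     */
{
    memcpy(dst, src, bytes);
}

int main(void)
{
    char storage[2] = {0, 0};      /* room for values 0..15 */
    Set s = storage;

    SetBit(3, s);
    printf("bit 3 %s, bit 4 %s\n", In(3, s) ? "on" : "off",
                                   In(4, s) ? "on" : "off");
    return 0;
}

With this layout, In(V1, Ss) in PrintSubset simply tests whether value V1's bit is on in subset Ss, and the (MaxAttVal>>3) + 1 sizing gives one byte per eight values, rounded up.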
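The heart of EstimateErrors is the three-way comparison near its end: keep the subtree as is, collapse it to a leaf, or hoist its largest branch, whichever promises the fewest estimated errors, with a 0.1 slack that favours the simpler form on near-ties. Below is a hedged restatement as a pure function; the names Replacement and choose are mine, not C4.5's, and the figures in main are illustrative only.

/* The replacement rule from the end of EstimateErrors(), isolated
   from the tree mutation it triggers in the real routine. */

#include <stdio.h>

enum Replacement { KEEP_TREE, AS_LEAF, AS_BRANCH };

static enum Replacement choose(double leaf_errs,    /* LeafErrors + AddErrs() */
                               double tree_errs,    /* summed branch errors   */
                               double branch_errs)  /* largest branch, re-run */
{
    if ( leaf_errs <= branch_errs + 0.1 && leaf_errs <= tree_errs + 0.1 )
        return AS_LEAF;     /* source sets T->NodeType = 0            */
    if ( branch_errs <= tree_errs + 0.1 )
        return AS_BRANCH;   /* source memcpys the branch node over T  */
    return KEEP_TREE;       /* source keeps T->Errors = TreeErrors    */
}

int main(void)
{
    /* A subtree estimated at 3.4 errors, a leaf at 3.2, the largest
       branch at 3.6: the leaf wins, so the subtree would be pruned. */
    printf("%d\n", choose(3.2, 3.4, 3.6));   /* prints 1 (AS_LEAF) */
    return 0;
}

In the real routine the winning action also mutates the tree in place and sets Changed, which is how Prune() reports that the tree was modified.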
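AddErrs supplies the pessimistic estimate those comparisons run on: treat the e errors observed among N cases at a leaf as a binomial sample, and ask how many errors the upper limit of the CF confidence interval would imply. The standalone sketch below re-derives only the general case (1 <= e and e + 0.5 < N); the CF value of 0.25, the interpolated deviate of roughly 0.69, and the sample figures are illustrative assumptions, not values taken from this listing.

/* Standalone sketch of the pessimistic-error formula from AddErrs().
   coeff is hard-coded for CF = 0.25, interpolating between the 0.20
   and 0.40 rows of the Val/Dev tables above (z ~ 0.69, an assumption
   of this sketch).  Compile with: cc demo.c -lm */

#include <stdio.h>
#include <math.h>

/* Extra errors implied by the upper confidence limit for a leaf
   covering n cases with e observed errors (general case only). */
static double added_errors(double n, double e, double coeff)
{
    double pr = (e + 0.5 + coeff / 2
                 + sqrt(coeff * ((e + 0.5) * (1 - (e + 0.5) / n)
                                 + coeff / 4)))
                / (n + coeff);

    return n * pr - e;
}

int main(void)
{
    double z = 0.69;            /* deviate for CF = 0.25 (assumption) */
    double coeff = z * z;

    /* A leaf with 14 cases and 2 errors: its pessimistic error count
       is the observed 2 plus the extra computed here. */
    double extra = added_errors(14.0, 2.0, coeff);

    printf("extra = %.3f, pessimistic total = %.3f\n", extra, 2.0 + extra);
    return 0;
}

Because the extra term shrinks as N grows, a leaf backed by many cases is penalised far less than one backed by a handful, which is what lets this pruning collapse poorly supported subtrees.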
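One last detail before the listing breaks off (this is page 1 of 5): the Below(v,t) macro compares a continuous value against a threshold with a 1E-6 tolerance, so a case whose value equals the threshold up to floating-point rounding still takes the "below or equal" branch. A trivial check of that behaviour:

/* Tolerance behaviour of the Below(v,t) macro defined above.  The
   1E-6 slack absorbs rounding error in values that are meant to be
   exactly on the threshold. */

#include <stdio.h>

#define Below(v,t)      (v <= t + 1E-6)

int main(void)
{
    double t = 0.3;
    double v = 0.1 + 0.2;       /* 0.30000000000000004 in binary      */

    printf("plain  v <= t : %d\n", v <= t);       /* 0: rounding bites */
    printf("Below(v, t)   : %d\n", Below(v, t));  /* 1: slack absorbs  */
    return 0;
}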
