⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 c45dt.cpp

📁 这是一个改进的C4.5决策数算法C++类
💻 CPP
📖 第 1 页 / 共 5 页
字号:
/*************************************************************************/
/*									 */
/*  Given tables Freq[][] and ValFreq[], compute the information gain.	 */
/*									 */
/*	    Parameters:							 */
/*		BaseInfo:	average information for all items with	 */
/*				known values of the test attribute	 */
/*		UnknownRate:	fraction of items with unknown ditto	 */
/*		MaxVal:		number of forks	,determined by MaxAttVal[Att]			 */
/*		TotalItems:	number of items with known values of	 */
/*				test att				 */
/*									 */
/*  where Freq[x][y] contains the no. of cases with vafor lue x a	 */
/*  particular attribute that are members of class y,			 */
/*  and ValFreq[x] contains the no. of cases with value x for a		 */
/*  particular attribute						 */
/*									 */
/*
Gain[Att] = ComputeGain(DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]),
			    UnknownRate[Att], MaxAttVal[Att], KnownItems);
*/
/*************************************************************************/

float C45DT::ComputeGain(float BaseInfo,float UnknFrac,DiscrValue MaxVal,ItemCount TotalItems)
{
    DiscrValue v;
    float ThisInfo=0.0, ThisGain;
    short ReasonableSubsets=0;

    /*  Check whether all values are unknown or the same  */

    if ( ! TotalItems ) return (float)-Epsilon;/*如果已知属性值样本数为0*/

    
	/*subsets are divided by value of the attitude*/
    ForEach(v, 1, MaxVal)
    {
		if ( ValFreq[v] >= MINOBJS ) ReasonableSubsets++;
    }
    if ( ReasonableSubsets < 2 ) return (float)-Epsilon;/*  There must be at least two subsets with MINOBJS items  */
	
    /*  Compute total info after split, by summing the
	info of each of the subsets formed by the test  */

    ForEach(v, 1, MaxVal)
    {
		ThisInfo += TotalInfo(Freq[v], 0, MaxClass);
    }

    /*  Set the gain in information for all items, adjusted for unknowns  */

    ThisGain = (1 - UnknFrac) * (BaseInfo - ThisInfo / TotalItems);

    Verbosity(5)
        printf("ComputeThisGain: items %.1f info %.3f base %.3f unkn %.3f result %.3f\n",
    		TotalItems + ValFreq[0], ThisInfo, BaseInfo, UnknFrac, ThisGain);

    return ThisGain;
}



/*************************************************************************/
/*									 */
/*  Compute the total information in V[ MinVal..MaxVal ]		 */
/*									 */
/*************************************************************************/

/*Info[Att] = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;*/

float C45DT::TotalInfo(ItemCount V[],DiscrValue MinVal,DiscrValue MaxVal)
{
    DiscrValue v;
    float Sum=0.0;
    ItemCount N, TotalItems=0;

    ForEach(v, MinVal, MaxVal)
    {
		N = V[v];

		Sum +=(float)( N * Log(N));
		TotalItems += N;
    }

    return (float)(TotalItems*Log(TotalItems) - Sum);
}

/*************************************************************************/
/*									 */
/*	Print distribution table for given attribute			 */
/*									 */
/*************************************************************************/

void C45DT::PrintDistribution(Attribute Att,DiscrValue MaxVal,Boolean ShowNames)
{
    DiscrValue v;
    ClassNo c;
    C45_String Val;

    printf("\n\t\t\t ");
    ForEach(c, 0, MaxClass)
    {
		printf("%7.6s", ClassName[c]);
    }
    printf("\n");

    ForEach(v, 0, MaxVal)
    {
		if ( ShowNames )
		{
			Val = ( !v ? "unknown" :
				MaxAttVal[Att] ? AttValName[Att][v] :
				v == 1 ? "below" : "above" );
			printf("\t\t[%-7.7s:", Val);
		}
		else
		{
			printf("\t\t[%-7d:", v);
		}

		ForEach(c, 0, MaxClass)
		{
			printf(" %6.1f", Freq[v][c]);
		}

		printf("]\n");
    }
}

/*************************************************************************/
/*									 */
/*	Sorting utilities						 */
/*	-----------------						 */
/*	sort.c								 */
/*************************************************************************/
/*************************************************************************/
/*									 */
/*	Sort items from Fp to Lp on attribute a				 */
/*									 */
/*************************************************************************/

void C45DT::Quicksort(ItemNo Fp,ItemNo Lp,Attribute Att)
{
//    register ItemNo Lower, Middle;
//    register float Thresh;
//    register ItemNo i;
//    register Description Hold;
//    register ItemCount HoldW;
    register ItemNo Lower, Middle;
    register float Thresh;
    register ItemNo i;
	
    Description Hold;
    ItemCount HoldW;

    if ( Fp < Lp )
    {
	Thresh = CVal(Item[Lp], Att);

	/* 隔离Isolate all items with values <= threshold  */
	/*Item[Middle]之前的值<= threshold*/
	Middle = Fp;

	for ( i = Fp ; i < Lp ; i++ )
	{ 
	    if ( CVal(Item[i], Att) <= Thresh )
	    { 
			if ( i != Middle ) 
			{
				Hold = Item[Middle];
				Item[Middle] = Item[i];
				Item[i] = Hold;

				HoldW = Weight[Middle];
				Weight[Middle] = Weight[i];
				Weight[i] = HoldW;
			}
			Middle++; 
	    } 
	} 

	/*  Extract all values equal to the threshold  */


	Lower = Middle - 1;

	for ( i = Lower ; i >= Fp ; i-- )
	{
	    if ( CVal(Item[i], Att) == Thresh )
	    { 
			if ( i != Lower ) 
			{
//				(*Exchange)(Lower, i);
				Hold = Item[Lower];
				Item[Lower] = Item[i];
				Item[i] = Hold;

				HoldW = Weight[Lower];
				Weight[Lower] = Weight[i];
				Weight[i] = HoldW;
			}
			Lower--;
	    } 
	} 

	/*  Item[Fp]- Item[Lower]: <= threshold
		Item[Lower]- Item[Middle]: = threshold
		Item[Middle]- Item[Lp]: > threshold
	*/

	/*  Sort the lower values  */

	Quicksort(Fp, Lower, Att);

	/*  Position the middle element  */

//	(*Exchange)(Middle, Lp);

    Hold = Item[Middle];
    Item[Middle] = Item[Lp];
    Item[Lp] = Hold;

    HoldW = Weight[Middle];
    Weight[Middle] = Weight[Lp];
    Weight[Lp] = HoldW;

	/*  Sort the higher values  */

	Quicksort(Middle+1, Lp, Att);
    }
}

/*************************************************************************/
/*									 */
/*	Routines for displaying, building, saving and restoring trees	 */
/*	-------------------------------------------------------------	 */
/*	trees.c								 */
/*************************************************************************/

/*	by wuxing, 2003,4,21								 */

void  C45DT::PrintHeaderToFile(char *Title)
{
    char TitleLine[80];
    time_t clock;
	short Underline;

    time(&clock);
    sprintf(TitleLine, "C4.5 [release %s] %s", "8.1", Title);
	
	fprintf(TRf,"\n%s\t%s", TitleLine, ctime(&clock));


    Underline = strlen(TitleLine);
	while ( Underline-- ) fprintf(TRf,"%c",'-');
	
	fprintf(TRf,"%s","\n");
	//-------------------------------------------------------------

	fprintf(TRf,"\nRead %d cases (%d attributes) from %s.data\n",
	   MaxItem+1, MaxAtt+1, FileName);

}

void C45DT::PrintHeaderToBuffer(char *Title)
{
    char TitleLine[80];
    time_t clock;
	short Underline;
	char buffer[100];

    time(&clock);
    sprintf(TitleLine, "标准C4.5算法[版本 %s] %s", "8.1", Title);
	
	sprintf(buffer,"\n%s\t%s", TitleLine, ctime(&clock));
	strcat(sShowTree,buffer);


    Underline = strlen(TitleLine);
	while ( Underline-- ) 
	{
		sprintf(buffer,"%c",'-');
		strcat(sShowTree,buffer);
	}
	
	sprintf(buffer,"%s","\n");
	strcat(sShowTree,buffer);
	//-------------------------------------------------------------

	sprintf(buffer,"\n学习样本 %d 个 (%d 属性)\n",MaxItem+1, MaxAtt+1);
	strcat(sShowTree,buffer);
}

/*************************************************************************/
/*									 */
/*	Print a node T with offset Sh, branch value v, and continue	 */
/*									 */
/*************************************************************************/


void C45DT::ShowBranch(short Sh,Tree T,DiscrValue v)
{
    DiscrValue Pv, Last;
    Attribute Att;
    Boolean FirstValue;
    short TextWidth, Skip, Values=0, i;
    
    Att = T->Tested;

    switch ( T->NodeType )//非叶节点
    {
	case BrDiscr:

	    Indent(Sh, Tab);

	    printf("%s = %s:", AttName[Att], AttValName[Att][v]);
	    break;

	case ThreshContin:

	    Indent(Sh, Tab);

	    printf("%s %s %g ",AttName[Att], ( v == 1 ? "<=" : ">" ), T->Cut);

	    if ( T->Lower != T->Upper )
	    {
			printf("[%g,%g]", T->Lower, T->Upper);
	    }

	    printf(":");
	    break;

	case BrSubset:

	    /*  Count values at this branch  */

	    ForEach(Pv, 1, MaxAttVal[Att])
	    {
			if ( In(Pv, T->Subset[v]) )
			{
				Last = Pv;
				Values++;
			}
	    }
	    if ( ! Values ) return;

	    Indent(Sh, Tab);

	    if ( Values == 1 )
	    {
			printf("%s = %s:", AttName[Att], AttValName[Att][Last]);
			break;
	    }

	    printf("%s in {", AttName[Att]);
	    FirstValue = DTTrue;
	    Skip = TextWidth = strlen(AttName[Att]) + 5;

	    ForEach(Pv, 1, MaxAttVal[Att])
	    {
			if ( In(Pv, T->Subset[v]) )
			{
				if ( ! FirstValue &&
				 TextWidth + strlen(AttValName[Att][Pv]) + 11 > Width )
				{
		  			Indent(Sh, Tab);
					ForEach(i, 1, Skip) putchar(' ');

					TextWidth = Skip;
					FirstValue = DTTrue;
				}

				printf("%s%c", AttValName[Att][Pv], Pv == Last ? '}' : ',');
				TextWidth += strlen(AttValName[Att][Pv]) + 1;
				FirstValue = DTFalse;
			}
	    }
	    putchar(':');
		break;
    }

	//有待进一步分析
    Show(T->Branch[v], Sh+1);//显示分支的下一层,递归调用
	//Show(T, Sh+1);//显示分支的下一层,递归调用
}

/*************************************************************************/
/*									 */
/*	Print a node T with offset Sh, branch value v, and continue	to file */
/*	by wuxing, 2003,4,21								 */
/*									 */
/*************************************************************************/

void C45DT::ShowBranchToFile(short Sh,Tree T,DiscrValue v)
{
    DiscrValue Pv, Last;
    Attribute Att;
    Boolean FirstValue;
    short TextWidth, Skip, Values=0, i;
    
    Att = T->Tested;

    switch ( T->NodeType )//非叶节点
    {
	case BrDiscr:

	    IndentToFile(Sh, Tab);

	    fprintf(TRf,"%s = %s:", AttName[Att], AttValName[Att][v]);
	    break;

	case ThreshContin:

	    IndentToFile(Sh, Tab);

	    fprintf(TRf,"%s %s %g ",AttName[Att], ( v == 1 ? "<=" : ">" ), T->Cut);

	    if ( T->Lower != T->Upper )
	    {
			fprintf(TRf,"[%g,%g]", T->Lower, T->Upper);
	    }

	    fprintf(TRf,"%s",":");
	    break;

	case BrSubset:

	    /*  Count values at this branch  */

	    ForEach(Pv, 1, MaxAttVal[Att])
	    {
			if ( In(Pv, T->Subset[v]) )
			{
				Last = Pv;
				Values++;
			}
	    }
	    if ( ! Values ) return;

	    IndentToFile(Sh, Tab);

	    if ( Values == 1 )
	    {
			fprintf(TRf,"%s = %s:", AttName[Att], AttValName[Att][Last]);
			break;
	    }

	    fprintf(TRf,"%s in {", AttName[Att]);
	    FirstValue = DTTrue;
	    Skip = TextWidth = strlen(AttName[Att]) + 5;

	    ForEach(Pv, 1, MaxAttVal[Att])
	    {
			if ( In(Pv, T->Subset[v]) )
			{
				if ( ! FirstValue &&
				 TextWidth + strlen(AttValName[Att][Pv]) + 11 > Width )
				{
		  			IndentToFile(Sh, Tab);
					ForEach(i, 1, Skip) fprintf(TRf,"%c",' ');

					TextWidth = Skip;
					FirstValue = DTTrue;
				}

				fprintf(TRf,"%s%c", AttValName[Att][Pv], Pv == Last ? '}' : ',');
				TextWidth += strlen(AttValName[Att][Pv]) + 1;
				FirstValue = DTFalse;
			}
	    }
	    fprintf(TRf,"%c",':');
		break;
    }

	//有待进一步分析
    ShowToFile(T->Branch[v], Sh+1);//显示分支的下一层,递归调用
	//Show(T, Sh+1);//显示分支的下一层,递归调用
}

void C45DT::ShowBranchToBuffer(short Sh,Tree T,DiscrValue v)
{
    DiscrValue Pv, Last;
    Attribute Att;
    Boolean FirstValue;
    short TextWidth, Skip, Values=0, i;
	char buffer[100];
    
    Att = T->Tested;

    switch ( T->NodeType )//非叶节点
    {
	case BrDiscr:

	    IndentToBuffer(Sh, Tab);

	    sprintf(buffer,"%s = %s:", AttName[Att], AttValName[Att][v]);
		strcat(sShowTree,buffer);
	    break;

	case ThreshContin:

	    IndentToBuffer(Sh, Tab);

	    sprintf(buffer,"%s %s %g ",AttName[Att], ( v == 1 ? "<=" : ">" ), T->Cut);
		strcat(sShowTree,buffer);
	    if ( T->Lower != T->Upper )
	    {
			sprintf(buffer,"[%g,%g]", T->Lower, T->Upper);
			strcat(sShowTree,buffer);
	    }

	    sprintf(buffer,"%s",":");
		strcat(sShowTree,buffer);
	    break;

	case BrSubset:

	    /*  Count values at this branch  */

	    ForEach(Pv, 1, MaxAttVal[Att])
	    {
			if ( In(Pv, T->Subset[v]) )
			{
				Last = Pv;
				Values++;
			}
	    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -