⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 subset.c

📁 这是一个决策树实现的算法
💻 C
字号:
/*************************************************************************/
/*									 */
/*      Evaluation of the subsetting of a discrete attribute		 */
/*      ----------------------------------------------------		 */
/*									 */
/*************************************************************************/


#include "buildex.i"


ItemCount
	*Slice1,	/* Slice1[c]    = saved values of Freq[x][c] in subset.c */
	*Slice2;	/* Slice2[c]    = saved values of Freq[y][c] */

Set
	**Subset;	/* Subset[a][s] = subset s for att a */

short
	*Subsets;	/* Subsets[a] = no. subsets for att a */



/*************************************************************************/
/*									 */
/*  Evaluate subsetting a discrete attribute and form the chosen	 */
/*  subsets Subset[Att][], setting Subsets[Att] to the number of	 */
/*  subsets, and the Info[] and Gain[] of a test on the attribute	 */
/*									 */
/*************************************************************************/


    EvalSubset(Att, Fp, Lp, Items)
/*  ----------  */ 
    Attribute Att;
    ItemNo Fp, Lp; 
    ItemCount Items;
{ 
    DiscrValue V1, V2, BestV1, BestV2, Barred;
    ItemCount KnownItems;
    ClassNo c;
    float BaseInfo, MinGain, ThisGain, ThisInfo,
	Val, BestVal, BestGain, BestInfo,
	PrevVal, PrevGain, PrevInfo,
	DiscrKnownBaseInfo(), Worth(), ComputeGain(), TotalInfo();
    short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b;
    Boolean MergedSubsets = false;
    int SaveMINOBJS;

    SaveMINOBJS = MINOBJS;
    MINOBJS = 1;

    /*  First compute Freq[][], ValFreq[], base info, and the gain
	and total info of a split on discrete attribute Att  */

    ComputeFrequencies(Att, Fp, Lp);

    KnownItems = Items - ValFreq[0];
    if ( KnownItems < Epsilon )
    {
	Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]);

	Gain[Att] = -Epsilon;
	Info[Att] = 0;
	return;
    }

    BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]);

    PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att],KnownItems);
    PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;
    PrevVal = Worth(PrevInfo, PrevGain, Epsilon);

    Verbosity(2)
    {
	printf("\tAtt %s", AttName[Att]);

	Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true);

	printf("\tinf %.3f, gain %.3f, val=%.3f\n",
		PrevInfo, PrevGain, PrevVal);
    }

    /*  Eliminate unrepresented attribute values from Freq[] and ValFreq[]
	and form a separate subset for each represented attribute value  */

    Bytes = (MaxAttVal[Att]>>3) + 1;
    ClearBits(Bytes, Subset[Att][0]);

    ForEach(V1, 1, MaxAttVal[Att])
    {
	if ( ValFreq[V1] > 0.5 )
	{
	    if ( ++Blocks < V1 )
	    {
		ValFreq[Blocks] = ValFreq[V1];
		ForEach(c, 0, MaxClass)
		{
		    Freq[Blocks][c] = Freq[V1][c];
		}
	    }
	    ClearBits(Bytes, Subset[Att][Blocks]);
	    SetBit(V1, Subset[Att][Blocks]);
	}
	else
	{
	    SetBit(V1, Subset[Att][0]);
	    MissingValues++;
	}
    }

    /*  Merge any single-class subsets with others of the same class  */
    /*  Note: have ValFreq[V] > 0 for all V  */

    ForEach(V1, 1, Blocks-1)
    {
	for ( c = 0 ; Freq[V1][c] < 0.1 ; c++ )
	    ;

	if ( Freq[V1][c] < ValFreq[V1] - 0.1 ) continue;

	/*  Now have a single class -- look for others  */

	for ( V2 = V1+1 ; V2 <= Blocks ; )
	{
	    if ( Freq[V2][c] < ValFreq[V2] - 0.1 )
	    {
		V2++;
	    }
	    else
	    {
		/*  Merge these subsets  */

		Combine(V1, V2, Blocks);

		ForEach(b, 0, Bytes-1)
		{
		    Subset[Att][V1][b] |= Subset[Att][V2][b];
		    Subset[Att][V2][b] = Subset[Att][Blocks][b];
		}

		Blocks--;
		MergedSubsets = true;
	    }
	}
    }

    if ( MergedSubsets )
    {
	PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
	PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items;
	PrevVal = Worth(PrevInfo, PrevGain, Epsilon);

	Verbosity(2)
	{
	    printf("\tAfter merging single-class subsets:");

	    Verbosity(3) PrintDistribution(Att, Blocks, false);

	    printf("\tinf %.3f, gain %.3f, val=%.3f\n",
		    PrevInfo, PrevGain, PrevVal);
	}
    }

    /*  Examine possible pair mergers and hill-climb  */

    MinGain = PrevGain / 2;

    while ( Blocks > 2 )
    {
	BestVal = BestV1 = 0;
	BestGain = -Epsilon;

	/*  Check reasonable subsets; if less than 3, bar mergers
	    involving the largest block  */

	ReasonableSubsets = 0;
	Barred = 1;

	ForEach(V1, 1, Blocks)
	{
	    if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++;

	    if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1;
	}

	if ( ReasonableSubsets >= 3 ) Barred = 0;

	/*  For each possible pair of values, calculate the gain and
	    total info of a split in which they are treated as one.
	    Keep track of the pair with the best gain.  */

	ForEach(V1, 1, Blocks-1)
	{
	    ForEach(V2, V1+1, Blocks)
	    {
		if ( V1 == Barred || V2 == Barred ) continue;

		Combine(V1, V2, Blocks);

		ThisGain = ComputeGain(BaseInfo, UnknownRate[Att],
					Blocks-1, KnownItems);
		ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items;
		Val      = Worth(ThisInfo, ThisGain, Epsilon);

		Verbosity(4)
		{
		    printf("\tcombine %d %d info %.3f gain %.3f val %.3f",
		           V1, V2, ThisInfo, ThisGain, Val);
		    PrintDistribution(Att, Blocks-1, false);
		}

		/*  Force a split if
			less than two reasonable subsets, or
			using GAIN criterion
		    Prefer this split to the previous one if
			gain >= MinGain (and previous < MinGain), or
			val >= previous best val  */

		if ( ThisGain >= MinGain && BestGain < MinGain ||
		     Val >= BestVal ||
		     ! BestV1 && ( ! GAINRATIO || ReasonableSubsets < 2 ) )
		{
		    BestVal  = Val;
		    BestGain = ThisGain;
		    BestInfo = ThisInfo;
		    BestV1   = V1;
		    BestV2   = V2;
		}

		Uncombine(V1, V2);
	    }
	}

	if ( GAINRATIO &&
	     ReasonableSubsets >= 2 &&
	     ( ! BestV1 ||
	       BestVal < PrevVal + 1E-5 ||
	       BestVal == PrevVal && BestGain < PrevGain ) ) break;

	PrevGain = BestGain;
	PrevInfo = BestInfo;
        PrevVal = BestVal;

	Combine(BestV1, BestV2, Blocks);

	ForEach(b, 0, Bytes-1)
	{
	    Subset[Att][BestV1][b] |= Subset[Att][BestV2][b];
	    Subset[Att][BestV2][b] = Subset[Att][Blocks][b];
	}

	Blocks--;

	Verbosity(2)
	{
	    printf("\t\tform subset ");
	    PrintSubset(Att, Subset[Att][BestV1]);
	    printf(": %d subsets, inf %.3f, gain %.3f, val %.3f\n",
		   Blocks, BestInfo, BestGain, BestVal);
	    Verbosity(3)
	    {
		printf("\t\tcombine %d, %d", BestV1, BestV2);
		PrintDistribution(Att, Blocks, false);
	    }
	}
    }

    MINOBJS = SaveMINOBJS;

    if ( PrevVal <= 0 )
    {
	Gain[Att] = -Epsilon;
	Info[Att] = 0;
    }
    else
    {
	Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
	Info[Att] = PrevInfo;

	if ( MissingValues )
	{
	    Blocks++;
	    CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]);
	}

	Subsets[Att] = Blocks;

	Verbosity(2) printf("\tFinal subsets:");
	Verbosity(3) PrintDistribution(Att, Blocks, false);
	Verbosity(2)
	    printf("\tinf %.3f gain %.3f val %.3f\n", 
		   Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon));
    }
}



/*************************************************************************/
/*									 */
/*  Combine the distribution figures of discrete attribute values	 */
/*  x and y, putting the combined figures in Freq[x][] and		 */
/*  ValFreq[x][], and saving old values in Slice1 and Slice2		 */
/*									 */
/*************************************************************************/


    Combine(x, y, Last)
/*  -------  */
    DiscrValue x, y, Last;
{
    ClassNo c;

    ForEach(c, 0, MaxClass)
    {
	Slice1[c] = Freq[x][c];
	Slice2[c] = Freq[y][c];

	Freq[x][c] += Freq[y][c];
	Freq[y][c]  = Freq[Last][c];
    }

    Slice1[MaxClass+1] = ValFreq[x];
    Slice2[MaxClass+1] = ValFreq[y];

    ValFreq[x] += ValFreq[y];
    ValFreq[y]  = ValFreq[Last];
}



/*************************************************************************/
/*									 */
/*  Restore old class distribution figures of discrete attribute	 */
/*  values x and y from Slice1 and Slice2				 */
/*									 */
/*************************************************************************/


    Uncombine(x, y)
/*  ---------  */
    DiscrValue x, y;
{
    ClassNo c;

    ForEach(c, 0, MaxClass)
    {
	Freq[x][c] = Slice1[c];
	Freq[y][c] = Slice2[c];
    }

    ValFreq[x] = Slice1[MaxClass+1];
    ValFreq[y] = Slice2[MaxClass+1];
}



/*************************************************************************/
/*									 */
/*  Print the values of attribute Att which are in the subset Ss	 */
/*									 */
/*************************************************************************/


    PrintSubset(Att, Ss)
/*  -----------  */
    Attribute Att;
    Set Ss;
{
    DiscrValue V1;
    Boolean First=true;

    ForEach(V1, 1, MaxAttVal[Att])
    {
	if ( In(V1, Ss) )
	{
	    if ( First )
	    {
		First = false;
	    }
	    else
	    {
		printf(", ");
	    }

	    printf("%s", AttValName[Att][V1]);
	}
    }
}



/*************************************************************************/
/*									 */
/*  Construct and return a node for a test on a subset of values	 */
/*									 */
/*************************************************************************/


    SubsetTest(Node, Att)
/*  -----------  */
    Tree Node;
    Attribute Att;
{ 
    ItemCount CountItems();
    short S, Bytes;

    Sprout(Node, Subsets[Att]);

    Node->NodeType	= BrSubset;
    Node->Tested	= Att;
    Node->Errors	= 0;
    
    Bytes = (MaxAttVal[Att]>>3) + 1;
    Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set));
    ForEach(S, 1, Node->Forks)
    {
	Node->Subset[S] = (Set) malloc(Bytes);
	CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
    }
} 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -