⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 subset.cpp

📁 实现决策树分类训练试验。 源自c4.5
💻 CPP
字号:
/*************************************************************************/
/*									 */
/*      Evaluation of the subsetting of a discrete attribute		 */
/*      ----------------------------------------------------		 */
/*									 */
/*************************************************************************/
#include "stdafx.h"
#include <stdlib.h>
#include "MyBase.h"

/*  Stream that every diagnostic fprintf in this file writes to  */
extern FILE *fLog;

/*  Frequency tables defined elsewhere.  EvalSubset refreshes Freq and
    ValFreq by calling ComputeFrequencies and then compacts/merges their
    rows in place; Weight is declared but not referenced in this file  */
extern ItemCount
	*Weight,	/* Weight[i]  = current fraction of item i */
	**Freq,		/* Freq[x][c] = no. items of class c with outcome x */
	*ValFreq;	/* ValFreq[x] = no. items with outcome x; EvalSubset subtracts ValFreq[0] from the item total to get the known-value count */

/*  Per-attribute results defined elsewhere; this file writes Gain[] and
    Info[] and reads UnknownRate[]  */
extern float
	*Gain,		/* Gain[a] = info gain by split on att a */
	*Info,		/* Info[a] = potential info from split on att a */
	*Bar,		/* Bar[a]  = best threshold for contin att a */
	*UnknownRate;	/* UnknownRate[a] = current unknown rate for att a */

extern char
	*Tested;	/* Tested[a] = true if att a already tested */

/*  Scratch rows used by Combine/Uncombine.  Entries indexed by class hold
    the saved Freq figures; entry MaxClass+1 holds the saved ValFreq figure.
    NOTE(review): the storage is not allocated in this file -- confirm it
    has at least MaxClass+2 entries  */
ItemCount
	*Slice1,	/* Slice1[c]    = saved values of Freq[x][c] (x = first argument of Combine) */
	*Slice2;	/* Slice2[c]    = saved values of Freq[y][c] (y = second argument of Combine) */

/*  Subset[a][0] is used by EvalSubset to collect the values of a that have
    no items; Subset[a][1..] are the subsets actually formed  */
Set 	**Subset;	/* Subset[a][s] = subset s for att a */

short *Subsets;	/* Subsets[a] = no. subsets for att a */



/*************************************************************************/
/*                                                                       */
/*  Evaluate subsetting a discrete attribute and form the chosen         */
/*  subsets Subset[Att][], setting Subsets[Att] to the number of         */
/*  subsets, and the Info[] and Gain[] of a test on the attribute        */
/*                                                                       */
/*  Att     attribute to evaluate                                        */
/*  Fp, Lp  item range handed to ComputeFrequencies                      */
/*  Items   item total; ValFreq[0] is subtracted from it to obtain the   */
/*          count of items with a known value of Att                     */
/*                                                                       */
/*  Outcomes:                                                            */
/*   - no known values, or final worth <= 0: Gain[Att] = -Epsilon and    */
/*     Info[Att] = 0; Subsets[Att] is left untouched                     */
/*   - otherwise: Gain[Att], Info[Att] and Subsets[Att] are set; when    */
/*     some values of Att have no items, one extra subset holding those  */
/*     values is appended                                                */
/*                                                                       */
/*  The global MINOBJS is forced to 1 while the attribute is evaluated   */
/*  and is restored on every exit path.                                  */
/*                                                                       */
/*************************************************************************/
void EvalSubset(Attribute Att, ItemNo Fp, ItemNo Lp, ItemCount Items)
{
    DiscrValue V1, V2, BestV1, BestV2 = 0, Barred;
    ItemCount KnownItems;
    ClassNo c;
    float BaseInfo, MinGain, ThisGain, ThisInfo,
          Val, BestVal, BestGain, BestInfo = 0,
          PrevVal, PrevGain, PrevInfo;
    short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b;
    Boolean MergedSubsets = false;
    int SaveMINOBJS;

    /*  The caller's MINOBJS is still needed below to judge which blocks
        are "reasonable"; the global itself is lowered to 1 for the
        duration of the evaluation.
        NOTE(review): presumably this keeps the gain helpers from
        rejecting small blocks -- confirm against ComputeGain  */

    SaveMINOBJS = MINOBJS;
    MINOBJS = 1;

    /*  First compute Freq[][], ValFreq[], base info, and the gain
        and total info of a split on discrete attribute Att  */

    ComputeFrequencies(Att, Fp, Lp);

    KnownItems = Items - ValFreq[0];
    if ( KnownItems < Epsilon )
    {
        Verbosity(2) fprintf(fLog,"\tAtt %s: no known values\n", AttName[Att]);

        Gain[Att] = -Epsilon;
        Info[Att] = 0;

        /*  Undo the temporary override before leaving early; without
            this the global would stay at 1 for the rest of the run  */

        MINOBJS = SaveMINOBJS;
        return;
    }

    BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]);

    PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att],KnownItems);
    PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;
    PrevVal = Worth(PrevInfo, PrevGain, Epsilon);

    Verbosity(2)
    {
        fprintf(fLog,"\tAtt %s", AttName[Att]);

        Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true);

        fprintf(fLog,"\tinf %.3f, gain %.3f, val=%.3f\n",
                PrevInfo, PrevGain, PrevVal);
    }

    /*  Eliminate unrepresented attribute values from Freq[] and ValFreq[]
        and form a separate subset for each represented attribute value.
        Represented values are packed down into rows 1..Blocks; values
        without items are collected in Subset[Att][0]  */

    Bytes = (MaxAttVal[Att]>>3) + 1;
    ClearBits(Bytes, Subset[Att][0]);

    ForEach(V1, 1, MaxAttVal[Att])
    {
        if ( ValFreq[V1] > 0.5 )
        {
            if ( ++Blocks < V1 )
            {
                ValFreq[Blocks] = ValFreq[V1];
                ForEach(c, 0, MaxClass)
                {
                    Freq[Blocks][c] = Freq[V1][c];
                }
            }
            ClearBits(Bytes, Subset[Att][Blocks]);
            SetBit(V1, Subset[Att][Blocks]);
        }
        else
        {
            SetBit(V1, Subset[Att][0]);
            MissingValues++;
        }
    }

    /*  Merge any single-class subsets with others of the same class  */
    /*  Note: have ValFreq[V] > 0 for all V  */

    ForEach(V1, 1, Blocks-1)
    {
        /*  Find the first class with a noticeable count in this block.
            The scan is bounded so that a block whose items are spread
            thinly over every class cannot run off the end of Freq[V1]  */

        for ( c = 0 ; c <= MaxClass && Freq[V1][c] < 0.1 ; c++ )
            ;

        /*  Skip blocks that are not (almost) entirely of class c  */

        if ( c > MaxClass || Freq[V1][c] < ValFreq[V1] - 0.1 ) continue;

        /*  Now have a single class -- look for others  */

        for ( V2 = V1+1 ; V2 <= Blocks ; )
        {
            if ( Freq[V2][c] < ValFreq[V2] - 0.1 )
            {
                V2++;
            }
            else
            {
                /*  Merge these subsets.  Combine moves the figures of the
                    last block into row V2, so V2 is examined again
                    rather than advanced  */

                Combine(V1, V2, Blocks);

                ForEach(b, 0, Bytes-1)
                {
                    Subset[Att][V1][b] |= Subset[Att][V2][b];
                    Subset[Att][V2][b] = Subset[Att][Blocks][b];
                }

                Blocks--;
                MergedSubsets = true;
            }
        }
    }

    if ( MergedSubsets )
    {
        PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
        PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items;
        PrevVal = Worth(PrevInfo, PrevGain, Epsilon);

        Verbosity(2)
        {
            fprintf(fLog,"\tAfter merging single-class subsets:");

            Verbosity(3) PrintDistribution(Att, Blocks, false);

            fprintf(fLog,"\tinf %.3f, gain %.3f, val=%.3f\n",
                    PrevInfo, PrevGain, PrevVal);
        }
    }

    /*  Examine possible pair mergers and hill-climb  */

    MinGain = PrevGain / 2;

    while ( Blocks > 2 )
    {
        BestVal = BestV1 = 0;
        BestGain = -Epsilon;

        /*  Check reasonable subsets; if less than 3, bar mergers
            involving the largest block  */

        ReasonableSubsets = 0;
        Barred = 1;

        ForEach(V1, 1, Blocks)
        {
            if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++;

            if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1;
        }

        if ( ReasonableSubsets >= 3 ) Barred = 0;

        /*  For each possible pair of values, calculate the gain and
            total info of a split in which they are treated as one.
            Keep track of the pair with the best gain.  */

        ForEach(V1, 1, Blocks-1)
        {
            ForEach(V2, V1+1, Blocks)
            {
                if ( V1 == Barred || V2 == Barred ) continue;

                /*  Trial merge; undone by Uncombine at the end of the
                    iteration  */

                Combine(V1, V2, Blocks);

                ThisGain = ComputeGain(BaseInfo, UnknownRate[Att],
                                       Blocks-1, KnownItems);
                ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items;
                Val      = Worth(ThisInfo, ThisGain, Epsilon);

                Verbosity(4)
                {
                    fprintf(fLog,"\tcombine %d %d info %.3f gain %.3f val %.3f",
                            V1, V2, ThisInfo, ThisGain, Val);
                    PrintDistribution(Att, Blocks-1, false);
                }

                /*  Force a split if
                        less than two reasonable subsets, or
                        using GAIN criterion
                    Prefer this split to the previous one if
                        gain >= MinGain (and previous < MinGain), or
                        val >= previous best val  */

                if ( ( ThisGain >= MinGain && BestGain < MinGain ) ||
                     Val >= BestVal ||
                     ( ! BestV1 && ( ! GAINRATIO || ReasonableSubsets < 2 ) ) )
                {
                    BestVal  = Val;
                    BestGain = ThisGain;
                    BestInfo = ThisInfo;
                    BestV1   = V1;
                    BestV2   = V2;
                }

                Uncombine(V1, V2);
            }
        }

        /*  Under the gain ratio criterion with enough reasonable subsets,
            stop as soon as no merger improves on the current partition  */

        if ( GAINRATIO &&
             ReasonableSubsets >= 2 &&
             ( ! BestV1 ||
               BestVal < PrevVal + 1E-5 ||
               ( BestVal == PrevVal && BestGain < PrevGain ) ) ) break;

        PrevGain = BestGain;
        PrevInfo = BestInfo;
        PrevVal = BestVal;

        /*  Make the best merger permanent, in both the frequency tables
            and the subset bit sets  */

        Combine(BestV1, BestV2, Blocks);

        ForEach(b, 0, Bytes-1)
        {
            Subset[Att][BestV1][b] |= Subset[Att][BestV2][b];
            Subset[Att][BestV2][b] = Subset[Att][Blocks][b];
        }

        Blocks--;

        Verbosity(2)
        {
            fprintf(fLog,"\t\tform subset ");
            PrintSubset(Att, Subset[Att][BestV1]);
            fprintf(fLog,": %d subsets, inf %.3f, gain %.3f, val %.3f\n",
                    Blocks, BestInfo, BestGain, BestVal);
            Verbosity(3)
            {
                fprintf(fLog,"\t\tcombine %d, %d", BestV1, BestV2);
                PrintDistribution(Att, Blocks, false);
            }
        }
    }

    MINOBJS = SaveMINOBJS;

    if ( PrevVal <= 0 )
    {
        Gain[Att] = -Epsilon;
        Info[Att] = 0;
    }
    else
    {
        Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
        Info[Att] = PrevInfo;

        /*  Values of Att that had no items get a subset of their own,
            appended after the subsets formed above  */

        if ( MissingValues )
        {
            Blocks++;
            CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]);
        }

        Subsets[Att] = Blocks;

        Verbosity(2) fprintf(fLog,"\tFinal subsets:");
        Verbosity(3) PrintDistribution(Att, Blocks, false);
        Verbosity(2)
            fprintf(fLog,"\tinf %.3f gain %.3f val %.3f\n",
                    Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon));
    }
}



/*************************************************************************/
/*                                                                       */
/*  Fold the distribution figures of discrete attribute value y into     */
/*  those of value x.  On return Freq[x][] and ValFreq[x] hold the       */
/*  combined figures and slot y has been refilled with the figures of    */
/*  slot Last.  The figures x and y held on entry are kept in Slice1     */
/*  and Slice2 (per-class counts by class index, the ValFreq figure in   */
/*  entry MaxClass+1) so that Uncombine can restore them.                */
/*                                                                       */
/*************************************************************************/
void Combine(DiscrValue x,DiscrValue y, DiscrValue Last)
{
    ClassNo Cls;
    ItemCount *RowX = Freq[x], *RowY = Freq[y], *RowLast = Freq[Last];

    /*  Per-value totals: save, then merge y into x and pull Last into y  */

    Slice1[MaxClass+1] = ValFreq[x];
    Slice2[MaxClass+1] = ValFreq[y];

    ValFreq[x] += ValFreq[y];
    ValFreq[y]  = ValFreq[Last];

    /*  Per-class counts: the same three steps for every class  */

    ForEach(Cls, 0, MaxClass)
    {
        Slice1[Cls] = RowX[Cls];
        Slice2[Cls] = RowY[Cls];

        RowX[Cls] += RowY[Cls];
        RowY[Cls]  = RowLast[Cls];
    }
}



/*************************************************************************/
/*                                                                       */
/*  Undo the most recent Combine(x, y, ...): copy the class counts and   */
/*  the ValFreq figures saved in Slice1 and Slice2 back into slots x     */
/*  and y.  Only those two slots are written, which matches what         */
/*  Combine modifies.                                                    */
/*                                                                       */
/*************************************************************************/
void Uncombine(DiscrValue x, DiscrValue y)
{
    ClassNo Cls;
    ItemCount *RowX = Freq[x], *RowY = Freq[y];

    /*  Per-value totals live one entry past the per-class counts  */

    ValFreq[x] = Slice1[MaxClass+1];
    ValFreq[y] = Slice2[MaxClass+1];

    ForEach(Cls, 0, MaxClass)
    {
        RowX[Cls] = Slice1[Cls];
        RowY[Cls] = Slice2[Cls];
    }
}



/*************************************************************************/
/*                                                                       */
/*  Write to fLog the names of the values of attribute Att that are      */
/*  members of the set Ss, separated by ", ".  Nothing is written for    */
/*  an empty set, and no newline is appended.                            */
/*                                                                       */
/*************************************************************************/
void PrintSubset(Attribute Att,Set  Ss)
{
    DiscrValue Val;
    Boolean NeedSeparator = false;

    ForEach(Val, 1, MaxAttVal[Att])
    {
        if ( ! In(Val, Ss) ) continue;

        /*  Every member except the first is preceded by a separator  */

        if ( NeedSeparator ) fprintf(fLog,", ");
        NeedSeparator = true;

        fprintf(fLog,"%s", AttValName[Att][Val]);
    }
}



/*************************************************************************/
/*                                                                       */
/*  Turn Node into a test on subsets of the values of attribute Att.     */
/*  Nothing is returned; Node is filled in place:                        */
/*                                                                       */
/*   - Sprout(Node, Subsets[Att]) is called first                        */
/*     NOTE(review): presumably this creates the branches and sets       */
/*     Node->Forks -- confirm against Sprout                             */
/*   - NodeType, Tested and Errors are set                               */
/*   - Node->Subset receives Subsets[Att]+1 zeroed entries; entries      */
/*     1..Node->Forks get freshly allocated copies of Subset[Att][S],    */
/*     entry 0 is left null                                              */
/*                                                                       */
/*  The process exits with status 1 if memory cannot be obtained,        */
/*  because a void function has no way to report the failure.            */
/*                                                                       */
/*************************************************************************/
void  SubsetTest(Tree Node, Attribute Att)
{
    short S, Bytes;

    Sprout(Node, Subsets[Att]);

    Node->NodeType	= BrSubset;
    Node->Tested	= Att;
    Node->Errors	= 0;

    /*  One bit per attribute value, rounded up to whole bytes; the same
        size EvalSubset uses for the sets being copied  */

    Bytes = (MaxAttVal[Att]>>3) + 1;

    Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set));
    if ( ! Node->Subset )
    {
        fprintf(stderr, "SubsetTest: out of memory\n");
        exit(1);
    }

    ForEach(S, 1, Node->Forks)
    {
        Node->Subset[S] = (Set) malloc(Bytes);
        if ( ! Node->Subset[S] )
        {
            fprintf(stderr, "SubsetTest: out of memory\n");
            exit(1);
        }

        CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -