⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 subset.c

📁 决策树是用二叉树形图来表示处理逻辑的一种工具。可以直观、清晰地表达加工的逻辑要求。特别适合于判断因素比较少、逻辑组合关系不复杂的情况。
💻 C
字号:
/*************************************************************************/
/*                                                                       */
/*      Evaluation of the subsetting of a discrete attribute             */
/*      ----------------------------------------------------             */
/*                                                                       */
/*************************************************************************/


#include "buildex.i"


ItemCount
    *Slice1,    /* Slice1[c]    = saved values of Freq[x][c] in subset.c */
    *Slice2;    /* Slice2[c]    = saved values of Freq[y][c] */

Set
    **Subset;   /* Subset[a][s] = subset s for att a */

short
    *Subsets;   /* Subsets[a] = no. subsets for att a */



/*************************************************************************/
/*                                                                       */
/*  Evaluate subsetting a discrete attribute and form the chosen         */
/*  subsets Subset[Att][], setting Subsets[Att] to the number of         */
/*  subsets, and the Info[] and Gain[] of a test on the attribute        */
/*                                                                       */
/*  Att     attribute to evaluate                                        */
/*  Fp, Lp  passed straight to ComputeFrequencies() (presumably the      */
/*          first and last item of the range to examine -- confirm       */
/*          against ComputeFrequencies)                                  */
/*  Items   total item count; ValFreq[0] is subtracted from it to get    */
/*          the count of items with a known value, and it is the         */
/*          divisor for every total-info figure                          */
/*                                                                       */
/*  The procedure:                                                       */
/*    1. gives each represented attribute value its own block and        */
/*       gathers unrepresented values in Subset[Att][0];                 */
/*    2. merges blocks that contain a single, identical class;           */
/*    3. hill-climbs, repeatedly merging the best pair of blocks         */
/*       until only two remain or (under GAINRATIO with at least two     */
/*       reasonable subsets) no merger improves on the previous value.   */
/*                                                                       */
/*  MINOBJS is forced to 1 for the duration of the call and restored     */
/*  on every exit path.  If the attribute has no known values, or the    */
/*  final worth is not positive, Gain[Att] is set to -Epsilon and        */
/*  Info[Att] to 0, and Subsets[Att] is left untouched.                  */
/*                                                                       */
/*************************************************************************/


    EvalSubset(Att, Fp, Lp, Items)
/*  ----------  */
    Attribute Att;
    ItemNo Fp, Lp;
    ItemCount Items;
{
    DiscrValue V1, V2, BestV1, BestV2, Barred;
    ItemCount KnownItems;
    ClassNo c;
    float BaseInfo, MinGain, ThisGain, ThisInfo,
          Val, BestVal, BestGain, BestInfo,
          PrevVal, PrevGain, PrevInfo,
          DiscrKnownBaseInfo(), Worth(), ComputeGain(), TotalInfo();
    short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b;
    Boolean MergedSubsets = false;
    int SaveMINOBJS;

    /*  NOTE(review): MINOBJS is lowered to 1 while candidate splits are
        evaluated, presumably so that ComputeGain() does not reject
        splits with small blocks -- confirm against ComputeGain.  The
        caller's value is kept in SaveMINOBJS and is still used below to
        decide which blocks count as "reasonable".  */

    SaveMINOBJS = MINOBJS;
    MINOBJS = 1;

    /*  First compute Freq[][], ValFreq[], base info, and the gain
        and total info of a split on discrete attribute Att  */

    ComputeFrequencies(Att, Fp, Lp);

    KnownItems = Items - ValFreq[0];
    if ( KnownItems < Epsilon )
    {
        Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]);

        Gain[Att] = -Epsilon;
        Info[Att] = 0;

        /*  Restore the global before leaving; without this the early
            exit would leave MINOBJS stuck at 1 for the rest of the run  */

        MINOBJS = SaveMINOBJS;
        return;
    }

    BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]);

    PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att],
                           KnownItems);
    PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;
    PrevVal = Worth(PrevInfo, PrevGain, Epsilon);

    Verbosity(2)
    {
        printf("\tAtt %s", AttName[Att]);

        Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true);

        printf("\tinf %.3f, gain %.3f, val=%.3f\n",
               PrevInfo, PrevGain, PrevVal);
    }

    /*  Eliminate unrepresented attribute values from Freq[] and ValFreq[]
        and form a separate subset for each represented attribute value  */

    Bytes = (MaxAttVal[Att]>>3) + 1;
    ClearBits(Bytes, Subset[Att][0]);

    ForEach(V1, 1, MaxAttVal[Att])
    {
        if ( ValFreq[V1] > 0.5 )
        {
            /*  Compact the frequency tables: represented value V1
                becomes block number Blocks (<= V1)  */

            if ( ++Blocks < V1 )
            {
                ValFreq[Blocks] = ValFreq[V1];
                ForEach(c, 0, MaxClass)
                {
                    Freq[Blocks][c] = Freq[V1][c];
                }
            }

            ClearBits(Bytes, Subset[Att][Blocks]);
            SetBit(V1, Subset[Att][Blocks]);
        }
        else
        {
            /*  Unrepresented values are collected in subset 0  */

            SetBit(V1, Subset[Att][0]);
            MissingValues++;
        }
    }

    /*  Merge any single-class subsets with others of the same class  */
    /*  Note: have ValFreq[V] > 0 for all V  */

    ForEach(V1, 1, Blocks-1)
    {
        /*  Find the first class present in block V1; the note above
            guarantees that the scan terminates  */

        for ( c = 0 ; Freq[V1][c] < 0.1 ; c++ )
            ;

        if ( Freq[V1][c] < ValFreq[V1] - 0.1 ) continue;

        /*  Now have a single class -- look for others  */

        for ( V2 = V1+1 ; V2 <= Blocks ; )
        {
            if ( Freq[V2][c] < ValFreq[V2] - 0.1 )
            {
                V2++;
            }
            else
            {
                /*  Merge these subsets.  The last block is moved into
                    slot V2, so V2 is not advanced: the block now in
                    that slot must be examined too  */

                Combine(V1, V2, Blocks);

                ForEach(b, 0, Bytes-1)
                {
                    Subset[Att][V1][b] |= Subset[Att][V2][b];
                    Subset[Att][V2][b] = Subset[Att][Blocks][b];
                }

                Blocks--;
                MergedSubsets = true;
            }
        }
    }

    if ( MergedSubsets )
    {
        PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks,
                               KnownItems);
        PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items;
        PrevVal = Worth(PrevInfo, PrevGain, Epsilon);

        Verbosity(2)
        {
            printf("\tAfter merging single-class subsets:");

            Verbosity(3) PrintDistribution(Att, Blocks, false);

            printf("\tinf %.3f, gain %.3f, val=%.3f\n",
                   PrevInfo, PrevGain, PrevVal);
        }
    }

    /*  Examine possible pair mergers and hill-climb  */

    MinGain = PrevGain / 2;

    while ( Blocks > 2 )
    {
        BestVal = BestV1 = 0;
        BestGain = -Epsilon;

        /*  Check reasonable subsets; if less than 3, bar mergers
            involving the largest block  */

        ReasonableSubsets = 0;
        Barred = 1;

        ForEach(V1, 1, Blocks)
        {
            if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++;

            if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1;
        }

        if ( ReasonableSubsets >= 3 ) Barred = 0;

        /*  For each possible pair of values, calculate the gain and
            total info of a split in which they are treated as one.
            Keep track of the pair with the best gain.  */

        ForEach(V1, 1, Blocks-1)
        {
            ForEach(V2, V1+1, Blocks)
            {
                if ( V1 == Barred || V2 == Barred ) continue;

                /*  Trial merger; undone by Uncombine() below  */

                Combine(V1, V2, Blocks);

                ThisGain = ComputeGain(BaseInfo, UnknownRate[Att],
                                       Blocks-1, KnownItems);
                ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items;
                Val      = Worth(ThisInfo, ThisGain, Epsilon);

                Verbosity(4)
                {
                    printf("\tcombine %d %d info %.3f gain %.3f val %.3f",
                           V1, V2, ThisInfo, ThisGain, Val);
                    PrintDistribution(Att, Blocks-1, false);
                }

                /*  Force a split if
                        less than two reasonable subsets, or
                        using GAIN criterion
                    Prefer this split to the previous one if
                        gain >= MinGain (and previous < MinGain), or
                        val >= previous best val  */

                if ( ( ThisGain >= MinGain && BestGain < MinGain ) ||
                     Val >= BestVal ||
                     ( ! BestV1 &&
                       ( ! GAINRATIO || ReasonableSubsets < 2 ) ) )
                {
                    BestVal  = Val;
                    BestGain = ThisGain;
                    BestInfo = ThisInfo;
                    BestV1   = V1;
                    BestV2   = V2;
                }

                Uncombine(V1, V2);
            }
        }

        /*  Under the gain ratio criterion with at least two reasonable
            subsets, stop when no pair was chosen or the best merger
            does not beat the current value  */

        if ( GAINRATIO &&
             ReasonableSubsets >= 2 &&
             ( ! BestV1 ||
               BestVal < PrevVal + 1E-5 ||
               ( BestVal == PrevVal && BestGain < PrevGain ) ) ) break;

        PrevGain = BestGain;
        PrevInfo = BestInfo;
        PrevVal = BestVal;

        /*  Make the best merger permanent  */

        Combine(BestV1, BestV2, Blocks);

        ForEach(b, 0, Bytes-1)
        {
            Subset[Att][BestV1][b] |= Subset[Att][BestV2][b];
            Subset[Att][BestV2][b] = Subset[Att][Blocks][b];
        }

        Blocks--;

        Verbosity(2)
        {
            printf("\t\tform subset ");
            PrintSubset(Att, Subset[Att][BestV1]);
            printf(": %d subsets, inf %.3f, gain %.3f, val %.3f\n",
                   Blocks, BestInfo, BestGain, BestVal);
            Verbosity(3)
            {
                printf("\t\tcombine %d, %d", BestV1, BestV2);
                PrintDistribution(Att, Blocks, false);
            }
        }
    }

    MINOBJS = SaveMINOBJS;

    if ( PrevVal <= 0 )
    {
        Gain[Att] = -Epsilon;
        Info[Att] = 0;
    }
    else
    {
        /*  The gain is recomputed with the caller's MINOBJS restored  */

        Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks,
                                KnownItems);
        Info[Att] = PrevInfo;

        /*  Unrepresented values, if any, become one extra final subset  */

        if ( MissingValues )
        {
            Blocks++;
            CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]);
        }

        Subsets[Att] = Blocks;

        Verbosity(2) printf("\tFinal subsets:");
        Verbosity(3) PrintDistribution(Att, Blocks, false);
        Verbosity(2)
            printf("\tinf %.3f gain %.3f val %.3f\n",
                   Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon));
    }
}



/*************************************************************************/
/*                                                                       */
/*  Combine the distribution figures of discrete attribute values        */
/*  x and y, putting the combined figures in Freq[x][] and               */
/*  ValFreq[x], and saving old values in Slice1 and Slice2               */
/*                                                                       */
/*  The figures of block Last are copied into slot y, so that after      */
/*  a permanent merger the caller can simply drop the last block.        */
/*  The old ValFreq[x] and ValFreq[y] are kept in element MaxClass+1     */
/*  of Slice1 and Slice2, so both arrays must hold at least              */
/*  MaxClass+2 elements.  Only one merger can be pending at a time:      */
/*  each call overwrites the saved values.                               */
/*                                                                       */
/*************************************************************************/


    Combine(x, y, Last)
/*  -------  */
    DiscrValue x, y, Last;
{
    ClassNo c;

    ForEach(c, 0, MaxClass)
    {
        Slice1[c] = Freq[x][c];
        Slice2[c] = Freq[y][c];

        Freq[x][c] += Freq[y][c];
        Freq[y][c]  = Freq[Last][c];
    }

    Slice1[MaxClass+1] = ValFreq[x];
    Slice2[MaxClass+1] = ValFreq[y];

    ValFreq[x] += ValFreq[y];
    ValFreq[y]  = ValFreq[Last];
}



/*************************************************************************/
/*                                                                       */
/*  Restore old class distribution figures of discrete attribute         */
/*  values x and y from Slice1 and Slice2                                */
/*                                                                       */
/*  Must be called with the same x and y as the most recent              */
/*  Combine(), since Slice1 and Slice2 hold only that call's values.     */
/*                                                                       */
/*************************************************************************/


    Uncombine(x, y)
/*  ---------  */
    DiscrValue x, y;
{
    ClassNo c;

    ForEach(c, 0, MaxClass)
    {
        Freq[x][c] = Slice1[c];
        Freq[y][c] = Slice2[c];
    }

    ValFreq[x] = Slice1[MaxClass+1];
    ValFreq[y] = Slice2[MaxClass+1];
}



/*************************************************************************/
/*                                                                       */
/*  Print the values of attribute Att which are in the subset Ss         */
/*                                                                       */
/*  The value names are written to standard output separated by          */
/*  ", ", with no trailing newline.                                      */
/*                                                                       */
/*************************************************************************/


    PrintSubset(Att, Ss)
/*  -----------  */
    Attribute Att;
    Set Ss;
{
    DiscrValue V1;
    Boolean First=true;

    ForEach(V1, 1, MaxAttVal[Att])
    {
        if ( In(V1, Ss) )
        {
            if ( First )
            {
                First = false;
            }
            else
            {
                printf(", ");
            }

            printf("%s", AttValName[Att][V1]);
        }
    }
}



/*************************************************************************/
/*                                                                       */
/*  Construct and return a node for a test on a subset of values         */
/*                                                                       */
/*  Node is filled in as a BrSubset test on attribute Att with           */
/*  Subsets[Att] branches, and receives its own copies of the subsets    */
/*  Subset[Att][1..Forks].  Node->Subset has Subsets[Att]+1 entries;     */
/*  entry 0 is left as zeroed by calloc().                               */
/*                                                                       */
/*  NOTE(review): the results of calloc() and malloc() are not           */
/*  checked, so an allocation failure leads to a null dereference.       */
/*                                                                       */
/*************************************************************************/


    SubsetTest(Node, Att)
/*  -----------  */
    Tree Node;
    Attribute Att;
{
    short S, Bytes;

    /*  Presumably Sprout() sets Node->Forks to Subsets[Att]; the loop
        below relies on Node->Forks <= Subsets[Att]  */

    Sprout(Node, Subsets[Att]);

    Node->NodeType      = BrSubset;
    Node->Tested        = Att;
    Node->Errors        = 0;

    Bytes = (MaxAttVal[Att]>>3) + 1;
    Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set));
    ForEach(S, 1, Node->Forks)
    {
        Node->Subset[S] = (Set) malloc(Bytes);
        CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -