⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 c45dt.cpp

📁 这是一个改进的C4.5决策数算法C++类
💻 CPP
📖 第 1 页 / 共 5 页
字号:

    fclose(Df);
    MaxItem = i - 1;
}

/*************************************************************************/
/*									 */
/*  Read a raw case description from file Df.				 */
/*									 */
/*  For each attribute, read the attribute value from the file.		 */
/*  If it is a discrete valued attribute, find the associated no.	 */
/*  of this attribute value (if the value is unknown this is 0).	 */
/*									 */
/*  Returns the Description of the case (i.e. the array of		 */
/*  attribute values).							 */
/*									 */
/*************************************************************************/


Description C45DT::GetDescription(FILE *Df)
{
    Attribute Att;
    char name[500], *endname;

    int Dv;
    float Cv;
    Description Dvec;

	/*逐个属性提取属性值*/
    if ( ReadName(Df, name) )
    {/*根据属性个数分配存储每个属性对应值的数组*/
		Dvec = (Description) calloc(MaxAtt+2, sizeof(AttValue));

		ForEach(Att, 0, MaxAtt)
		{
			/*首先检测属性状态可用否*/
			if ( SpecialStatus[Att] == IGNORE )
			{
				/*  Skip this value  */
				DVal(Dvec, Att) = 0;/*令离散属性值=0*/
			}
			else if ( MaxAttVal[Att] || SpecialStatus[Att] == DISCRETE )
			{
			/*  如果是Discrete value,则 MaxAttVal[Att]必>0 */ 

				if ( ! ( strcmp(name, "?") ) )/*是?*/
				{
					Dv = 0;/*令离散属性值=0*/
				}
				else/*如果两位以上*/
				{	/*在每个属性的值域中查找当前值的位置编号*/

					Dv = Which(name, AttValName[Att], 1, MaxAttVal[Att]);
					if ( ! Dv )/*如果是0,说明读出的属性值不在定义的列表中*/
					{
						if ( SpecialStatus[Att] == DISCRETE )
						{
							/*  Add value to list  */
							/*如果属性值标志为离散值,将它追加入此属性的值列表中*/

							Dv = ++MaxAttVal[Att];
							if ( Dv > (int) AttValName[Att][0] )
							{
							printf("\nToo many values for %s (max %d)\n",
								AttName[Att], (int) AttValName[Att][0]);
							exit(1);
							}

							AttValName[Att][Dv] = CopyString(name);
						}
						else
						{
							Quinlan_Error(4, AttName[Att], name);
						}
					}
				}
				DVal(Dvec, Att) = Dv;/*从数据文件获得离散属性值编号信息*/
			}
			else
			{
			/*  Continuous value  */

				if ( ! ( strcmp(name, "?") ) )
					{
						Cv = Unknown;
					}
				else
					{
						Cv = (float)strtod(name, &endname);/*Convert C45_Strings to a double-precision value.*/
						if ( endname == name || *endname != '\0' )/*若endname == name相同表明字符串非数字*/
							Quinlan_Error(4, AttName[Att], name);
					}
				CVal(Dvec, Att) = Cv;/*从数据文件获得连续属性值信息*/
			}

			ReadName(Df, name);
		}/*end foreach*/
		/*最后读出的数据是案例属于哪一类*/
		/*在类中找出当前案例所属类的编号*/
		if ( (Dv = Which(name, ClassName, 0, MaxClass)) < 0 )
		{
			Quinlan_Error(5, "", name);
			Dv = 0;
		}

		Class(Dvec) = Dv;/*将得到的类信息存入Item数组的最后一项,The Class Index=0 means that this item has some errors.*/
		return Dvec;
	}
	else
	{
		return Nil;
	}
}

/*************************************************************************/
/*								  	 */
/*	Routine for printing confusion matrices				 */
/*	---------------------------------------				 */
/*	comfmat.c							  	 */
/*************************************************************************/

void C45DT::PrintConfusionMatrix(ItemNo *ConfusionMat)
{
    short Row, Col;

    if ( MaxClass > 20 ) return;  /* Don't print nonsensical matrices */

    /*  Print the heading, then each row  */

    printf("\n\n\t");
    ForEach(Col, 0, MaxClass)
    {
	printf("  (%c)", 'a' + Col);
    }

    printf("\t<-classified as\n\t");
    ForEach(Col, 0, MaxClass)
    {
	printf(" ----");
    }
    printf("\n");

    ForEach(Row, 0, MaxClass)
    {
	printf("\t");
	ForEach(Col, 0, MaxClass)
	{
	    if ( ConfusionMat[Row*(MaxClass+1) + Col] )
	    {
		printf("%5d", ConfusionMat[Row*(MaxClass+1) + Col]);
	    }
	    else
	    {
		printf("     ");
	    }
	}
	printf("\t(%c): class %s\n", 'a' + Row, ClassName[Row]);
    }
    printf("\n");
}

void C45DT::PrintConfusionMatrix(ItemNo *ConfusionMat,FILE* fp)
{
    short Row, Col;

    if ( MaxClass > 20 ) return;  /* Don't print nonsensical matrices */

    /*  Print the heading, then each row  */

    fprintf(fp,"\n\n\t");
    ForEach(Col, 0, MaxClass)
    {
		fprintf(fp,"  (%c)", 'a' + Col);
    }

    fprintf(fp,"\t<-classified as\n\t");
    ForEach(Col, 0, MaxClass)
    {
		fprintf(fp," ----");
    }
    fprintf(fp,"\n");

    ForEach(Row, 0, MaxClass)
    {
		fprintf(fp,"\t");
		ForEach(Col, 0, MaxClass)
		{
			if ( ConfusionMat[Row*(MaxClass+1) + Col] )
			{
				fprintf(fp,"%5d", ConfusionMat[Row*(MaxClass+1) + Col]);
			}
			else
			{
				fprintf(fp,"     ");
			}
		}
		fprintf(fp,"\t(%c): class %s\n", 'a' + Row, ClassName[Row]);
    }
    fprintf(fp,"\n");
}

void C45DT::PrintConfusionMatrixConclusion(ItemNo *ConfusionMat,char *strTestConclusion)
{
    short Row, Col;
	char buffer[100];

    if ( MaxClass > 20 ) return;  /* Don't print nonsensical matrices */

    /*  Print the heading, then each row  */

    
    ForEach(Col, 0, MaxClass)
    {
		sprintf(buffer,"  (%c)", 'a' + Col);
		strcat(strTestConclusion,buffer);
    }

    strcat(strTestConclusion,"\t<-决策树分类为;");
    ForEach(Col, 0, MaxClass)
    {
		strcat(strTestConclusion," ------");
    }
    strcat(strTestConclusion,";");

    ForEach(Row, 0, MaxClass)
    {
		strcat(strTestConclusion,"\t");
		ForEach(Col, 0, MaxClass)
		{
			if ( ConfusionMat[Row*(MaxClass+1) + Col] )
			{
				sprintf(buffer,"%5d", ConfusionMat[Row*(MaxClass+1) + Col]);
				strcat(strTestConclusion,buffer);
			}
			else
			{
				strcat(strTestConclusion,"     ");
			}
		}
		sprintf(buffer,"\t(%c): class %s\n", 'a' + Row, ClassName[Row]);
		strcat(strTestConclusion,buffer);
    }
    strcat(strTestConclusion,";");
}

/*************************************************************************/
/*                                                              	 */
/*  Determine the class of a case description from a decision tree	 */
/*  --------------------------------------------------------------	 */
/*  classify.c                                                            	 */
/*************************************************************************/
/*************************************************************************/
/*                                                              	 */
/*  Classify a case description using the given subtree by adjusting	 */
/*  the value ClassSum for each class					 */
/*                                                              	 */
/*************************************************************************/

void C45DT::Classify(Description CaseDesc,Tree T,float Weight)
{
    DiscrValue v, dv;
    float Cv;
    Attribute a;
    ClassNo c;

    switch ( T->NodeType )
    {
        case 0:  /* leaf */

	    if ( T->Items > 0 )
	    {
		/*  Update from ALL classes  */

		ForEach(c, 0, MaxClass)
		{
		    if ( T->ClassDist[c] )
		    {
			ClassSum[c] += Weight * T->ClassDist[c] / T->Items;
		    }
		}
	    }
	    else
	    {
		ClassSum[T->Leaf] += Weight;
	    }

	    return;

	case BrDiscr:  /* test of discrete attribute */

	    a = T->Tested;
	    v = DVal(CaseDesc, a);

	    if ( v && v <= T->Forks )	/*  Make sure not new discrete value  */
	    {
			Classify(CaseDesc, T->Branch[v], Weight);
	    }
	    else
	    {
			ForEach(v, 1, T->Forks)
			{
				Classify(CaseDesc, T->Branch[v], 
					 (Weight * T->Branch[v]->Items) / T->Items);
			}
	    }

	    return;

	case ThreshContin:  /* test of continuous attribute */

	    a = T->Tested;
	    Cv = CVal(CaseDesc, a);

	    if ( Cv == Unknown )
	    {
		ForEach(v, 1, 2)
		{
		    Classify(CaseDesc, T->Branch[v], 
			     (Weight * T->Branch[v]->Items) / T->Items);
		}
	    }
	    else
	    {
		v = ( Cv <= T->Cut ? 1 : 2 );
		Classify(CaseDesc, T->Branch[v], Weight);
	    }

	    return;

	case BrSubset:  /* subset test on discrete attribute  */

	    a = T->Tested;
	    dv = DVal(CaseDesc, a);

	    if ( dv )
	    {
		ForEach(v, 1, T->Forks)
		{
		    if ( In(dv, T->Subset[v]) )
		    {
			Classify(CaseDesc, T->Branch[v], Weight);

			return;
		    }
		}
	    }

	    /*  Value unknown or not found in any of the subsets  */

	    ForEach(v, 1, T->Forks)
	    {
		Classify(CaseDesc, T->Branch[v], 
		         (Weight * T->Branch[v]->Items) / T->Items);
	    }

	    return;
    } 
}


/*************************************************************************/
/*                                                              	 */
/*  Categorize a case description using the given decision tree		 */
/*                                                              	 */
/*************************************************************************/

ClassNo C45DT::Category(Description CaseDesc,Tree DecisionTree)
{ 
    ClassNo c, BestClass;

    if ( ! ClassSum )
    {
		ClassSum = (float *) malloc((MaxClass+1) * sizeof(float));
    }

    ForEach(c, 0, MaxClass)
    {
		ClassSum[c] = 0;
    }

    Classify(CaseDesc, DecisionTree, 1.0);

    BestClass = 0;
    ForEach(c, 0, MaxClass)
    {
	Verbosity(5) printf("class %s weight %.2f\n", ClassName[c], ClassSum[c]);

	if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c;
    }

    return BestClass;
}

/*************************************************************************/
/*									 */
/*  Statistical routines for C4.5					 */
/*  -----------------------------					 */
/*	stats.c								 */
/*************************************************************************/
								
/*************************************************************************/
/*									 */
/*  Compute the additional errors if the error rate increases to the	 */
/*  upper limit of the confidence level.  The coefficient is the	 */
/*  square of the number of standard deviations corresponding to the	 */
/*  selected confidence level.  (Taken from Documenta Geigy Scientific	 */
/*  Tables (Sixth Edition), p185 (with modifications).)			 */
/*									 */
/*************************************************************************/

float C45DT::AddErrs(ItemCount N,ItemCount e)
{
    //static float Coeff=0;
	float Coeff=0;
    float Val0, Pr;

	float Val[] = {  0,  0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
	Dev[] = {4.0,  3.09,  2.58,  2.33, 1.65, 1.28, 0.84, 0.25, 0.00};
	
    if ( ! Coeff )
    {
	/*  Compute and retain the coefficient value, interpolating from
	    the values in Val and Dev  */

	int i;

	i = 0;
	while ( CF > Val[i] ) i++;

	Coeff = Dev[i-1] +
		  (Dev[i] - Dev[i-1]) * (CF - Val[i-1]) /(Val[i] - Val[i-1]);
	Coeff = Coeff * Coeff;
    }

    if ( e < 1E-6 )
    {
		return (float)(N * (1 - exp(log(CF) / N)));
    }
    else
    if ( e < 0.9999 )
    {
		Val0 = (float)(N * (1 - exp(log(CF) / N)));
		return Val0 + e * (AddErrs(N, 1.0) - Val0);
    }
    else
    if ( e + 0.5 >= N )
    {
		return (float)(0.67 * (N - e));
    }
    else
    {
		Pr = (float)(e + 0.5 + Coeff/2
				+ sqrt(Coeff * ((e + 0.5) * (1 - (e + 0.5)/N) + Coeff/4)) )
				 / (N + Coeff);
		return (N * Pr - e);
    }
}

/*************************************************************************/
/*									 */
/*	Calculate information, information gain, and print dists	 */
/*	--------------------------------------------------------	 */
/*	info.c								 */
/*************************************************************************/
/*************************************************************************/
/*									 */
/*  Determine the worth of a particular split according to the		 */
/*  operative criterion							 */
/*									 */
/*	    Parameters:							 */
/*		SplitInfo:	potential info of the split		 */
/*		SplitGain:	gain in info of the split		 */
/*		MinGain:	gain above which the Gain Ratio		 */
/*				may be used				 */
/*									 */
/*  If the Gain criterion is being used, the information gain of	 */
/*  the split is returned, but if the Gain Ratio criterion is		 */
/*  being used, the ratio of the information gain of the split to	 */
/*  its potential information is returned.				 */
/*									 */
/*************************************************************************/

float C45DT::Worth(float ThisInfo,float ThisGain,float MinGain)
{
    if ( GAINRATIO )/*=DTTrue*/
    {
		if ( ThisGain >= MinGain - Epsilon && ThisInfo > Epsilon )
		{
			return ThisGain / ThisInfo;
		}
		else
		{
			return (float)-Epsilon;
		}
    }
    else
    {
		return ( ThisInfo > 0 && ThisGain > -Epsilon ? ThisGain : (float)-Epsilon );
    }
}



/*************************************************************************/
/*									 */
/*  Zero the frequency tables Freq[][] and ValFreq[] up to MaxVal	 */
/*									 */
/*************************************************************************/


void    C45DT::ResetFreq(DiscrValue MaxVal)
{
    DiscrValue v;
    ClassNo c;

    ForEach(v, 0, MaxVal)
    { 
		ForEach(c, 0, MaxClass)
		{
			Freq[v][c] = 0;
		}
		ValFreq[v] = 0;
    } 
}



⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -