📄 c45dt.cpp
字号:
fclose(Df);
MaxItem = i - 1;
}
/*************************************************************************/
/* */
/* Read a raw case description from file Df. */
/* */
/* For each attribute, read the attribute value from the file. */
/* If it is a discrete valued attribute, find the associated no. */
/* of this attribute value (if the value is unknown this is 0). */
/* */
/* Returns the Description of the case (i.e. the array of */
/* attribute values). */
/* */
/*************************************************************************/
Description C45DT::GetDescription(FILE *Df)
{
Attribute Att;
char name[500], *endname;
int Dv;
float Cv;
Description Dvec;
/*逐个属性提取属性值*/
if ( ReadName(Df, name) )
{/*根据属性个数分配存储每个属性对应值的数组*/
Dvec = (Description) calloc(MaxAtt+2, sizeof(AttValue));
ForEach(Att, 0, MaxAtt)
{
/*首先检测属性状态可用否*/
if ( SpecialStatus[Att] == IGNORE )
{
/* Skip this value */
DVal(Dvec, Att) = 0;/*令离散属性值=0*/
}
else if ( MaxAttVal[Att] || SpecialStatus[Att] == DISCRETE )
{
/* 如果是Discrete value,则 MaxAttVal[Att]必>0 */
if ( ! ( strcmp(name, "?") ) )/*是?*/
{
Dv = 0;/*令离散属性值=0*/
}
else/*如果两位以上*/
{ /*在每个属性的值域中查找当前值的位置编号*/
Dv = Which(name, AttValName[Att], 1, MaxAttVal[Att]);
if ( ! Dv )/*如果是0,说明读出的属性值不在定义的列表中*/
{
if ( SpecialStatus[Att] == DISCRETE )
{
/* Add value to list */
/*如果属性值标志为离散值,将它追加入此属性的值列表中*/
Dv = ++MaxAttVal[Att];
if ( Dv > (int) AttValName[Att][0] )
{
printf("\nToo many values for %s (max %d)\n",
AttName[Att], (int) AttValName[Att][0]);
exit(1);
}
AttValName[Att][Dv] = CopyString(name);
}
else
{
Quinlan_Error(4, AttName[Att], name);
}
}
}
DVal(Dvec, Att) = Dv;/*从数据文件获得离散属性值编号信息*/
}
else
{
/* Continuous value */
if ( ! ( strcmp(name, "?") ) )
{
Cv = Unknown;
}
else
{
Cv = (float)strtod(name, &endname);/*Convert C45_Strings to a double-precision value.*/
if ( endname == name || *endname != '\0' )/*若endname == name相同表明字符串非数字*/
Quinlan_Error(4, AttName[Att], name);
}
CVal(Dvec, Att) = Cv;/*从数据文件获得连续属性值信息*/
}
ReadName(Df, name);
}/*end foreach*/
/*最后读出的数据是案例属于哪一类*/
/*在类中找出当前案例所属类的编号*/
if ( (Dv = Which(name, ClassName, 0, MaxClass)) < 0 )
{
Quinlan_Error(5, "", name);
Dv = 0;
}
Class(Dvec) = Dv;/*将得到的类信息存入Item数组的最后一项,The Class Index=0 means that this item has some errors.*/
return Dvec;
}
else
{
return Nil;
}
}
/*************************************************************************/
/* */
/* Routine for printing confusion matrices */
/* --------------------------------------- */
/* comfmat.c */
/*************************************************************************/
void C45DT::PrintConfusionMatrix(ItemNo *ConfusionMat)
{
short Row, Col;
if ( MaxClass > 20 ) return; /* Don't print nonsensical matrices */
/* Print the heading, then each row */
printf("\n\n\t");
ForEach(Col, 0, MaxClass)
{
printf(" (%c)", 'a' + Col);
}
printf("\t<-classified as\n\t");
ForEach(Col, 0, MaxClass)
{
printf(" ----");
}
printf("\n");
ForEach(Row, 0, MaxClass)
{
printf("\t");
ForEach(Col, 0, MaxClass)
{
if ( ConfusionMat[Row*(MaxClass+1) + Col] )
{
printf("%5d", ConfusionMat[Row*(MaxClass+1) + Col]);
}
else
{
printf(" ");
}
}
printf("\t(%c): class %s\n", 'a' + Row, ClassName[Row]);
}
printf("\n");
}
void C45DT::PrintConfusionMatrix(ItemNo *ConfusionMat,FILE* fp)
{
short Row, Col;
if ( MaxClass > 20 ) return; /* Don't print nonsensical matrices */
/* Print the heading, then each row */
fprintf(fp,"\n\n\t");
ForEach(Col, 0, MaxClass)
{
fprintf(fp," (%c)", 'a' + Col);
}
fprintf(fp,"\t<-classified as\n\t");
ForEach(Col, 0, MaxClass)
{
fprintf(fp," ----");
}
fprintf(fp,"\n");
ForEach(Row, 0, MaxClass)
{
fprintf(fp,"\t");
ForEach(Col, 0, MaxClass)
{
if ( ConfusionMat[Row*(MaxClass+1) + Col] )
{
fprintf(fp,"%5d", ConfusionMat[Row*(MaxClass+1) + Col]);
}
else
{
fprintf(fp," ");
}
}
fprintf(fp,"\t(%c): class %s\n", 'a' + Row, ClassName[Row]);
}
fprintf(fp,"\n");
}
void C45DT::PrintConfusionMatrixConclusion(ItemNo *ConfusionMat,char *strTestConclusion)
{
short Row, Col;
char buffer[100];
if ( MaxClass > 20 ) return; /* Don't print nonsensical matrices */
/* Print the heading, then each row */
ForEach(Col, 0, MaxClass)
{
sprintf(buffer," (%c)", 'a' + Col);
strcat(strTestConclusion,buffer);
}
strcat(strTestConclusion,"\t<-决策树分类为;");
ForEach(Col, 0, MaxClass)
{
strcat(strTestConclusion," ------");
}
strcat(strTestConclusion,";");
ForEach(Row, 0, MaxClass)
{
strcat(strTestConclusion,"\t");
ForEach(Col, 0, MaxClass)
{
if ( ConfusionMat[Row*(MaxClass+1) + Col] )
{
sprintf(buffer,"%5d", ConfusionMat[Row*(MaxClass+1) + Col]);
strcat(strTestConclusion,buffer);
}
else
{
strcat(strTestConclusion," ");
}
}
sprintf(buffer,"\t(%c): class %s\n", 'a' + Row, ClassName[Row]);
strcat(strTestConclusion,buffer);
}
strcat(strTestConclusion,";");
}
/*************************************************************************/
/* */
/* Determine the class of a case description from a decision tree */
/* -------------------------------------------------------------- */
/* classify.c */
/*************************************************************************/
/*************************************************************************/
/* */
/* Classify a case description using the given subtree by adjusting */
/* the value ClassSum for each class */
/* */
/*************************************************************************/
void C45DT::Classify(Description CaseDesc,Tree T,float Weight)
{
DiscrValue v, dv;
float Cv;
Attribute a;
ClassNo c;
switch ( T->NodeType )
{
case 0: /* leaf */
if ( T->Items > 0 )
{
/* Update from ALL classes */
ForEach(c, 0, MaxClass)
{
if ( T->ClassDist[c] )
{
ClassSum[c] += Weight * T->ClassDist[c] / T->Items;
}
}
}
else
{
ClassSum[T->Leaf] += Weight;
}
return;
case BrDiscr: /* test of discrete attribute */
a = T->Tested;
v = DVal(CaseDesc, a);
if ( v && v <= T->Forks ) /* Make sure not new discrete value */
{
Classify(CaseDesc, T->Branch[v], Weight);
}
else
{
ForEach(v, 1, T->Forks)
{
Classify(CaseDesc, T->Branch[v],
(Weight * T->Branch[v]->Items) / T->Items);
}
}
return;
case ThreshContin: /* test of continuous attribute */
a = T->Tested;
Cv = CVal(CaseDesc, a);
if ( Cv == Unknown )
{
ForEach(v, 1, 2)
{
Classify(CaseDesc, T->Branch[v],
(Weight * T->Branch[v]->Items) / T->Items);
}
}
else
{
v = ( Cv <= T->Cut ? 1 : 2 );
Classify(CaseDesc, T->Branch[v], Weight);
}
return;
case BrSubset: /* subset test on discrete attribute */
a = T->Tested;
dv = DVal(CaseDesc, a);
if ( dv )
{
ForEach(v, 1, T->Forks)
{
if ( In(dv, T->Subset[v]) )
{
Classify(CaseDesc, T->Branch[v], Weight);
return;
}
}
}
/* Value unknown or not found in any of the subsets */
ForEach(v, 1, T->Forks)
{
Classify(CaseDesc, T->Branch[v],
(Weight * T->Branch[v]->Items) / T->Items);
}
return;
}
}
/*************************************************************************/
/* */
/* Categorize a case description using the given decision tree */
/* */
/*************************************************************************/
ClassNo C45DT::Category(Description CaseDesc,Tree DecisionTree)
{
ClassNo c, BestClass;
if ( ! ClassSum )
{
ClassSum = (float *) malloc((MaxClass+1) * sizeof(float));
}
ForEach(c, 0, MaxClass)
{
ClassSum[c] = 0;
}
Classify(CaseDesc, DecisionTree, 1.0);
BestClass = 0;
ForEach(c, 0, MaxClass)
{
Verbosity(5) printf("class %s weight %.2f\n", ClassName[c], ClassSum[c]);
if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c;
}
return BestClass;
}
/*************************************************************************/
/* */
/* Statistical routines for C4.5 */
/* ----------------------------- */
/* stats.c */
/*************************************************************************/
/*************************************************************************/
/* */
/* Compute the additional errors if the error rate increases to the */
/* upper limit of the confidence level. The coefficient is the */
/* square of the number of standard deviations corresponding to the */
/* selected confidence level. (Taken from Documenta Geigy Scientific */
/* Tables (Sixth Edition), p185 (with modifications).) */
/* */
/*************************************************************************/
float C45DT::AddErrs(ItemCount N,ItemCount e)
{
//static float Coeff=0;
float Coeff=0;
float Val0, Pr;
float Val[] = { 0, 0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
Dev[] = {4.0, 3.09, 2.58, 2.33, 1.65, 1.28, 0.84, 0.25, 0.00};
if ( ! Coeff )
{
/* Compute and retain the coefficient value, interpolating from
the values in Val and Dev */
int i;
i = 0;
while ( CF > Val[i] ) i++;
Coeff = Dev[i-1] +
(Dev[i] - Dev[i-1]) * (CF - Val[i-1]) /(Val[i] - Val[i-1]);
Coeff = Coeff * Coeff;
}
if ( e < 1E-6 )
{
return (float)(N * (1 - exp(log(CF) / N)));
}
else
if ( e < 0.9999 )
{
Val0 = (float)(N * (1 - exp(log(CF) / N)));
return Val0 + e * (AddErrs(N, 1.0) - Val0);
}
else
if ( e + 0.5 >= N )
{
return (float)(0.67 * (N - e));
}
else
{
Pr = (float)(e + 0.5 + Coeff/2
+ sqrt(Coeff * ((e + 0.5) * (1 - (e + 0.5)/N) + Coeff/4)) )
/ (N + Coeff);
return (N * Pr - e);
}
}
/*************************************************************************/
/* */
/* Calculate information, information gain, and print dists */
/* -------------------------------------------------------- */
/* info.c */
/*************************************************************************/
/*************************************************************************/
/* */
/* Determine the worth of a particular split according to the */
/* operative criterion */
/* */
/* Parameters: */
/* SplitInfo: potential info of the split */
/* SplitGain: gain in info of the split */
/* MinGain: gain above which the Gain Ratio */
/* may be used */
/* */
/* If the Gain criterion is being used, the information gain of */
/* the split is returned, but if the Gain Ratio criterion is */
/* being used, the ratio of the information gain of the split to */
/* its potential information is returned. */
/* */
/*************************************************************************/
float C45DT::Worth(float ThisInfo,float ThisGain,float MinGain)
{
if ( GAINRATIO )/*=DTTrue*/
{
if ( ThisGain >= MinGain - Epsilon && ThisInfo > Epsilon )
{
return ThisGain / ThisInfo;
}
else
{
return (float)-Epsilon;
}
}
else
{
return ( ThisInfo > 0 && ThisGain > -Epsilon ? ThisGain : (float)-Epsilon );
}
}
/*************************************************************************/
/* */
/* Zero the frequency tables Freq[][] and ValFreq[] up to MaxVal */
/* */
/*************************************************************************/
void C45DT::ResetFreq(DiscrValue MaxVal)
{
DiscrValue v;
ClassNo c;
ForEach(v, 0, MaxVal)
{
ForEach(c, 0, MaxClass)
{
Freq[v][c] = 0;
}
ValFreq[v] = 0;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -