📄 c45dt.cpp
字号:
/*************************************************************************/
/* */
/* Given tables Freq[][] and ValFreq[], compute the information gain. */
/* */
/* Parameters: */
/* BaseInfo: average information for all items with */
/* known values of the test attribute */
/* UnknownRate: fraction of items with unknown ditto */
/* MaxVal: number of forks ,determined by MaxAttVal[Att] */
/* TotalItems: number of items with known values of */
/* test att */
/* */
/* where Freq[x][y] contains the no. of cases with vafor lue x a */
/* particular attribute that are members of class y, */
/* and ValFreq[x] contains the no. of cases with value x for a */
/* particular attribute */
/* */
/*
Gain[Att] = ComputeGain(DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]),
UnknownRate[Att], MaxAttVal[Att], KnownItems);
*/
/*************************************************************************/
float C45DT::ComputeGain(float BaseInfo,float UnknFrac,DiscrValue MaxVal,ItemCount TotalItems)
{
DiscrValue v;
float ThisInfo=0.0, ThisGain;
short ReasonableSubsets=0;
/* Check whether all values are unknown or the same */
if ( ! TotalItems ) return (float)-Epsilon;/*如果已知属性值样本数为0*/
/*subsets are divided by value of the attitude*/
ForEach(v, 1, MaxVal)
{
if ( ValFreq[v] >= MINOBJS ) ReasonableSubsets++;
}
if ( ReasonableSubsets < 2 ) return (float)-Epsilon;/* There must be at least two subsets with MINOBJS items */
/* Compute total info after split, by summing the
info of each of the subsets formed by the test */
ForEach(v, 1, MaxVal)
{
ThisInfo += TotalInfo(Freq[v], 0, MaxClass);
}
/* Set the gain in information for all items, adjusted for unknowns */
ThisGain = (1 - UnknFrac) * (BaseInfo - ThisInfo / TotalItems);
Verbosity(5)
printf("ComputeThisGain: items %.1f info %.3f base %.3f unkn %.3f result %.3f\n",
TotalItems + ValFreq[0], ThisInfo, BaseInfo, UnknFrac, ThisGain);
return ThisGain;
}
/*************************************************************************/
/* */
/* Compute the total information in V[ MinVal..MaxVal ] */
/* */
/*************************************************************************/
/*Info[Att] = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;*/
float C45DT::TotalInfo(ItemCount V[],DiscrValue MinVal,DiscrValue MaxVal)
{
DiscrValue v;
float Sum=0.0;
ItemCount N, TotalItems=0;
ForEach(v, MinVal, MaxVal)
{
N = V[v];
Sum +=(float)( N * Log(N));
TotalItems += N;
}
return (float)(TotalItems*Log(TotalItems) - Sum);
}
/*************************************************************************/
/* */
/* Print distribution table for given attribute */
/* */
/*************************************************************************/
void C45DT::PrintDistribution(Attribute Att,DiscrValue MaxVal,Boolean ShowNames)
{
DiscrValue v;
ClassNo c;
C45_String Val;
printf("\n\t\t\t ");
ForEach(c, 0, MaxClass)
{
printf("%7.6s", ClassName[c]);
}
printf("\n");
ForEach(v, 0, MaxVal)
{
if ( ShowNames )
{
Val = ( !v ? "unknown" :
MaxAttVal[Att] ? AttValName[Att][v] :
v == 1 ? "below" : "above" );
printf("\t\t[%-7.7s:", Val);
}
else
{
printf("\t\t[%-7d:", v);
}
ForEach(c, 0, MaxClass)
{
printf(" %6.1f", Freq[v][c]);
}
printf("]\n");
}
}
/*************************************************************************/
/* */
/* Sorting utilities */
/* ----------------- */
/* sort.c */
/*************************************************************************/
/*************************************************************************/
/* */
/* Sort items from Fp to Lp on attribute a */
/* */
/*************************************************************************/
void C45DT::Quicksort(ItemNo Fp,ItemNo Lp,Attribute Att)
{
// register ItemNo Lower, Middle;
// register float Thresh;
// register ItemNo i;
// register Description Hold;
// register ItemCount HoldW;
register ItemNo Lower, Middle;
register float Thresh;
register ItemNo i;
Description Hold;
ItemCount HoldW;
if ( Fp < Lp )
{
Thresh = CVal(Item[Lp], Att);
/* 隔离Isolate all items with values <= threshold */
/*Item[Middle]之前的值<= threshold*/
Middle = Fp;
for ( i = Fp ; i < Lp ; i++ )
{
if ( CVal(Item[i], Att) <= Thresh )
{
if ( i != Middle )
{
Hold = Item[Middle];
Item[Middle] = Item[i];
Item[i] = Hold;
HoldW = Weight[Middle];
Weight[Middle] = Weight[i];
Weight[i] = HoldW;
}
Middle++;
}
}
/* Extract all values equal to the threshold */
Lower = Middle - 1;
for ( i = Lower ; i >= Fp ; i-- )
{
if ( CVal(Item[i], Att) == Thresh )
{
if ( i != Lower )
{
// (*Exchange)(Lower, i);
Hold = Item[Lower];
Item[Lower] = Item[i];
Item[i] = Hold;
HoldW = Weight[Lower];
Weight[Lower] = Weight[i];
Weight[i] = HoldW;
}
Lower--;
}
}
/* Item[Fp]- Item[Lower]: <= threshold
Item[Lower]- Item[Middle]: = threshold
Item[Middle]- Item[Lp]: > threshold
*/
/* Sort the lower values */
Quicksort(Fp, Lower, Att);
/* Position the middle element */
// (*Exchange)(Middle, Lp);
Hold = Item[Middle];
Item[Middle] = Item[Lp];
Item[Lp] = Hold;
HoldW = Weight[Middle];
Weight[Middle] = Weight[Lp];
Weight[Lp] = HoldW;
/* Sort the higher values */
Quicksort(Middle+1, Lp, Att);
}
}
/*************************************************************************/
/* */
/* Routines for displaying, building, saving and restoring trees */
/* ------------------------------------------------------------- */
/* trees.c */
/*************************************************************************/
/* by wuxing, 2003,4,21 */
void C45DT::PrintHeaderToFile(char *Title)
{
char TitleLine[80];
time_t clock;
short Underline;
time(&clock);
sprintf(TitleLine, "C4.5 [release %s] %s", "8.1", Title);
fprintf(TRf,"\n%s\t%s", TitleLine, ctime(&clock));
Underline = strlen(TitleLine);
while ( Underline-- ) fprintf(TRf,"%c",'-');
fprintf(TRf,"%s","\n");
//-------------------------------------------------------------
fprintf(TRf,"\nRead %d cases (%d attributes) from %s.data\n",
MaxItem+1, MaxAtt+1, FileName);
}
void C45DT::PrintHeaderToBuffer(char *Title)
{
char TitleLine[80];
time_t clock;
short Underline;
char buffer[100];
time(&clock);
sprintf(TitleLine, "标准C4.5算法[版本 %s] %s", "8.1", Title);
sprintf(buffer,"\n%s\t%s", TitleLine, ctime(&clock));
strcat(sShowTree,buffer);
Underline = strlen(TitleLine);
while ( Underline-- )
{
sprintf(buffer,"%c",'-');
strcat(sShowTree,buffer);
}
sprintf(buffer,"%s","\n");
strcat(sShowTree,buffer);
//-------------------------------------------------------------
sprintf(buffer,"\n学习样本 %d 个 (%d 属性)\n",MaxItem+1, MaxAtt+1);
strcat(sShowTree,buffer);
}
/*************************************************************************/
/* */
/* Print a node T with offset Sh, branch value v, and continue */
/* */
/*************************************************************************/
void C45DT::ShowBranch(short Sh,Tree T,DiscrValue v)
{
DiscrValue Pv, Last;
Attribute Att;
Boolean FirstValue;
short TextWidth, Skip, Values=0, i;
Att = T->Tested;
switch ( T->NodeType )//非叶节点
{
case BrDiscr:
Indent(Sh, Tab);
printf("%s = %s:", AttName[Att], AttValName[Att][v]);
break;
case ThreshContin:
Indent(Sh, Tab);
printf("%s %s %g ",AttName[Att], ( v == 1 ? "<=" : ">" ), T->Cut);
if ( T->Lower != T->Upper )
{
printf("[%g,%g]", T->Lower, T->Upper);
}
printf(":");
break;
case BrSubset:
/* Count values at this branch */
ForEach(Pv, 1, MaxAttVal[Att])
{
if ( In(Pv, T->Subset[v]) )
{
Last = Pv;
Values++;
}
}
if ( ! Values ) return;
Indent(Sh, Tab);
if ( Values == 1 )
{
printf("%s = %s:", AttName[Att], AttValName[Att][Last]);
break;
}
printf("%s in {", AttName[Att]);
FirstValue = DTTrue;
Skip = TextWidth = strlen(AttName[Att]) + 5;
ForEach(Pv, 1, MaxAttVal[Att])
{
if ( In(Pv, T->Subset[v]) )
{
if ( ! FirstValue &&
TextWidth + strlen(AttValName[Att][Pv]) + 11 > Width )
{
Indent(Sh, Tab);
ForEach(i, 1, Skip) putchar(' ');
TextWidth = Skip;
FirstValue = DTTrue;
}
printf("%s%c", AttValName[Att][Pv], Pv == Last ? '}' : ',');
TextWidth += strlen(AttValName[Att][Pv]) + 1;
FirstValue = DTFalse;
}
}
putchar(':');
break;
}
//有待进一步分析
Show(T->Branch[v], Sh+1);//显示分支的下一层,递归调用
//Show(T, Sh+1);//显示分支的下一层,递归调用
}
/*************************************************************************/
/* */
/* Print a node T with offset Sh, branch value v, and continue to file */
/* by wuxing, 2003,4,21 */
/* */
/*************************************************************************/
void C45DT::ShowBranchToFile(short Sh,Tree T,DiscrValue v)
{
DiscrValue Pv, Last;
Attribute Att;
Boolean FirstValue;
short TextWidth, Skip, Values=0, i;
Att = T->Tested;
switch ( T->NodeType )//非叶节点
{
case BrDiscr:
IndentToFile(Sh, Tab);
fprintf(TRf,"%s = %s:", AttName[Att], AttValName[Att][v]);
break;
case ThreshContin:
IndentToFile(Sh, Tab);
fprintf(TRf,"%s %s %g ",AttName[Att], ( v == 1 ? "<=" : ">" ), T->Cut);
if ( T->Lower != T->Upper )
{
fprintf(TRf,"[%g,%g]", T->Lower, T->Upper);
}
fprintf(TRf,"%s",":");
break;
case BrSubset:
/* Count values at this branch */
ForEach(Pv, 1, MaxAttVal[Att])
{
if ( In(Pv, T->Subset[v]) )
{
Last = Pv;
Values++;
}
}
if ( ! Values ) return;
IndentToFile(Sh, Tab);
if ( Values == 1 )
{
fprintf(TRf,"%s = %s:", AttName[Att], AttValName[Att][Last]);
break;
}
fprintf(TRf,"%s in {", AttName[Att]);
FirstValue = DTTrue;
Skip = TextWidth = strlen(AttName[Att]) + 5;
ForEach(Pv, 1, MaxAttVal[Att])
{
if ( In(Pv, T->Subset[v]) )
{
if ( ! FirstValue &&
TextWidth + strlen(AttValName[Att][Pv]) + 11 > Width )
{
IndentToFile(Sh, Tab);
ForEach(i, 1, Skip) fprintf(TRf,"%c",' ');
TextWidth = Skip;
FirstValue = DTTrue;
}
fprintf(TRf,"%s%c", AttValName[Att][Pv], Pv == Last ? '}' : ',');
TextWidth += strlen(AttValName[Att][Pv]) + 1;
FirstValue = DTFalse;
}
}
fprintf(TRf,"%c",':');
break;
}
//有待进一步分析
ShowToFile(T->Branch[v], Sh+1);//显示分支的下一层,递归调用
//Show(T, Sh+1);//显示分支的下一层,递归调用
}
void C45DT::ShowBranchToBuffer(short Sh,Tree T,DiscrValue v)
{
DiscrValue Pv, Last;
Attribute Att;
Boolean FirstValue;
short TextWidth, Skip, Values=0, i;
char buffer[100];
Att = T->Tested;
switch ( T->NodeType )//非叶节点
{
case BrDiscr:
IndentToBuffer(Sh, Tab);
sprintf(buffer,"%s = %s:", AttName[Att], AttValName[Att][v]);
strcat(sShowTree,buffer);
break;
case ThreshContin:
IndentToBuffer(Sh, Tab);
sprintf(buffer,"%s %s %g ",AttName[Att], ( v == 1 ? "<=" : ">" ), T->Cut);
strcat(sShowTree,buffer);
if ( T->Lower != T->Upper )
{
sprintf(buffer,"[%g,%g]", T->Lower, T->Upper);
strcat(sShowTree,buffer);
}
sprintf(buffer,"%s",":");
strcat(sShowTree,buffer);
break;
case BrSubset:
/* Count values at this branch */
ForEach(Pv, 1, MaxAttVal[Att])
{
if ( In(Pv, T->Subset[v]) )
{
Last = Pv;
Values++;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -