📄 contin.cpp
字号:
/*************************************************************************/
/* */
/* Evaluation of a test on a continuous valued attribute */
/* ----------------------------------------------------- */
/* */
/*************************************************************************/
#include "stdafx.h"
#include "MyBase.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
extern FILE *fLog;
extern ItemCount
*Weight, /* Weight[i] = current fraction of item i */
**Freq, /* Freq[x][c] = no. items of class c with outcome x */
*ValFreq; /* ValFreq[x] = no. items with att value v */
extern float
*Gain, /* Gain[a] = info gain by split on att a */
*Info, /* Info[a] = potential info from split on att a */
*Bar, /* Bar[a] = best threshold for contin att a */
*UnknownRate; /* UnknownRate[a] = current unknown rate for att a */
extern char
*Tested; /* Tested[a] = true if att a already tested */
float
*SplitGain, /* SplitGain[i] = gain with att value of item i as threshold */
*SplitInfo; /* SplitInfo[i] = potential info ditto */
/*************************************************************************/
/* */
/* Continuous attributes are treated as if they have possible values */
/* 0 (unknown), 1 (less than cut), 2(greater than cut) */
/* This routine finds the best cut for items Fp through Lp and sets */
/* Info[], Gain[] and Bar[] */
/* */
/*************************************************************************/
void EvalContinuousAtt(Attribute Att, ItemNo Fp, ItemNo Lp)
{
ItemNo i, BestI, Xp, Tries=0;
ItemCount Items, KnownItems, LowItems, MinSplit;
ClassNo c;
float AvGain=0, Val, BestVal, BaseInfo, ThreshCost;
//Verbosity(2) fprintf(fLog,"\tAtt %s", AttName[Att]);
//Verbosity(3) fprintf(fLog,"\n");
ResetFreq(2);
/* Omit and count unknown values */
Items = CountItems(Fp, Lp);
Xp = Fp;
ForEach(i, Fp, Lp)
{
if ( CVal(Item[i],Att) == Unknown )
{
Freq[ 0 ][ Class(Item[i]) ] += Weight[i];
Swap(Xp, i);
Xp++;
}
}
ValFreq[0] = 0;
ForEach(c, 0, MaxClass)
{
ValFreq[0] += Freq[0][c];
}
KnownItems = Items - ValFreq[0];
UnknownRate[Att] = 1.0 - KnownItems / Items;
/* Special case when very few known values */
if ( KnownItems < 2 * MINOBJS )
{
//Verbosity(2) fprintf(fLog,"\tinsufficient cases with known values\n");
Gain[Att] = -Epsilon;
Info[Att] = 0.0;
return;
}
Quicksort(Xp, Lp, Att, 0);
/* Count base values and determine base information */
ForEach(i, Xp, Lp)
{
Freq[ 2 ][ Class(Item[i]) ] += Weight[i];
SplitGain[i] = -Epsilon;
SplitInfo[i] = 0;
}
BaseInfo = TotalInfo(Freq[2], 0, MaxClass) / KnownItems;
/* Try possible cuts between items i and i+1, and determine the
information and gain of the split in each case. We have to be wary
of splitting a small number of items off one end, as we can always
split off a single item, but this has little predictive power. */
MinSplit = 0.10 * KnownItems / (MaxClass + 1);
if ( MinSplit <= MINOBJS )
MinSplit = MINOBJS;
else if ( MinSplit > 25 )
MinSplit = 25;
LowItems = 0;
ForEach(i, Xp, Lp - 1)
{
c = Class(Item[i]);
LowItems += Weight[i];
Freq[1][c] += Weight[i];
Freq[2][c] -= Weight[i];
if ( LowItems < MinSplit ) continue;
else
if ( LowItems > KnownItems - MinSplit ) break;
if ( CVal(Item[i],Att) < CVal(Item[i+1],Att) - 1E-5 )
{
ValFreq[1] = LowItems;
ValFreq[2] = KnownItems - LowItems;
SplitGain[i] = ComputeGain(BaseInfo, UnknownRate[Att], 2, KnownItems);
SplitInfo[i] = TotalInfo(ValFreq, 0, 2) / Items;
AvGain += SplitGain[i];
Tries++;
Verbosity(3)
{/**
fprintf(fLog,"\t\tCut at %.3f (gain %.3f, val %.3f):",
( CVal(Item[i],Att) + CVal(Item[i+1],Att) ) / 2,
SplitGain[i],
Worth(SplitInfo[i], SplitGain[i], Epsilon));
PrintDistribution(Att, 2, true);/**/
}
}
}
/* Find the best attribute according to the given criterion */
ThreshCost = Log(Tries) / Items;
BestVal = 0;
BestI = None;
ForEach(i, Xp, Lp - 1)
{
if ( (Val = SplitGain[i] - ThreshCost) > BestVal )
{
BestI = i;
BestVal = Val;
}
}
/* If a test on the attribute is able to make a gain,
set the best break point, gain and information */
if ( BestI == None )
{
Gain[Att] = -Epsilon;
Info[Att] = 0.0;
Verbosity(2) fprintf(fLog,"\tno gain\n");
}
else
{
Bar[Att] = (CVal(Item[BestI],Att) + CVal(Item[BestI+1],Att)) / 2;
Gain[Att] = BestVal;
Info[Att] = SplitInfo[BestI];
/**
Verbosity(2)
fprintf(fLog,"\tcut=%.3f, inf %.3f, gain %.3f\n",
Bar[Att], Info[Att], Gain[Att]);/**/
}
}
/*************************************************************************/
/* */
/* Change a leaf into a test on a continuous attribute */
/* */
/*************************************************************************/
void ContinTest(Tree Node, Attribute Att)
{
float Thresh;
Sprout(Node, 2);
Thresh = GreatestValueBelow(Att, Bar[Att]);
Node->NodeType = ThreshContin;
Node->Tested = Att;
Node->Cut =
Node->Lower =
Node->Upper = Thresh;
Node->Errors = 0;
}
/*************************************************************************/
/* */
/* Return the greatest value of attribute Att below threshold t */
/* */
/*************************************************************************/
float GreatestValueBelow(Attribute Att, float t)
{
ItemNo i;
float v, Best;
Boolean NotYet=true;
ForEach(i, 0, MaxItem)
{
v = CVal(Item[i], Att);
if ( v != Unknown && v <= t && ( NotYet || v > Best ) )
{
Best = v;
NotYet = false;
}
}
return Best;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -