📄 discretize.cpp
字号:
// Discretizes `var` from an already computed distribution of its values:
// the cut-off points are written into a fresh interval discretizer, which
// then constructs the resulting variable.
PVariable TEquiNDiscretization::operator()(const TContDistribution &distr, PVariable var) const
{
  PIntervalDiscretizer discretizer = mlnew TIntervalDiscretizer;

  // XXX The recursive routine is not finished, so it is deliberately switched
  // off here; remove the "&& false" when cutoffsByDivision is completed.
  const bool useDivision = recursiveDivision && false;
  if (!useDivision)
    cutoffsByCounting(discretizer, distr);
  else
    cutoffsByDivision(discretizer, distr);

  return discretizer->constructVar(var);
}
// Computes cut-off points in a single pass over the distribution (iterated
// in increasing order of value) and appends them to discretizer->points.
//
// The mapped values of `distr` are accumulated into a running amount for the
// current interval; once that amount reaches the per-interval quota, a
// cut-off is placed midway between the previously visited value and the
// current one. The quota is then recomputed from what remains, so later
// intervals adapt to what earlier ones consumed.
//
// Raises an error if numberOfIntervals is not positive. At most
// numberOfIntervals-1 cut-offs are produced; fewer if the distribution runs
// out of distinct values first.
void TEquiNDiscretization::cutoffsByCounting(PIntervalDiscretizer discretizer, const TContDistribution &distr) const
{
  if (numberOfIntervals<=0)
    raiseError("invalid number of intervals (%i)", numberOfIntervals);

  float N = distr.abs;           // amount not yet assigned to a closed interval
  int toGo = numberOfIntervals;  // intervals that still have to be formed
  float inthis = 0;              // amount accumulated in the current interval
  float prevel = -1;             // previously visited value; initialized to avoid warnings
  float inone = N/toGo;          // quota for the current interval

  for(map<float, float>::const_iterator db(distr.begin()), di(db), de(distr.end()); (toGo>1) && (di!=de); ++di) {
    inthis += (*di).second;

    // The very first value can never produce a cut-off: there is nothing
    // before it to average with.
    if (!(inthis<inone) && (di!=db)) {
      discretizer->points->push_back((prevel+(*di).first)/2);
      N -= inthis;
      inthis = 0;
      if (--toGo)
        inone = N/toGo;
    }

    // Always remember the value just visited. Previously this was skipped
    // when a cut-off was emitted, so the next cut-off was averaged with a
    // stale value and could land below values that were already passed.
    prevel = (*di).first;
  }
}
// Convenience overload: unpacks the discretizer's point list together with
// the distribution's range and total, and hands them to the recursive
// overload.
void TEquiNDiscretization::cutoffsByDivision(PIntervalDiscretizer discretizer, const TContDistribution &distr) const
{
  cutoffsByDivision(numberOfIntervals,
                    discretizer->points.getReference(),
                    distr.begin(), distr.end(),
                    distr.abs);
}
// Recursive variant of the cut-off computation. NOT IMPLEMENTED: the
// parameters are unnamed and the whole body is commented out, so calling
// this function currently does nothing.
//
// The disabled draft below refers to its arguments as noInt (number of
// intervals), points (output list), fbeg/fend (range of the distribution)
// and N (total in that range). It sketches splitting the range in two,
// storing the midpoint between the neighbouring values as a cut-off and
// recursing into both halves. The draft is incomplete (for instance, the
// scanning loops never advance the iterator), so it must be finished and
// reviewed before being enabled.
void TEquiNDiscretization::cutoffsByDivision(const int &, TFloatList &,
map<float, float>::const_iterator, map<float, float>::const_iterator,
const float &) const
{ /*XXX to be finished
if (noInt & 1) {
if (noInt & 2) {
noIntLeft = (noInt-1)/2;
noIntRight = (noInt+1)/2;
}
else {
noIntLeft = (noInt+1)/2;
noIntRight = (noInt+1)/2;
}
float Nleft = N * noIntLeft / (noIntLeft + noIntRight);
float Nright = N - Nleft;
if ((Nleft<1) || (Nright<1))
return; // should set a cut-off, but couldn't -- N=1...
map<float, float>::const_iterator fii = fbeg;
while ((Nn<Nleft) && (fii!=fend))
Nn += (*fii).second;
Nn -= (*fii).second;
if (fii==fend) {
}
}
else {
float N2 = N/2, Nn = 0.0;
if (N2<1)
return; // should set a cut-off, but couldn't -- N=1...
map<float, float>::const_iterator fii = fbeg;
while ((Nn<N2) && (fii!=fend))
Nn += (*fii).second;
Nn -= (*fii).second;
if (fii==fend) {
fii--;
if (fii==fbeg)
return; // should set a cut-off, but there's only one value
else {
map<float, float>::const_iterator fjj = fii;
fjj--;
points.push_back(((*fjj).first + (*fii).first) / 2.0);
return;
}
}
if (noInt>2) {
cutoffsByDivision(noInt/2, points, fbeg, fii, Nn);
map<float, float>::const_iterator fjj = fii;
fjj--;
points.push_back(((*fjj).first + (*fii).first) / 2.0);
cutoffsByDivision(noInt/2, points, fii, fend, N-Nn);
}
}*/
}
// Discretizes continuous attribute `var` using the examples from `gen`.
//
// Collects the known (non-special) values of the attribute into a continuous
// distribution, each added with the weight given by WEIGHT(example)
// (presumably derived from weightID -- the macro is defined elsewhere), and
// delegates to the distribution-based overload.
//
// Raises an error if `var` is not continuous or if no example has a known
// value for it.
PVariable TEquiNDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
{
  if (var->varType!=TValue::FLOATVAR)
    raiseError("attribute '%s' is not continuous", var->name.c_str());

  int varPos=gen->domain->getVarNum(var);

  // Skip leading examples with unknown values, so that an attribute without
  // any known value is reported instead of yielding an empty distribution.
  TExampleIterator first(gen->begin());
  while(first && (*first)[varPos].isSpecial())
    ++first;

  if (!first)
    // Word order fixed: the message used to read "attribute has '%s' no known values."
    raiseError("attribute '%s' has no known values.", var->name.c_str());

  TContDistribution distr(var);
  do {
    TValue &val=(*first)[varPos];
    if (!val.isSpecial())
      distr.addfloat(float(val), WEIGHT(*first));
  } while (++first);

  return operator()(distr, var);
}
// Defined in measures.cpp.
// Forward declaration only; the entropy discretization below calls it on
// class distributions.
float getEntropy(const vector<float> &);
// Defaults: maxNumberOfIntervals is 0 (the limit is only applied when it is
// positive) and forceAttribute is off.
TEntropyDiscretization::TEntropyDiscretization()
  : maxNumberOfIntervals(0), forceAttribute(false)
{
}
// Entropy-based discretization of continuous attribute `var` on the examples
// from `gen`.
//
// Builds, for every known attribute value, the distribution of class values
// observed with it (S), together with the overall class distribution (all),
// skipping examples where either the attribute or the class is unknown.
// The actual search for cut-offs is done by the overload taking S and all.
//
// Raises an error if the domain has no class, if the class is not discrete,
// or if `var` is not continuous.
PVariable TEntropyDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
{
  if (!gen->domain->classVar)
    raiseError("class-less domain");

  // Check the class variable's type. The original compared the variable
  // wrapper itself with TValue::INTVAR, which does not test the type.
  if (gen->domain->classVar->varType!=TValue::INTVAR)
    raiseError("class '%s' is not discrete", gen->domain->classVar->name.c_str());

  if (var->varType!=TValue::FLOATVAR)
    raiseError("attribute '%s' is not continuous", var->name.c_str());

  int varPos=gen->domain->getVarNum(var);

  TS S;
  TDiscDistribution all;

  PEITERATE(ei, gen) {
    TValue &val = (*ei)[varPos];
    if (!val.isSpecial()) {
      const TValue &eclass = (*ei).getClass();
      if (!eclass.isSpecial()) {
        float weight = WEIGHT(*ei);
        S[float(val)].addint(int(eclass), weight);
        all.addint(int(eclass), weight);
      }
    }
  }

  /* No need to initialize seed by number of examples.
     Different number will obviously result in different decisions. */
  TSimpleRandomGenerator rgen;
  return operator()(S, all, var, weightID, rgen);
}
// Finds cut-offs for `var` from precomputed per-value class distributions
// (S) and the overall class distribution (all), and wraps them into an
// interval discretizer.
//
// Raises an error when no class value has a positive count. If
// maxNumberOfIntervals is positive and too many cut-offs were found, the
// list is trimmed to maxNumberOfIntervals-1 entries.
PVariable TEntropyDiscretization::operator()(const TS &S, const TDiscDistribution &all, PVariable var, const long &, TSimpleRandomGenerator &rgen) const
{
  // Number of class values that actually occur.
  int presentClasses = 0;
  const_ITERATE(TDiscDistribution, classi, all)
    if (*classi>0)
      presentClasses++;

  if (presentClasses == 0)
    raiseError("no examples or all values of attribute '%s' are unknown", var->name.c_str());

  // Pairs of (cut-off, score) as produced by divide().
  vector<pair<float, float> > points;
  divide(S.begin(), S.end(), all, float(getEntropy(all)), presentClasses, points, rgen);

  /* This is not correct: if, for instance, we have two cut-off points we should always remove
     the one that was added later... */
  const bool limited = maxNumberOfIntervals>0;
  if (limited && (int(points.size())+1>maxNumberOfIntervals)) {
    // Order by score, trim, then restore the order by cut-off value.
    random_sort(points.begin(), points.end(),
                predOn2nd<pair<float, float>, less<float> >(),
                predOn2nd<pair<float, float>, equal_to<float> >(),
                rgen);
    points.erase(points.begin()+maxNumberOfIntervals-1, points.end());
    sort(points.begin(), points.end(), predOn1st<pair<float, float>, less<float> >());
  }

  PIntervalDiscretizer discretizer = mlnew TIntervalDiscretizer();
  TFloatList &dpoints = dynamic_cast<TFloatList &>(discretizer->points.getReference());

  // Copy the cut-offs, dropping any that repeats the one stored just before it.
  bool isFirst = true;
  for(vector<pair<float, float> >::const_iterator pi(points.begin()), pe(points.end()); pi!=pe; ++pi) {
    if (isFirst || ((*pi).first != dpoints.back()))
      discretizer->points->push_back((*pi).first);
    isFirst = false;
  }

  return discretizer->constructVar(var);
}
// Recursively searches the range [first, last) of per-value class
// distributions for cut-offs and appends them to `points` as pairs
// (cut-off value, gain - MDL).
//
//   distr    class distribution of the whole range
//   entropy  entropy of `distr`, as computed by the caller
//   k        number of class values with a positive count in `distr`
//   points   output list; for an accepted split the left part's cut-offs are
//            appended first, then this cut-off, then the right part's
//   rgen     random generator used to choose among equally good splits
//
// NOTE(review): the acceptance test looks like the Fayyad-Irani MDL
// criterion -- confirm against the reference before relying on this.
void TEntropyDiscretization::divide(
const TS::const_iterator &first, const TS::const_iterator &last,
const TDiscDistribution &distr, float entropy, int k,
vector<pair<float, float> > &points,
TSimpleRandomGenerator &rgen) const
{
// S1dist accumulates the values up to and including Ti; S2dist is the rest.
TDiscDistribution S1dist, S2dist = distr, bestS1, bestS2;
float bestE = -1.0;
float N = distr.abs;
// wins is 0 while no candidate has been seen; afterwards it counts the
// candidates that tied for the best E so far.
int wins = 0;
TS::const_iterator Ti = first, bestT;
for(; Ti!=last; Ti++) {
S1dist += (*Ti).second;
S2dist -= (*Ti).second;
// Nothing is left on the right side, so this is not a split.
if (S2dist.abs==0)
break;
// E is the sum of the two parts' entropies, each weighted by its share of N.
float entro1 = S1dist.abs*float(getEntropy(S1dist))/N;
float entro2 = S2dist.abs*float(getEntropy(S2dist))/N;
float E = entro1+entro2;
// A first or strictly better candidate is taken and resets wins to 1 (the
// assignment inside the condition is intentional). An equally good one is
// taken only if rgen.randbool(++wins) says so -- presumably with
// probability 1/wins, which would make the choice among ties uniform;
// verify against TSimpleRandomGenerator.
if ( (!wins || (E<bestE)) && ((wins=1)==1)
|| (E==bestE) && rgen.randbool(++wins)) {
bestS1 = S1dist;
bestS2 = S2dist;
bestE = E;
bestT = Ti;
}
}
// No candidate split exists in this range.
if (!wins)
return;
// k1 and k2: number of class values present in each part of the best split.
int k1 = 0, k2 = 0;
ITERATE(TDiscDistribution, ci1, bestS1)
if (*ci1>0)
k1++;
ITERATE(TDiscDistribution, ci2, bestS2)
if (*ci2>0)
k2++;
float entropy1 = float(getEntropy(bestS1));
float entropy2 = float(getEntropy(bestS2));
// Threshold: log2(N-1)/N + (log2(3^k - 2) - (k*entropy - k1*entropy1 - k2*entropy2))/N
float MDL = log(float(N-1))/log(2.0)/N
+ (log(exp(k*log(3.0))-2)/log(2.0) - (k*entropy - k1*entropy1 - k2*entropy2))/N;
float gain = entropy-bestE;
float cutoff = (*bestT).first;
// Advance bestT so that [first, bestT) is the left part and [bestT, last) the right.
bestT++;
// cout << cutoff << ", info gain=" << gain << ", MDL=" << MDL << endl;
if (gain>MDL) {
// A part is split further only if it contains more than one class value.
if ((k1>1) && (first!=bestT))
divide(first, bestT, bestS1, entropy1, k1, points, rgen);
points.push_back(pair<float, float>(cutoff, gain-MDL));
if ((k2>1) && (bestT!=last))
divide(bestT, last, bestS2, entropy2, k2, points, rgen);
}
// With forceAttribute set, a rejected split is still recorded as long as
// `points` is empty at this moment.
else if (forceAttribute && !points.size())
points.push_back(pair<float, float>(cutoff, gain-MDL));
}
// Returns the square of `value`, computed with T's own operator*.
template<typename T>
inline T sqr(const T &value)
{
  return value * value;
}
// `sit` is stored as splitInTwo, which selects the kind of discretizer
// built by operator().
TBiModalDiscretization::TBiModalDiscretization(const bool sit)
  : splitInTwo(sit)
{
}
// Bi-modal discretization of continuous attribute `var`.
//
// Tries every pair of attribute values (cut1 < cut2) and scores the class
// distribution of the examples lying between them (cut1 exclusive, cut2
// inclusive) against the overall, normalized class distribution using a
// chi-square statistic with a 0.5 continuity correction. The best-scoring
// pair becomes the two cut-offs.
//
// If splitInTwo is set, the result is built by a TBiModalDiscretizer;
// otherwise by an interval discretizer with the two cut-offs as its points.
//
// Raises an error if `var` is not continuous, if the domain has no class or
// the class is not discrete, or if no pair of cut-offs could be evaluated.
PVariable TBiModalDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
{
  if (var->varType!=TValue::FLOATVAR)
    raiseError("attribute '%s' is not continuous", var->name.c_str());

  // The class variable is dereferenced below, so a class-less domain has to
  // be rejected explicitly (same message as in the entropy discretization).
  if (!gen->domain->classVar)
    raiseError("class-less domain");

  // Check the class variable's type. The original compared the variable
  // wrapper itself with TValue::INTVAR, which does not test the type.
  if (gen->domain->classVar->varType!=TValue::INTVAR)
    raiseError("class '%s' is not discrete", gen->domain->classVar->name.c_str());

  TContingencyAttrClass ccont(gen, var, weightID);
  int nClasses = gen->domain->classVar->noOfValues();

  float best1 = 0.0, best2 = 0.0;
  float bestEval = -99999;
  bool found = false;   // becomes true once best1/best2 hold a real pair

  PDistribution classDist = getClassDistribution(gen, weightID);
  TDiscDistribution &totDist = dynamic_cast<TDiscDistribution &>(classDist.getReference());
  totDist.normalize();

  // middle will contain sum of distributions from cut1 (exclusive) to cut2 (inclusive)
  for(TDistributionMap::iterator cut1(ccont.continuous->begin()), cute(ccont.continuous->end()); cut1!=cute; cut1++) {
    TDiscDistribution middle(nClasses);

    TDistributionMap::iterator cut2 = cut1;
    for(cut2++; cut2!=cute; cut2++) {
      middle += (*cut2).second;

      float chisq = 0.0;
      float tabs = middle.abs;
      int N = nClasses;
      for(TDiscDistribution::const_iterator toti = totDist.begin(), midi = middle.begin(); N--; toti++, midi++) {
        const float E = tabs**toti;   // expected amount of this class in the middle part
        // A class with zero expected amount would cause a division by zero
        // and make every candidate score infinite, so its term is left out.
        if (E <= 0.0)
          continue;
        const float &n = *midi;       // observed amount
        chisq += sqr( fabs(E - n) - 0.5 ) / E;
      }

      if (chisq > bestEval) {
        bestEval = chisq;
        best1 = (*cut1).first;
        best2 = (*cut2).first;
        found = true;
      }
    }
  }

  // With fewer than two distinct values the loops never evaluate a pair;
  // best1/best2 used to be read uninitialized in that case.
  if (!found)
    raiseError("cannot discretize attribute '%s': no suitable pair of cut-off points found", var->name.c_str());

  PDiscretizer discretizer;
  if (splitInTwo)
    discretizer = mlnew TBiModalDiscretizer(best1, best2);
  else {
    TIntervalDiscretizer *idisc = mlnew TIntervalDiscretizer;
    discretizer = idisc;
    idisc->points->push_back(best1);
    idisc->points->push_back(best2);
  }

  return discretizer->constructVar(var);
}
// Stores the discretization that will be applied to the variables of a domain.
TDomainDiscretization::TDomainDiscretization(PDiscretization adisc)
  : discretization(adisc)
{
}
// Builds a new domain for an equal-distance discretization.
//
// Walks the per-variable basic statistics of `gen` in parallel with the
// domain's variables: a variable with statistics is replaced by its
// discretized version, any other is copied unchanged. If the original
// domain has a class, the last variable of the new domain becomes its class
// and is removed from the attribute list.
PDomain TDomainDiscretization::equiDistDomain(PExampleGenerator gen)
{
  PDomain newDomain = mlnew TDomain();
  newDomain->metas = gen->domain->metas;

  TDomainBasicAttrStat valStats(gen);
  const TEquiDistDiscretization &discs = dynamic_cast<TEquiDistDiscretization &>(discretization.getReference());

  TVarList::iterator vi=gen->domain->variables->begin();
  ITERATE(TDomainBasicAttrStat, si, valStats) {
    PVariable outVar;
    if (*si)
      outVar = discs(*si, *vi);
    else
      outVar = *vi;

    newDomain->variables->push_back(outVar);
    newDomain->attributes->push_back(outVar);
    vi++;
  }

  if (!gen->domain->classVar)
    return newDomain;

  newDomain->classVar = newDomain->variables->back();
  newDomain->attributes->erase(newDomain->attributes->end()-1);
  return newDomain;
}
// Builds a new domain for an equal-frequency discretization.
//
// Walks the per-variable value distributions of `gen` in parallel with the
// domain's variables: a variable whose distribution belongs to a continuous
// variable is replaced by its discretized version, any other is copied
// unchanged. If the original domain has a class, the last variable of the
// new domain becomes its class and is removed from the attribute list.
PDomain TDomainDiscretization::equiNDomain(PExampleGenerator gen, const long &weightID)
{
  PDomain newDomain = mlnew TDomain();
  newDomain->metas = gen->domain->metas;

  TDomainDistributions valDs(gen, weightID);
  const TEquiNDiscretization &discs = dynamic_cast<TEquiNDiscretization &>(discretization.getReference());

  TVarList::iterator vi=gen->domain->variables->begin();
  ITERATE(TDomainDistributions, si, valDs) {
    PVariable outVar;
    if ((*si)->variable->varType==TValue::FLOATVAR)
      outVar = discs(CAST_TO_CONTDISTRIBUTION(*si), *vi);
    else
      outVar = *vi;

    newDomain->variables->push_back(outVar);
    newDomain->attributes->push_back(outVar);
    vi++;
  }

  if (!gen->domain->classVar)
    return newDomain;

  newDomain->classVar = newDomain->variables->back();
  newDomain->attributes->erase(newDomain->attributes->end()-1);
  return newDomain;
}
// Builds a new domain using a general discretization, i.e. one that works
// directly on the examples.
//
// Every continuous variable of the original domain is replaced by the
// result of calling the stored discretization on (gen, variable, weightID);
// other variables are copied unchanged. If the original domain has a class,
// the last variable of the new domain becomes its class and is removed from
// the attribute list.
PDomain TDomainDiscretization::otherDomain(PExampleGenerator gen, const long &weightID)
{
  PDomain newDomain = mlnew TDomain();
  newDomain->metas = gen->domain->metas;

  PITERATE(TVarList, vi, gen->domain->variables) {
    PVariable outVar;
    if ((*vi)->varType==TValue::FLOATVAR)
      outVar = discretization->operator()(gen, *vi, weightID);
    else
      outVar = *vi;

    newDomain->variables->push_back(outVar);
    newDomain->attributes->push_back(outVar);
  }

  if (!gen->domain->classVar)
    return newDomain;

  newDomain->classVar = newDomain->variables->back();
  newDomain->attributes->erase(newDomain->attributes->end()-1);
  return newDomain;
}
// Discretizes the domain of `gen` with the stored discretization, picking
// the specialized routine for equal-distance and equal-frequency
// discretizations and the general one for anything else.
PDomain TDomainDiscretization::operator()(PExampleGenerator gen, const long &weightID)
{
  checkProperty(discretization);

  if (discretization.is_derived_from(TEquiDistDiscretization))
    return equiDistDomain(gen);

  if (!discretization.is_derived_from(TEquiNDiscretization))
    return otherDomain(gen, weightID);

  return equiNDomain(gen, weightID);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -