tdidt_split.cpp
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/
#include "measures.hpp"
#include "random.hpp"
#include "classfromvar.hpp"
#include "discretize.hpp"
#include "table.hpp"
#include "boolcnt.hpp"
#include "tdidt.hpp"
#include "tdidt_stop.hpp"
#include "tdidt_split.ppp"
TTreeSplitConstructor::TTreeSplitConstructor(const float &aml)
: minSubset(aml>0 ? aml : 1e-20)
{}
TTreeSplitConstructor_Measure::TTreeSplitConstructor_Measure(PMeasureAttribute meas, const float &aworst, const float &aml)
: TTreeSplitConstructor(aml),
measure(meas),
worstAcceptable(aworst)
{}
TTreeSplitConstructor_Combined::TTreeSplitConstructor_Combined(PTreeSplitConstructor discreteSplit, PTreeSplitConstructor continuousSplit, const float &aminSubset)
: TTreeSplitConstructor(aminSubset),
discreteSplitConstructor(discreteSplit),
continuousSplitConstructor(continuousSplit)
{}
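
// Splits the candidate attributes into a discrete and a continuous mask, lets the
// corresponding split constructor propose a split for each group, and returns the
// proposal with the higher quality (an exact tie is broken by the parity of the
// number of examples).  If neither constructor proposes a split, nothing is returned.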
PClassifier TTreeSplitConstructor_Combined::operator()(
PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
PExampleGenerator gen, const int &weightID ,
PDomainContingency dcont, PDistribution apriorClass,
const vector<bool> &candidates,
PClassifier nodeClassifier
)
{ checkProperty(discreteSplitConstructor);
checkProperty(continuousSplitConstructor);
vector<bool> discrete, continuous;
bool cse = candidates.size()==0;
vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
TVarList::const_iterator vi(gen->domain->attributes->begin()), ve(gen->domain->attributes->end());
for(; (cse || (ci!=ce)) && (vi!=ve); vi++) {
if (cse || *(ci++))
if ((*vi)->varType == TValue::INTVAR) {
discrete.push_back(true);
continuous.push_back(false);
continue;
}
else if ((*vi)->varType == TValue::FLOATVAR) {
discrete.push_back(false);
continuous.push_back(true);
continue;
}
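      // attributes that are not candidates, or are of neither type, get 'false' in both masks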
discrete.push_back(false);
continuous.push_back(false);
}
float discQuality;
PStringList discDescriptions;
PDiscDistribution discSizes;
int discSpent;
PClassifier discSplit = discreteSplitConstructor->call(discDescriptions, discSizes, discQuality, discSpent,
gen, weightID, dcont, apriorClass, discrete, nodeClassifier);
float contQuality;
PStringList contDescriptions;
PDiscDistribution contSizes;
int contSpent;
PClassifier contSplit = continuousSplitConstructor->call(contDescriptions, contSizes, contQuality, contSpent,
gen, weightID, dcont, apriorClass, continuous, nodeClassifier);
int N = gen ? gen->numberOfExamples() : -1;
if (N<0)
N = dcont->classes->cases;
if ( discSplit
     && ( !contSplit
          || (discQuality>contQuality)
          || ((discQuality==contQuality) && (N%2>0)))) {
quality = discQuality;
descriptions = discDescriptions;
subsetSizes = discSizes;
spentAttribute = discSpent;
return discSplit;
}
else if (contSplit) {
quality = contQuality;
descriptions = contDescriptions;
subsetSizes = contSizes;
spentAttribute = contSpent;
return contSplit;
}
else
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
}
TTreeSplitConstructor_Attribute::TTreeSplitConstructor_Attribute(PMeasureAttribute meas, const float &worst, const float &ms)
: TTreeSplitConstructor_Measure(meas, worst, ms)
{}
// if there are fewer than two branches with at least minSubset examples, skip the attribute;
// also skip it if all the examples fall into the same branch
bool checkDistribution(const TDiscDistribution &dist, const float &minSubset)
{
int nonzero = 0;
for(TDiscDistribution::const_iterator dvi(dist.begin()), dve(dist.end()); (dvi!=dve) && (nonzero<2); dvi++)
if ((*dvi > 0) && (*dvi >= minSubset))
nonzero++;
return nonzero >= 2;
}
inline bool noCandidates(const vector<bool> &candidates)
{
vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
while(ci!=ce && !*ci)
ci++;
return ci==ce;
}
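
// Chooses the single best discrete attribute according to 'measure'.  The statistics
// handed to the measure depend on what it declares it needs: a per-attribute class
// contingency, the whole domain contingency, or the raw examples.  Attributes whose
// value distribution cannot yield at least two branches with minSubset examples are
// skipped, and ties between equally scored attributes are broken at random.  The
// returned classifier branches examples on the value of the winning attribute.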
PClassifier TTreeSplitConstructor_Attribute::operator()(
PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
PExampleGenerator gen, const int &weightID,
PDomainContingency dcont, PDistribution apriorClass,
const vector<bool> &candidates,
PClassifier nodeClassifier
)
{ checkProperty(measure);
measure->checkClassTypeExc(gen->domain->classVar->varType);
bool cse = candidates.size()==0;
vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
if (!cse) {
if (noCandidates(candidates))
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
ci = candidates.begin();
}
int N = gen ? gen->numberOfExamples() : -1;
if (N<0)
N = dcont->classes->cases;
TSimpleRandomGenerator rgen(N);
int thisAttr = 0, bestAttr = -1, wins = 0;
quality = 0.0;
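  // The measure can be computed from a single attribute-class contingency; build
  // (or reuse) a domain contingency restricted to the discrete candidate attributes.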
if (measure->needs == TMeasureAttribute::Contingency_Class) {
vector<bool> myCandidates;
if (cse) {
myCandidates.reserve(gen->domain->attributes->size());
PITERATE(TVarList, vi, gen->domain->attributes)
myCandidates.push_back((*vi)->varType == TValue::INTVAR);
}
else {
myCandidates.reserve(candidates.size());
TVarList::const_iterator vi(gen->domain->attributes->begin());
for(; ci != ce; ci++, vi++)
myCandidates.push_back(*ci && ((*vi)->varType == TValue::INTVAR));
}
if (!dcont || dcont->classIsOuter)
dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID, myCandidates));
ci = myCandidates.begin();
ce = myCandidates.end();
TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
for(; (ci != ce) && (dci!=dce); dci++, ci++, thisAttr++)
if (*ci && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
float thisMeas = measure->call(thisAttr, dcont, apriorClass);
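        // a strictly better attribute always wins; on an exact tie the newcomer is
        // accepted with probability 1/wins, so ties are resolved uniformly at random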
if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1))
|| ((thisMeas==quality) && rgen.randbool(++wins))) {
quality = thisMeas;
subsetSizes = (*dci)->outerDistribution;
bestAttr = thisAttr;
}
}
}
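  // The measure needs the contingencies of all attributes at once; compute the full
  // domain contingency if a usable one was not supplied.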
else if (measure->needs == TMeasureAttribute::DomainContingency) {
if (!dcont || dcont->classIsOuter)
dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++)
if ( (cse || *(ci++))
&& ((*dci)->outerVariable->varType==TValue::INTVAR)
&& checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
float thisMeas = measure->call(thisAttr, dcont, apriorClass);
if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1))
|| ((thisMeas==quality) && rgen.randbool(++wins))) {
quality = thisMeas;
subsetSizes = (*dci)->outerDistribution;
bestAttr = thisAttr;
}
}
}
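  // The measure scores attributes directly from the examples; the per-attribute value
  // distributions are computed only to enforce minSubset and to report branch sizes.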
else {
TDomainDistributions ddist(gen, weightID);
TDomainDistributions::iterator ddi(ddist.begin()), dde(ddist.end()-1);
for(; (cse || (ci!=ce)) && (ddi!=dde); ddi++, thisAttr++)
if (cse || *(ci++)) {
TDiscDistribution *discdist = (*ddi).AS(TDiscDistribution);
if (discdist && checkDistribution(*discdist, minSubset)) {
float thisMeas = measure->call(thisAttr, gen, apriorClass, weightID);
if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1))
|| ((thisMeas==quality) && rgen.randbool(++wins))) {
quality = thisMeas;
subsetSizes = PDiscDistribution(*ddi); // not discdist - this would be double wrapping!
bestAttr = thisAttr;
}
}
}
}
if (!wins)
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
if (quality<worstAcceptable)
return returnNothing(descriptions, subsetSizes, spentAttribute);
PVariable attribute = gen->domain->attributes->at(bestAttr);
TEnumVariable *evar = attribute.AS(TEnumVariable);
if (evar)
descriptions = mlnew TStringList(evar->values.getReference());
else
descriptions = mlnew TStringList(subsetSizes->size(), "");
spentAttribute = bestAttr;
return mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
}
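
/*
  A minimal usage sketch (not part of the original file), assuming an example table
  'examples' (PExampleGenerator) with a discrete class; TMeasureAttribute_gainRatio
  comes from measures.hpp.  An empty candidate list means all attributes are
  considered, and the optional contingency/prior/node-classifier arguments are left
  empty so they are computed or ignored as needed.

    TTreeSplitConstructor_Attribute splitter(PMeasureAttribute(mlnew TMeasureAttribute_gainRatio()), 0.0, 2.0);

    PStringList branchDescriptions;
    PDiscDistribution branchSizes;
    float quality;
    int spentAttribute;

    PClassifier branchSelector = splitter(branchDescriptions, branchSizes, quality,
                                          spentAttribute, examples, 0,
                                          PDomainContingency(), PDistribution(),
                                          vector<bool>(), PClassifier());
*/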
TTreeSplitConstructor_ExhaustiveBinary::TTreeSplitConstructor_ExhaustiveBinary(PMeasureAttribute meas, const float &aworst, const float &aml)
: TTreeSplitConstructor_Measure(meas, aworst, aml)
{}
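
// Binary split on a discrete attribute: the values of each candidate attribute are
// grouped into two subsets (the mapping kept in bestMapping) and the grouping that
// scores best under 'measure' becomes the split.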
PClassifier TTreeSplitConstructor_ExhaustiveBinary::operator()(
PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
PExampleGenerator gen, const int &weightID ,
PDomainContingency dcont, PDistribution apriorClass,
const vector<bool> &candidates,
PClassifier
)
{
checkProperty(measure);
measure->checkClassTypeExc(gen->domain->classVar->varType);
PIntList bestMapping;
int wins, bestAttr;