tdidt_split.cpp
PVariable bvar;
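// Two code paths follow: measures that need raw examples (needs == Generator)
// delegate the search for the best binary split to measure->bestBinarization,
// attribute by attribute; all other measures are scored on a domain contingency
// (computed here if none was supplied), with every binary partition of each
// discrete attribute's values tried exhaustively.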
if (measure->needs==TMeasureAttribute::Generator) {
bool cse = candidates.size()==0;
bool haveCandidates = false;
vector<bool> myCandidates;
myCandidates.reserve(gen->domain->attributes->size());
vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
TVarList::const_iterator vi, ve(gen->domain->attributes->end());
for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
// an attribute is a candidate only if it is discrete and either no candidate
// list was supplied (cse) or its flag in the list is set
bool co = ((*vi)->varType == TValue::INTVAR) && (cse || ((ci != ce) && *ci));
if (!cse && (ci != ce))
ci++;
myCandidates.push_back(co);
haveCandidates = haveCandidates || co;
}
if (!haveCandidates)
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
PDistribution thisSubsets;
float thisQuality;
wins = 0;
int thisAttr = 0;
int N = gen->numberOfExamples();
TSimpleRandomGenerator rgen(N);
ci = myCandidates.begin();
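// Ask the measure for the best binarization of every discrete candidate and keep
// the best-scoring attribute. On quality ties, rgen.randbool(++wins) accepts the
// newcomer with probability 1/wins (assuming randbool(n) succeeds with
// probability 1/n), so each tied attribute is equally likely to be chosen.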
for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
if (*ci) {
thisSubsets = NULL;
PIntList thisMapping =
/*throughCont ? measure->bestBinarization(thisSubsets, thisQuality, *dci, dcont->classes, apriorClass, minSubset)
: */measure->bestBinarization(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);
if (thisMapping
&& ( (!wins || (thisQuality>quality)) && ((wins=1)==1)
|| (thisQuality==quality) && rgen.randbool(++wins))) {
bestAttr = thisAttr;
quality = thisQuality;
subsetSizes = thisSubsets;
bestMapping = thisMapping;
}
}
/*if (throughCont)
dci++; */
}
if (!wins)
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
if (quality<worstAcceptable)
return returnNothing(descriptions, subsetSizes, spentAttribute);
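// The split variable: reuse the binarized variable supplied by the measure (via
// subsetSizes) if there is one, otherwise create a generic two-valued variable
// for the left/right branches.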
if (subsetSizes && subsetSizes->variable)
bvar = subsetSizes->variable;
else {
TEnumVariable *evar = mlnew TEnumVariable("");
evar->addValue("0");
evar->addValue("1");
bvar = evar;
}
}
else {
bool cse = candidates.size()==0;
if (!cse && noCandidates(candidates))
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
if (!dcont || dcont->classIsOuter) {
dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
raiseWarningWho("TreeSplitConstructor_ExhaustiveBinary", "this class is not optimized for 'candidates' list and can be very slow");
}
int N = gen ? gen->numberOfExamples() : -1;
if (N<0)
N = dcont->classes->cases;
TSimpleRandomGenerator rgen(N);
PDistribution classDistribution = dcont->classes;
vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
TDiscDistribution *dis0, *dis1;
TContDistribution *con0, *con1;
int thisAttr = 0;
bestAttr = -1;
wins = 0;
quality = 0.0;
float leftExamples, rightExamples;
TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++) {
// We consider the attribute only if it is a candidate, discrete and has at least two values
if ((cse || *(ci++)) && ((*dci)->outerVariable->varType==TValue::INTVAR) && ((*dci)->discrete->size()>=2)) {
const TDistributionVector &distr = *(*dci)->discrete;
if (distr.size()>10)
raiseError("'%s' has more than 10 values, cannot exhaustively binarize", gen->domain->attributes->at(thisAttr)->name.c_str());
// If the attribute is binary, we check subsetSizes and assess the quality if they are OK
if (distr.size()==2) {
if ((distr.front()->abs<minSubset) || (distr.back()->abs<minSubset))
continue; // next attribute
else {
float thisMeas = measure->call(thisAttr, dcont, apriorClass);
if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1))
|| ((thisMeas==quality) && rgen.randbool(++wins))) {
bestAttr = thisAttr;
quality = thisMeas;
leftExamples = distr.front()->abs;
rightExamples = distr.back()->abs;
bestMapping = mlnew TIntList(2, 0);
bestMapping->at(1) = 1;
}
continue;
}
}
vector<int> valueIndices;
int ind = 0;
for(TDistributionVector::const_iterator dvi(distr.begin()), dve(distr.end()); (dvi!=dve); dvi++, ind++)
if ((*dvi)->abs>0)
valueIndices.push_back(ind);
if (valueIndices.size()<2)
continue;
PContingency cont = prepareBinaryCheat(classDistribution, *dci, bvar, dis0, dis1, con0, con1);
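// prepareBinaryCheat (defined elsewhere in this file) appears to build a
// two-valued contingency 'cont' for bvar and hand back pointers to its two
// inner distributions (dis0/dis1 for discrete classes, con0/con1 for continuous
// ones), so the loops below can refill those distributions in place and
// re-score 'cont' without rebuilding a contingency for every candidate split.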
// A real job: go through all splits
int binWins = 0;
float binQuality = -1.0;
float binLeftExamples = -1.0, binRightExamples = -1.0;
// Selection: each element corresponds to a value of the original attribute and is 1 if the value goes right.
// The first value always goes left (and has no corresponding bit in selection).
TBoolCount selection(valueIndices.size()-1), bestSelection(0);
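// Example: for an attribute whose non-empty values are {a, b, c, d}, selection
// holds three bits, one each for b, c and d, while a always stays left.
// selection.next() then walks through the bit patterns (presumably starting
// from all-zeros), giving up to 2^3 - 1 = 7 proper splits; e.g. the pattern
// 010 sends {a, b, d} left and {c} right. Patterns that leave a branch with
// fewer than minSubset examples are skipped below.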
// First for discrete classes
if (dis0) {
do {
*dis0 = CAST_TO_DISCDISTRIBUTION(distr[valueIndices[0]]);
*dis1 *= 0;
vector<int>::const_iterator ii(valueIndices.begin() + 1); // skip the first value: it always goes left and is already counted in dis0
for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
*(*bi ? dis1 : dis0) += distr[*ii];
if ((dis0->abs<minSubset) || (dis1->abs<minSubset))
continue; // cannot split like that, too few examples in one of the branches
float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
if ( ((!binWins) || (thisMeas>binQuality)) && ((binWins=1) ==1)
|| (thisMeas==binQuality) && rgen.randbool(++binWins)) {
bestSelection = selection;
binQuality = thisMeas;
binLeftExamples = dis0->abs;
binRightExamples = dis1->abs;
}
} while (selection.next());
}
// And then exactly the same for continuous classes
else {
do {
*con0 = CAST_TO_CONTDISTRIBUTION(distr[valueIndices[0]]);
*con1 = TContDistribution();
vector<int>::const_iterator ii(valueIndices.begin() + 1); // skip the first value: it always goes left and is already counted in con0
for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
*(*bi ? con1 : con0) += distr[*ii];
if ((con0->abs<minSubset) || (con1->abs<minSubset))
continue; // cannot split like that, too few examples in one of the branches
float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
if ( ((!binWins) || (thisMeas>binQuality)) && ((binWins=1) ==1)
|| (thisMeas==binQuality) && rgen.randbool(++binWins)) {
bestSelection = selection;
binQuality = thisMeas;
binLeftExamples = con0->abs;
binRightExamples = con1->abs;
}
} while (selection.next());
}
if ( binWins
&& ( (!wins || (binQuality>quality)) && ((wins=1)==1)
|| (binQuality==quality) && rgen.randbool(++wins))) {
bestAttr = thisAttr;
quality = binQuality;
leftExamples = binLeftExamples;
rightExamples = binRightExamples;
bestMapping = mlnew TIntList(distr.size(), -1);
vector<int>::const_iterator ii = valueIndices.begin();
bestMapping->at(*(ii++)) = 0;
ITERATE(TBoolCount, bi, bestSelection)
bestMapping->at(*(ii++)) = *bi ? 1 : 0;
}
}
}
if (!wins)
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
subsetSizes = mlnew TDiscDistribution();
subsetSizes->addint(0, leftExamples);
subsetSizes->addint(1, rightExamples);
}
PVariable attribute = gen->domain->attributes->at(bestAttr);
if (attribute->noOfValues() == 2) {
spentAttribute = bestAttr;
descriptions = mlnew TStringList(attribute.AS(TEnumVariable)->values.getReference());
return mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
}
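// For attributes with more than two values, build human-readable branch labels
// by walking the attribute's values in order: values mapped to 0 go into the
// left description, values mapped to 1 into the right one, and values mapped
// to -1 (absent from the data) are skipped.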
string s0, s1;
int ns0 = 0, ns1 = 0;
TValue ev;
attribute->firstValue(ev);
PITERATE(TIntList, mi, bestMapping) {
string str;
attribute->val2str(ev, str);
if (*mi==1) {
s1 += string(ns1 ? ", " : "") + str;
ns1++;
}
else if (*mi==0) {
s0 += string(ns0 ? ", " : "") + str;
ns0++;
}
attribute->nextValue(ev);
}
descriptions = mlnew TStringList();
descriptions->push_back(ns0>1 ? "in ["+s0+"]" : s0);
descriptions->push_back(ns1>1 ? "in ["+s1+"]" : s1);
bvar->name = gen->domain->attributes->at(bestAttr)->name;
spentAttribute = (ns0==1) && (ns1==1) ? bestAttr : -1;
return mlnew TClassifierFromVarFD(bvar, gen->domain, bestAttr, subsetSizes, mlnew TMapIntValue(bestMapping));
}
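/* A sketch of what the returned classifier does, inferred from the call above
rather than spelled out in this file: TClassifierFromVarFD reads the value of
the attribute at index bestAttr from an example and, in the multi-valued case,
passes it through TMapIntValue(bestMapping), so values mapped to 0 are routed
to the left branch of the node and values mapped to 1 to the right branch. */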
TTreeSplitConstructor_Threshold::TTreeSplitConstructor_Threshold(PMeasureAttribute meas, const float &worst, const float &aml)
: TTreeSplitConstructor_Measure(meas, worst, aml)
{}
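// TTreeSplitConstructor_Threshold chooses a continuous attribute together with a
// cut-off value: for every continuous candidate it asks the measure for its best
// threshold and keeps the attribute with the highest quality, again breaking
// ties at random.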
PClassifier TTreeSplitConstructor_Threshold::operator()(
PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
PExampleGenerator gen, const int &weightID ,
PDomainContingency dcont, PDistribution apriorClass,
const vector<bool> &candidates,
PClassifier
)
{
checkProperty(measure);
measure->checkClassTypeExc(gen->domain->classVar->varType);
bool cse = candidates.size()==0;
bool haveCandidates = false;
vector<bool> myCandidates;
myCandidates.reserve(gen->domain->attributes->size());
vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
TVarList::const_iterator vi, ve(gen->domain->attributes->end());
for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
// an attribute is a candidate only if it is continuous and either no candidate
// list was supplied (cse) or its flag in the list is set
bool co = ((*vi)->varType == TValue::FLOATVAR) && (cse || ((ci != ce) && *ci));
if (!cse && (ci != ce))
ci++;
myCandidates.push_back(co);
haveCandidates = haveCandidates || co;
}
if (!haveCandidates)
return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
int N = gen ? gen->numberOfExamples() : -1;
if (N < 0)
N = dcont->classes->cases;
TSimpleRandomGenerator rgen(N);
PDistribution thisSubsets;
float thisQuality, bestThreshold;
ci = myCandidates.begin();
int wins = 0, thisAttr = 0, bestAttr;
TDomainContingency::iterator dci, dce;
bool throughCont = (dcont && !dcont->classIsOuter && (measure->needs <= measure->DomainContingency));
if (throughCont) {
dci = dcont->begin();
dce = dcont->end();
}
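// throughCont: if a domain contingency with the class as the inner variable is
// available and the measure can be computed from contingencies alone, thresholds
// are evaluated on the precomputed contingency (*dci); otherwise bestThreshold
// scans the example generator directly for each attribute.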
for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
if (*ci) {
thisSubsets = NULL;
const float thisThreshold =
throughCont ? measure->bestThreshold(thisSubsets, thisQuality, *dci, dcont->classes, apriorClass, minSubset)
: measure->bestThreshold(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);