📄 measures.cpp
字号:
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
*/
// to include Python.h before STL defines a template set (doesn't work with VC 6.0)
#include "garbage.hpp"
#include <math.h>
#include <set>
#include "stladdon.hpp"
#include "student.hpp"
#include "random.hpp"
#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "table.hpp"
#include "distance.hpp"
#include "contingency.hpp"
#include "classify.hpp"
#include "symmatrix.hpp"
#include "cost.hpp"
#include <vector>
#include "relief.ppp"
#include "measures.ppp"
/* Checks that a contingency is usable by a measure that needs a discrete
   attribute and a discrete outcome. Any violation is reported through
   raiseErrorWho, with `measure` passed as the reporting party. */
void checkDiscrete(const PContingency &cont, char *measure)
{
  // The attribute (outer variable) must be discrete; name it in the message when it is known.
  if (cont->varType != TValue::INTVAR) {
    if (!cont->outerVariable)
      raiseErrorWho(measure, "cannot evaluate continuous attributes");
    else
      raiseErrorWho(measure, "cannot evaluate the non-discrete attribute '%s'", cont->outerVariable->name.c_str());
  }

  // The outcome (inner variable) must be discrete as well. Without an inner
  // variable, the type of the inner distribution is inspected instead.
  if (!cont->innerVariable) {
    if (!cont->innerDistribution.is_derived_from(TDiscDistribution))
      raiseErrorWho(measure, "expects discrete class attribute");
  }
  else if (cont->innerVariable->varType != TValue::INTVAR)
    raiseErrorWho(measure, "cannot work with continuous outcome '%s'", cont->innerVariable->name.c_str());
}
/* Checks that a contingency is usable by a measure that needs a discrete
   attribute and a continuous outcome. Any violation is reported through
   raiseErrorWho, with `measure` passed as the reporting party. */
void checkDiscreteContinuous(const PContingency &cont, char *measure)
{
  // The attribute (outer variable) must be discrete; name it in the message when it is known.
  if (cont->varType != TValue::INTVAR) {
    if (!cont->outerVariable)
      raiseErrorWho(measure, "cannot evaluate continuous attributes");
    else
      raiseErrorWho(measure, "cannot evaluate the non-discrete attribute '%s'", cont->outerVariable->name.c_str());
  }

  // The outcome (inner variable) must be continuous. Without an inner
  // variable, the type of the inner distribution is inspected instead.
  if (!cont->innerVariable) {
    if (!cont->innerDistribution.is_derived_from(TContDistribution))
      raiseErrorWho(measure, "expects continuous outcome");
  }
  else if (cont->innerVariable->varType != TValue::FLOATVAR)
    raiseErrorWho(measure, "cannot work with discrete outcome '%s'", cont->innerVariable->name.c_str());
}
/* Prepares the common stuff for binarization through attribute quality assessment:
- a new two-valued enum attribute (values "0" and "1"), handed back through bvar
- a contingency matrix (TContingencyAttrClass) for this attribute versus the class
variable of classDistribution; this is the return value
- dis0 and dis1 (or con0 and con1, if the class is continuous) that point to the
distributions stored in that contingency for the left and the right branch;
the pair that does not match the class type is set to NULL

The inner distribution of the new contingency is set to classDistribution, and the
cases, abs and normalized fields of its outer distribution are copied from
origContingency. The caller is expected to fill in the per-branch distributions.
*/
PContingency prepareBinaryCheat(PDistribution classDistribution, PContingency origContingency,
PVariable &bvar,
TDiscDistribution *&dis0, TDiscDistribution *&dis1,
TContDistribution *&con0, TContDistribution *&con1)
{
// The binary attribute; bvar takes the wrapped pointer, ebvar is kept only to add the values.
TEnumVariable *ebvar = mlnew TEnumVariable("");
bvar = ebvar;
ebvar->addValue("0");
ebvar->addValue("1");
/* An ugly cheat that is prone to cause problems when Contingency class is changed.
It is fast, though :) */
TContingencyClass *cont = mlnew TContingencyAttrClass(bvar, classDistribution->variable);
cont->innerDistribution = classDistribution;
// NOTE(review): presumably touching index 1 makes the contingency allocate the
// distributions for both values, so that front() and back() below are valid --
// confirm against TContingency::operator[].
cont->operator[](1);
// Copy the summary fields of the original attribute's distribution.
TDiscDistribution *outerDistribution = cont->outerDistribution.AS(TDiscDistribution);
outerDistribution->cases = origContingency->outerDistribution->cases;
outerDistribution->abs = origContingency->outerDistribution->abs;
outerDistribution->normalized = origContingency->outerDistribution->normalized;
// Hand out raw pointers to the two per-value distributions, typed by the class variable.
if (classDistribution->variable->varType == TValue::INTVAR) {
dis0 = cont->discrete->front().AS(TDiscDistribution);
dis1 = cont->discrete->back().AS(TDiscDistribution);
con0 = con1 = NULL;
}
else {
con0 = cont->discrete->front().AS(TContDistribution);
con1 = cont->discrete->back().AS(TContDistribution);
dis0 = dis1 = NULL;
}
// The raw pointer is converted to PContingency on return.
return cont;
}
/* Stores the description of a measure:
   aneeds - kept in `needs`; compared against Contingency_Class and DomainContingency
            by the evaluation methods to decide what data the measure requires
   hd, hc - kept in handlesDiscrete and handlesContinuous
   ts     - kept in computesThresholds */
TMeasureAttribute::TMeasureAttribute(const int aneeds, const bool hd, const bool hc, const bool ts)
  : needs(aneeds), handlesDiscrete(hd), handlesContinuous(hc), computesThresholds(ts)
{
}
/* Evaluation from a single contingency, the class distribution and the apriori
   class distribution. This implementation only raises an error. */
float TMeasureAttribute::operator()(PContingency, PDistribution, PDistribution)
{
  raiseError("cannot evaluate attribute from contingencies only");

  // Keeps the function well-formed; presumably never reached because raiseError does not return.
  return 0.0f;
}
/* Evaluates the attribute with index attrNo from a domain contingency.

   The attribute's contingency and the class distribution stored in
   domainContingency are passed to the contingency-based operator(). If
   apriorClass is not given, domainContingency->classes is used in its place.

   Raises an error if the measure needs more than a contingency with the class
   (needs > Contingency_Class) or if attrNo is not a valid index into
   domainContingency. */
float TMeasureAttribute::operator()(int attrNo, PDomainContingency domainContingency, PDistribution apriorClass)
{
  if (needs > Contingency_Class)
    raiseError("cannot evaluate attribute from domain contingency only");

  // Valid indices are 0 .. size()-1. The former test (attrNo > size) let
  // attrNo == size() and negative values through to operator[].
  if ((attrNo < 0) || (attrNo >= int(domainContingency->size())))
    raiseError("attribute index out of range");

  return operator()(domainContingency->operator[](attrNo), domainContingency->classes, apriorClass ? apriorClass : domainContingency->classes);
}
/* Evaluates the attribute with index attrNo on the examples from gen.

   Depending on `needs`, the call is dispatched to:
   - the variable-based operator() when the measure needs more than a domain contingency,
   - the contingency-based operator() when needs == Contingency_Class; the class
     distribution is the contingency's inner distribution plus the distribution
     of the class for examples with unknown attribute value,
   - the domain-contingency-based operator() otherwise.
   If apriorClass is not given, the computed class distribution is used in its place.

   Raises an error for class-less domains and for attribute indices beyond the
   domain's attributes. */
float TMeasureAttribute::operator()(int attrNo, PExampleGenerator gen, PDistribution apriorClass, int weightID)
{
  // Checked before gen is dereferenced for the first time.
  _ASSERT(gen && gen->domain);

  const int nAttributes = int(gen->domain->attributes->size());

  if (needs > DomainContingency) {
    // The index is used directly on the attribute list, so both bounds are verified first.
    if ((attrNo < 0) || (attrNo >= nAttributes))
      raiseError("attribute index out of range");
    return operator()(gen->domain->attributes->at(attrNo), gen, apriorClass, weightID);
  }

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  // attrNo == nAttributes is already out of range (the former test used '>').
  // Negative indices are passed on unchanged, as before; they are left to the
  // contingency code below to accept or reject.
  if (attrNo >= nAttributes)
    raiseError("attribute index out of range");

  if (needs == Contingency_Class) {
    TContingencyAttrClass contingency(gen, attrNo, weightID);

    PDistribution classDistribution = CLONE(TDistribution, contingency.innerDistribution);
    classDistribution->operator+= (contingency.innerDistributionUnknown);

    return operator()(PContingency(contingency), classDistribution, apriorClass ? apriorClass : classDistribution);
  }

  TDomainContingency domcont(gen, weightID);
  return operator()(attrNo, PDomainContingency(domcont), apriorClass ? apriorClass : domcont.classes);
}
/* Evaluates the attribute `var` on the examples from gen.

   If var is found in the generator's domain (getVarNum does not return
   ILLEGAL_INT), the call is forwarded to the index-based operator(). Otherwise
   a contingency of var versus the class is built from the examples and the
   contingency-based operator() is called; this requires needs <= Contingency_Class.
   If apriorClass is not given, the computed class distribution is used in its place. */
float TMeasureAttribute::operator ()(PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID)
{
  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  if (needs > DomainContingency)
    raiseError("invalid 'needs'");

  const int position = gen->domain->getVarNum(var, false);
  if (position != ILLEGAL_INT)
    return operator()(position, gen, apriorClass, weightID);

  // The variable is not in the domain: only a single contingency can be computed for it.
  if (needs > Contingency_Class)
    raiseError("invalid 'needs'");

  TContingencyAttrClass varContingency(gen, var, weightID);

  // Class distribution = distribution over known values + distribution for unknown values.
  PDistribution classDist = CLONE(TDistribution, varContingency.innerDistribution);
  classDist->operator+= (varContingency.innerDistributionUnknown);

  PDistribution prior = apriorClass ? apriorClass : classDist;
  return operator()(PContingency(varContingency), PDistribution(classDist), prior);
}
/* Dispatches on the dynamic type of the distribution: discrete distributions go
   to operator()(const TDiscDistribution &), continuous ones to
   operator()(const TContDistribution &). Any other distribution raises an error. */
float TMeasureAttribute::operator ()(PDistribution dist) const
{
  if (TDiscDistribution *asDiscrete = dist.AS(TDiscDistribution))
    return operator()(*asDiscrete);

  if (TContDistribution *asContinuous = dist.AS(TContDistribution))
    return operator()(*asContinuous);

  raiseError("invalid distribution");
  return 0.0f;
}
/* Evaluation of a discrete distribution. This implementation only raises an error. */
float TMeasureAttribute::operator ()(const TDiscDistribution &) const
{
  raiseError("cannot evaluate discrete attributes");

  // Keeps the function well-formed; presumably never reached because raiseError does not return.
  return 0.0f;
}
/* Evaluation of a continuous distribution. This implementation only raises an error. */
float TMeasureAttribute::operator ()(const TContDistribution &) const
{
  raiseError("cannot evaluate continuous attributes");

  // Keeps the function well-formed; presumably never reached because raiseError does not return.
  return 0.0f;
}
/* Computes the threshold function of `var` from examples.

   Builds the contingency of var versus the class from gen and forwards to the
   contingency-based thresholdFunction, which fills res. The class distribution
   is the contingency's inner distribution plus the class distribution of
   examples with unknown attribute value; it also stands in for apriorClass
   when that is not given.

   Raises an error if the measure does not compute thresholds, needs more than
   a contingency with the class, or the domain has no class. */
void TMeasureAttribute::thresholdFunction(TFloatFloatList &res, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID)
{
  const bool canCompute = computesThresholds && (needs <= Contingency_Class);
  if (!canCompute)
    raiseError("cannot compute thresholds");

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  TContingencyAttrClass varContingency(gen, var, weightID);

  PDistribution classDist = CLONE(TDistribution, varContingency.innerDistribution);
  classDist->operator+= (varContingency.innerDistributionUnknown);

  PDistribution prior = apriorClass ? apriorClass : classDist;
  thresholdFunction(res, PContingency(varContingency), classDist, prior);
}
/* Finds the best threshold for `var` from examples.

   Builds the contingency of var versus the class from gen and forwards to the
   contingency-based bestThreshold, whose result is returned; left_right and
   score are passed through to it. The class distribution is the contingency's
   inner distribution plus the class distribution of examples with unknown
   attribute value; it also stands in for apriorClass when that is not given.

   Raises an error if the measure needs more than a contingency with the class
   or the domain has no class. */
float TMeasureAttribute::bestThreshold(PDistribution &left_right, float &score, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset)
{
  if (needs > Contingency_Class)
    raiseError("cannot compute thresholds");

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  TContingencyAttrClass varContingency(gen, var, weightID);

  PDistribution classDist = CLONE(TDistribution, varContingency.innerDistribution);
  classDist->operator+= (varContingency.innerDistributionUnknown);

  PDistribution prior = apriorClass ? apriorClass : classDist;
  return bestThreshold(left_right, score, PContingency(varContingency), classDist, prior, minSubset);
}
/* Walks over the values of a continuous attribute (the keys of
origContingency->continuous, in the map's iteration order) and evaluates the
binary split induced by each of them.

A two-valued "cheat" contingency is prepared with prepareBinaryCheat. For every
value, the class distribution stored under it is added to the left distribution
and subtracted from the right one. If recorder.acceptable(value, left, right)
returns true, the outer distribution of the cheat contingency is updated with
the two weights and recorder.record(value, score, left, right) is called, where
score is what measure->call returns for the cheat contingency.

TRecorder has to provide acceptable(threshold, left, right) and
record(threshold, score, left, right).

bvar receives the binary variable created by prepareBinaryCheat.
Returns false, without touching the recorder or bvar, when the contingency holds
fewer than two values; true otherwise. Raises an error if the measure needs more
than Contingency_Class or if the outer variable is not continuous.
*/
template<class TRecorder>
bool traverseThresholds(TMeasureAttribute *measure, TRecorder &recorder, PVariable &bvar, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass)
{
if (measure->needs > measure->Contingency_Class)
raiseError("cannot compute thresholds from contingencies");
PVariable var = origContingency->outerVariable;
if (var->varType != TValue::FLOATVAR)
raiseError("cannot search for thresholds of a non-continuous variable");
// With fewer than two distinct values there is nothing to split.
if (origContingency->continuous->size() < 2)
return false;
// Exactly one of the pairs (dis0, dis1) / (con0, con1) is set by prepareBinaryCheat;
// the other pair is NULL.
TDiscDistribution *dis0, *dis1;
TContDistribution *con0, *con1;
PContingency cont = prepareBinaryCheat(classDistribution, origContingency, bvar, dis0, dis1, con0, con1);
TDiscDistribution *outerDistribution = cont->outerDistribution.AS(TDiscDistribution);
const TDistributionMap &distr = *(origContingency->continuous);
TMeasureAttributeFromProbabilities *mp = dynamic_cast<TMeasureAttributeFromProbabilities *>(measure);
// NOTE(review): prepareBinaryCheat assigns classDistribution to cont->innerDistribution,
// so this looks like a self-assignment. Possibly origContingency->innerDistribution
// (the distribution without unknown values) was meant -- confirm before changing.
if (mp && (mp->unknownsTreatment == mp->IgnoreUnknowns))
classDistribution = cont->innerDistribution;
if (dis0) { // class is discrete
// Start with everything on the right side and nothing on the left.
*dis0 = TDiscDistribution();
*dis1 = CAST_TO_DISCDISTRIBUTION(origContingency->innerDistribution);
// left and right are references, so they follow the changes of dis0 and dis1 below.
const float &left = dis0->abs, &right = dis1->abs;
const_ITERATE(TDistributionMap, threshi, distr) {
// Move the examples with this attribute value from the right to the left side.
*dis0 += threshi->second;
*dis1 -= threshi->second;
if (!recorder.acceptable(threshi->first, left, right))
continue;
outerDistribution->distribution[0] = left;
outerDistribution->distribution[1] = right;
recorder.record(threshi->first, measure->call(cont, classDistribution, apriorClass), left, right);
}
}
else { // class is continuous
// Start with everything on the right side and nothing on the left.
*con0 = TContDistribution();
*con1 = CAST_TO_CONTDISTRIBUTION(origContingency->innerDistribution);
// left and right are references, so they follow the changes of con0 and con1 below.
const float &left = con0->abs, &right = con1->abs;
const_ITERATE(TDistributionMap, threshi, distr) {
// Move the examples with this attribute value from the right to the left side.
*con0 += threshi->second;
*con1 -= threshi->second;
if (!recorder.acceptable(threshi->first, left, right))
continue;
cont->outerDistribution->setint(0, left);
cont->outerDistribution->setint(1, right);
recorder.record(threshi->first, measure->call(cont, classDistribution, apriorClass), left, right);
}
}
return true;
}
/* Recorder for traverseThresholds that stores every evaluated threshold and its
   score in a list. Each split is accepted. When a new threshold is recorded,
   the threshold of the previously stored entry is replaced by the midpoint
   between it and the new threshold. */
class TRecordThresholds {
public:
  // The list being filled; owned by the caller.
  TFloatFloatList &res;

  TRecordThresholds(TFloatFloatList &ares)
    : res(ares)
  {
  }

  // Every candidate split is accepted, whatever the subset sizes.
  inline bool acceptable(const float &, const float &, const float &)
  {
    return true;
  }

  // Appends (threshold, score); the subset sizes are not used.
  inline void record(const float &threshold, const float &score, const float &, const float &)
  {
    if (res.size() != 0) {
      // Shift the previous entry halfway towards the new value.
      res.back().first = (res.back().first + threshold) / 2.0;
    }

    res.push_back(make_pair(threshold, score));
  }
};
/* Computes the threshold function of a continuous attribute from its contingency.

   On return, res holds one (threshold, score) pair for each candidate split of
   origContingency, with thresholds placed halfway between consecutive attribute
   values (see TRecordThresholds::record). res is left empty when the
   contingency does not hold enough values for a split.

   Any previous content of res is discarded. */
void TMeasureAttribute::thresholdFunction(TFloatFloatList &res, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass)
{
  // The recorder averages the threshold of the last stored entry with each new
  // one, so stale entries must not be present when the traversal starts.
  res.clear();

  PVariable bvar;
  TRecordThresholds recorder(res);
  if (!traverseThresholds(this, recorder, bvar, origContingency, classDistribution, apriorClass)) {
    // Nothing to report. Returning here keeps the erase below away from an
    // empty list; erasing end()-1 of an empty container is undefined behaviour.
    res.clear();
    return;
  }

  // The entry recorded last corresponds to the largest attribute value: it has
  // no successor to be averaged with, so it is dropped.
  if (res.size())
    res.erase(res.end()-1);
}
/* Recorder for traverseThresholds that remembers the best-scoring threshold.

   A split is acceptable when both subsets weigh at least minSubset. Among
   acceptable splits the one with the highest score is kept; ties are resolved
   by rgen.randbool(number of equally good candidates seen so far).

   The stored threshold is first the attribute value itself; on the next call of
   acceptable() it is moved halfway towards the following attribute value.

   `wins` is 0 until the first split is recorded; bestThreshold, bestScore,
   bestLeft and bestRight are meaningful only when wins is non-zero. */
class TRecordMaximalThreshold {
public:
  float minSubset;
  int wins;
  float bestThreshold, bestScore, bestLeft, bestRight;
  // true when bestThreshold still has to be averaged with the next attribute value
  bool fixLast;
  TRandomGenerator &rgen;

  // All members are initialized. fixLast in particular is read by acceptable()
  // before record() ever assigns it, so it must not be left indeterminate.
  TRecordMaximalThreshold(TRandomGenerator &rg, const float &minSub = -1)
  : minSubset(minSub),
    wins(0),
    bestThreshold(0.0f),
    bestScore(0.0f),
    bestLeft(0.0f),
    bestRight(0.0f),
    fixLast(false),
    rgen(rg)
  {}

  inline bool acceptable(const float &threshold, const float &left, const float &right)
  {
    // Called for every attribute value, so this is where the threshold chosen
    // at the previous value gets moved to the midpoint.
    if (fixLast) {
      bestThreshold = (bestThreshold + threshold) / 2.0;
      fixLast = false;
    }

    return (left >= minSubset) && (right >= minSubset);
  }

  void record(const float &threshold, const float &score, const float &left, const float &right)
  {
    bool takeIt;
    if (!wins || (score > bestScore)) {
      // The first candidate or a strictly better one: restart the tie counter.
      wins = 1;
      takeIt = true;
    }
    else
      // An equally good candidate replaces the current one if randbool says so;
      // the counter is increased only for ties.
      takeIt = (score == bestScore) && rgen.randbool(++wins);

    if (takeIt) {
      bestThreshold = threshold;
      fixLast = true;
      bestScore = score;
      bestLeft = left;
      bestRight = right;
    }
  }
};
/* Finds the best threshold of a continuous attribute from its contingency.

   Returns the threshold, stores its score in `score` and a new two-valued
   discrete distribution with the weights of the two subsets in `subsetSizes`.
   Returns ILLEGAL_FLOAT, leaving score and subsetSizes untouched, when
   traverseThresholds reports that there is nothing to split or when no split
   was recorded.

   The random generator used for breaking ties is seeded with classDistribution->abs. */
float TMeasureAttribute::bestThreshold(PDistribution &subsetSizes, float &score, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass, const float &minSubset)
{
  PVariable binaryVar;
  TRandomGenerator tieBreaker(classDistribution->abs);
  TRecordMaximalThreshold best(tieBreaker, minSubset);

  const bool traversed = traverseThresholds(this, best, binaryVar, origContingency, classDistribution, apriorClass);
  if (!traversed || !best.wins)
    return ILLEGAL_FLOAT;

  // Report the sizes of the two subsets as a distribution over the binary variable.
  subsetSizes = mlnew TDiscDistribution(binaryVar);
  subsetSizes->addint(0, best.bestLeft);
  subsetSizes->addint(1, best.bestRight);

  score = best.bestScore;
  return best.bestThreshold;
}
/* Placeholder for the search for the best binarization of a discrete attribute
   from its contingency.

   Returns NULL when the attribute's contingency holds fewer than two values.
   Otherwise raises an error, since the search is not implemented. Also raises
   an error if the measure needs more than a contingency with the class or if
   the attribute is not discrete. The remaining arguments are unused. */
PIntList TMeasureAttribute::bestBinarization(PDistribution &, float &score, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass, const float &minSubset)
{
  if (needs > Contingency_Class)
    raiseError("cannot compute thresholds from contingencies");

  // Binarization applies to discrete attributes; the former message claimed
  // the opposite of what is tested here.
  PVariable var = origContingency->outerVariable;
  if (var->varType != TValue::INTVAR)
    raiseError("cannot search for binarizations of a non-discrete variable");

  // For a discrete attribute the distributions are kept in `discrete` (as used
  // by prepareBinaryCheat); `continuous` belongs to continuous attributes.
  if (origContingency->discrete->size() < 2)
    return NULL;

  raiseError("this has not been implemented yet");
  return NULL;
}
/* Finds the best binarization of `var` from examples.

   Builds the contingency of var versus the class from gen and forwards to the
   contingency-based bestBinarization, whose result is returned; subsets and
   score are passed through to it. The class distribution is the contingency's
   inner distribution plus the class distribution of examples with unknown
   attribute value; it also stands in for apriorClass when that is not given.

   Raises an error if the measure does not compute thresholds, needs more than
   a contingency with the class, or the domain has no class. */
PIntList TMeasureAttribute::bestBinarization(PDistribution &subsets, float &score, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset)
{
  const bool canCompute = computesThresholds && (needs <= Contingency_Class);
  if (!canCompute)
    raiseError("cannot compute binarization");

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  TContingencyAttrClass varContingency(gen, var, weightID);

  PDistribution classDist = CLONE(TDistribution, varContingency.innerDistribution);
  classDist->operator+= (varContingency.innerDistributionUnknown);

  PDistribution prior = apriorClass ? apriorClass : classDist;
  return bestBinarization(subsets, score, PContingency(varContingency), classDist, prior, minSubset);
}
/* Tells whether the measure handles the given variable type: handlesDiscrete
   decides for TValue::INTVAR, handlesContinuous for TValue::FLOATVAR. Any other
   type is not handled. */
bool TMeasureAttribute::checkClassType(const int &varType)
{
  if (varType == TValue::INTVAR)
    return handlesDiscrete;

  if (varType == TValue::FLOATVAR)
    return handlesContinuous;

  return false;
}
void TMeasureAttribute::checkClassTypeExc(const int &varType)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -