⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 measures.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 4 页
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


// to include Python.h before STL defines a template set (doesn't work with VC 6.0)
#include "garbage.hpp" 

#include <math.h>
#include <set>

#include "stladdon.hpp"
#include "student.hpp"
#include "random.hpp"

#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "table.hpp"
#include "distance.hpp"
#include "contingency.hpp"
#include "classify.hpp"
#include "symmatrix.hpp"

#include "cost.hpp"
#include <vector>

#include "relief.ppp"
#include "measures.ppp"


/* Verifies that `cont` pairs a discrete attribute with a discrete outcome.
   Every violation is reported through raiseErrorWho, with `measure` given
   as the name of the complaining measure. */
void checkDiscrete(const PContingency &cont, char *measure)
{
  // The attribute (outer variable) has to be discrete; its name is included
  // in the message when the contingency knows the variable.
  if (cont->varType != TValue::INTVAR) {
    if (cont->outerVariable)
      raiseErrorWho(measure, "cannot evaluate the non-discrete attribute '%s'", cont->outerVariable->name.c_str());
    else
      raiseErrorWho(measure, "cannot evaluate continuous attributes");
  }

  // The outcome is judged by the inner variable when there is one, and by
  // the type of the inner distribution otherwise.
  if (!cont->innerVariable) {
    if (!cont->innerDistribution.is_derived_from(TDiscDistribution))
      raiseErrorWho(measure, "expects discrete class attribute");
  }
  else {
    if (cont->innerVariable->varType != TValue::INTVAR)
      raiseErrorWho(measure, "cannot work with continuous outcome '%s'", cont->innerVariable->name.c_str());
  }
}


/* Verifies that `cont` pairs a discrete attribute with a continuous outcome.
   Every violation is reported through raiseErrorWho, with `measure` given
   as the name of the complaining measure. */
void checkDiscreteContinuous(const PContingency &cont, char *measure)
{
  // The attribute (outer variable) has to be discrete; its name is included
  // in the message when the contingency knows the variable.
  if (cont->varType != TValue::INTVAR) {
    if (cont->outerVariable)
      raiseErrorWho(measure, "cannot evaluate the non-discrete attribute '%s'", cont->outerVariable->name.c_str());
    else
      raiseErrorWho(measure, "cannot evaluate continuous attributes");
  }

  // The outcome has to be continuous: judged by the inner variable when there
  // is one, and by the type of the inner distribution otherwise.
  if (!cont->innerVariable) {
    if (!cont->innerDistribution.is_derived_from(TContDistribution))
      raiseErrorWho(measure, "expects continuous outcome");
  }
  else {
    if (cont->innerVariable->varType != TValue::FLOATVAR)
      raiseErrorWho(measure, "cannot work with discrete outcome '%s'", cont->innerVariable->name.c_str());
  }
}


/* Builds the scaffolding shared by the binarization searches:
   - bvar receives a fresh, unnamed two-valued ("0"/"1") enum variable;
   - the returned contingency relates bvar to the class variable and uses
     classDistribution itself as its inner distribution;
   - the outer distribution of the new contingency takes over cases, abs and
     normalized from the outer distribution of origContingency;
   - dis0/dis1 (discrete class) or con0/con1 (continuous class) are pointed at
     the first and the last per-value distribution of the new contingency,
     i.e. the left and the right branch; the unused pair is set to NULL.
*/
PContingency prepareBinaryCheat(PDistribution classDistribution, PContingency origContingency,
                                PVariable &bvar, 
                                TDiscDistribution *&dis0, TDiscDistribution *&dis1,
                                TContDistribution *&con0, TContDistribution *&con1)
{
  // bvar wraps the new variable before the values are added to it.
  TEnumVariable *binaryVar = mlnew TEnumVariable("");
  bvar = binaryVar;
  binaryVar->addValue("0");
  binaryVar->addValue("1");

  /* A shortcut that depends on the internals of the Contingency class and is
     therefore fragile, but fast. Touching element 1 presumably makes the
     contingency allocate the distributions for both values -- TODO confirm. */
  TContingencyClass *binaryCont = mlnew TContingencyAttrClass(bvar, classDistribution->variable);
  binaryCont->innerDistribution = classDistribution;
  binaryCont->operator[](1);

  // Copy the summary fields of the original attribute distribution.
  TDiscDistribution *branchSizes = binaryCont->outerDistribution.AS(TDiscDistribution);
  branchSizes->cases = origContingency->outerDistribution->cases;
  branchSizes->abs = origContingency->outerDistribution->abs;
  branchSizes->normalized = origContingency->outerDistribution->normalized;

  const bool discreteClass = (classDistribution->variable->varType == TValue::INTVAR);
  if (!discreteClass) {
    con0 = binaryCont->discrete->front().AS(TContDistribution);
    con1 = binaryCont->discrete->back().AS(TContDistribution);
    dis0 = dis1 = NULL;
  }
  else {
    dis0 = binaryCont->discrete->front().AS(TDiscDistribution);
    dis1 = binaryCont->discrete->back().AS(TDiscDistribution);
    con0 = con1 = NULL;
  }

  return binaryCont;
}



/* Records the capabilities of a measure:
   aneeds - stored in `needs`; elsewhere in this file it is compared with
            Contingency_Class and DomainContingency to decide which data the
            measure can be evaluated from
   hd     - the measure handles discrete classes (see checkClassType)
   hc     - the measure handles continuous classes (see checkClassType)
   ts     - stored in `computesThresholds`
*/
TMeasureAttribute::TMeasureAttribute(const int aneeds, const bool hd, const bool hc, const bool ts)
  : needs(aneeds), handlesDiscrete(hd), handlesContinuous(hc), computesThresholds(ts)
{
}


/* Evaluation from an attribute contingency and class distributions. This
   implementation only reports that the measure cannot work from contingencies
   (presumably measures that can do so provide their own version). */
float TMeasureAttribute::operator()(PContingency, PDistribution, PDistribution)
{
  raiseError("cannot evaluate attribute from contingencies only");

  // Only reached if raiseError returns.
  return 0.0f;
}


/* Evaluates the attribute with index attrNo from a precomputed domain
   contingency.

   attrNo            - index of the attribute's contingency within domainContingency
   domainContingency - contingencies of the attributes, plus the class distribution
   apriorClass       - apriori class distribution; when empty, the class
                       distribution of domainContingency is used instead

   Raises an error if the measure needs more than contingencies, or if attrNo
   does not index an element of domainContingency.
*/
float TMeasureAttribute::operator()(int attrNo, PDomainContingency domainContingency, PDistribution apriorClass)
{
  if (needs > Contingency_Class)
    raiseError("cannot evaluate attribute from domain contingency only");

  // Valid indices are 0 .. size()-1. The former test (attrNo > size) let
  // attrNo == size() and negative values through to the unchecked indexing
  // below.
  if ((attrNo < 0) || (attrNo >= int(domainContingency->size())))
    raiseError("attribute index out of range");

  return operator()(domainContingency->operator[](attrNo), domainContingency->classes, apriorClass ? apriorClass : domainContingency->classes);
}


/* Evaluates the attribute with index attrNo on the examples from gen.

   attrNo      - index of the attribute in gen->domain->attributes
   gen         - example generator providing the data
   apriorClass - apriori class distribution; when empty, the class distribution
                 computed from the data is used instead
   weightID    - id of the weight, handed on to the contingency constructors

   Depending on `needs`, the call is forwarded to the variable-based overload,
   evaluated from a single attribute-class contingency, or evaluated from the
   contingencies of the whole domain.
*/
float TMeasureAttribute::operator()(int attrNo, PExampleGenerator gen, PDistribution apriorClass, int weightID)
{
  // Checked before the first dereference of gen (it used to follow it).
  _ASSERT(gen && gen->domain);

  // The measure needs the examples themselves: let the overload that takes
  // the variable decide what to do.
  if (needs > DomainContingency)
    return operator()(gen->domain->attributes->at(attrNo), gen, apriorClass, weightID);

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  // Attributes are indexed 0 .. size()-1; the former test (attrNo > size)
  // accepted attrNo == size(). Negative values are not rejected here, exactly
  // as before.
  if (attrNo >= int(gen->domain->attributes->size()))
    raiseError("attribute index out of range");

  if (needs == Contingency_Class) {
    TContingencyAttrClass contingency(gen, attrNo, weightID);

    // Class distribution over all examples: those with a known attribute
    // value plus those for which it is unknown.
    PDistribution classDistribution = CLONE(TDistribution, contingency.innerDistribution);
    classDistribution->operator+= (contingency.innerDistributionUnknown);

    return operator()(PContingency(contingency), classDistribution, apriorClass ? apriorClass : classDistribution);
  }

  TDomainContingency domcont(gen, weightID);
  return operator()(attrNo, PDomainContingency(domcont), apriorClass ? apriorClass : domcont.classes);
}


/* Evaluates the variable var on the examples from gen.

   A variable that getVarNum finds in the domain is evaluated through the
   index-based overload. Any other variable is evaluated from an
   attribute-class contingency built for it, which is possible only for
   measures that need no more than Contingency_Class.

   apriorClass - apriori class distribution; when empty, the class distribution
                 computed from the data is used instead
   weightID    - id of the weight, handed on to the contingency constructor
*/
float TMeasureAttribute::operator ()(PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID)
{
  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  if (needs > DomainContingency)
    raiseError("invalid 'needs'");

  const int position = gen->domain->getVarNum(var, false);
  const bool foundInDomain = (position != ILLEGAL_INT);
  if (foundInDomain)
    return operator()(position, gen, apriorClass, weightID);

  // The variable is not in the domain, so there is no domain contingency to
  // take it from.
  if (needs > Contingency_Class)
    raiseError("invalid 'needs'");

  TContingencyAttrClass varContingency(gen, var, weightID);

  // Class distribution over all examples: those with a known value of var
  // plus those for which it is unknown.
  PDistribution classes = CLONE(TDistribution, varContingency.innerDistribution);
  classes->operator+= (varContingency.innerDistributionUnknown);

  if (!apriorClass)
    apriorClass = classes;

  return operator()(PContingency(varContingency), PDistribution(classes), apriorClass);
}


/* Dispatches on the actual type of dist: discrete and continuous
   distributions are passed to the corresponding overload; anything else is
   reported as an invalid distribution. */
float TMeasureAttribute::operator ()(PDistribution dist) const
{
  if (TDiscDistribution *asDiscrete = dist.AS(TDiscDistribution))
    return operator()(*asDiscrete);

  if (TContDistribution *asContinuous = dist.AS(TContDistribution))
    return operator()(*asContinuous);

  raiseError("invalid distribution");

  // Only reached if raiseError returns.
  return 0.0f;
}

/* Evaluation of a discrete distribution; this implementation only reports
   that it is not supported. */
float TMeasureAttribute::operator ()(const TDiscDistribution &) const
{
  raiseError("cannot evaluate discrete attributes");

  // Only reached if raiseError returns.
  return 0.0f;
}

/* Evaluation of a continuous distribution; this implementation only reports
   that it is not supported. */
float TMeasureAttribute::operator ()(const TContDistribution &) const
{
  raiseError("cannot evaluate continuous attributes");

  // Only reached if raiseError returns.
  return 0.0f;
}


/* Computes the threshold function of var on the examples from gen: builds the
   attribute-class contingency for var and hands it to the contingency-based
   thresholdFunction, which fills res.

   apriorClass - apriori class distribution; when empty, the class distribution
                 computed from the data is used instead
   weightID    - id of the weight, handed on to the contingency constructor
*/
void TMeasureAttribute::thresholdFunction(TFloatFloatList &res, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID)
{
  const bool supported = computesThresholds && (needs <= Contingency_Class);
  if (!supported)
    raiseError("cannot compute thresholds");

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  TContingencyAttrClass varContingency(gen, var, weightID);

  // Class distribution over all examples: those with a known value of var
  // plus those for which it is unknown.
  PDistribution classes = CLONE(TDistribution, varContingency.innerDistribution);
  classes->operator+= (varContingency.innerDistributionUnknown);

  if (!apriorClass)
    apriorClass = classes;

  thresholdFunction(res, PContingency(varContingency), classes, apriorClass);
}


/* Finds the best threshold for var on the examples from gen: builds the
   attribute-class contingency for var and hands it to the contingency-based
   bestThreshold, whose result is returned unchanged.

   left_right  - passed on; receives the sizes of the two subsets
   score       - passed on; receives the score of the chosen threshold
   apriorClass - apriori class distribution; when empty, the class distribution
                 computed from the data is used instead
   weightID    - id of the weight, handed on to the contingency constructor
   minSubset   - passed on; minimal acceptable size of either subset
*/
float TMeasureAttribute::bestThreshold(PDistribution &left_right, float &score, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset)
{
  if (needs > Contingency_Class)
    raiseError("cannot compute thresholds");

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  TContingencyAttrClass varContingency(gen, var, weightID);

  // Class distribution over all examples: those with a known value of var
  // plus those for which it is unknown.
  PDistribution classes = CLONE(TDistribution, varContingency.innerDistribution);
  classes->operator+= (varContingency.innerDistributionUnknown);

  PDistribution prior = apriorClass ? apriorClass : classes;

  return bestThreshold(left_right, score, PContingency(varContingency), classes, prior, minSubset);
}


/* Walks through the candidate thresholds of a continuous attribute and reports
   each of them to `recorder`.

   The entries of origContingency->continuous are visited in the order of the
   map. At each entry its distribution is added to the left branch and
   subtracted from the right branch; if the recorder accepts the split, the
   binary contingency prepared by prepareBinaryCheat is scored with
   measure->call and the result is passed to recorder.record.

   TRecorder has to provide
     bool acceptable(threshold, left, right)
     void record(threshold, score, left, right)
   where left and right are the abs fields of the two branch distributions.

   bvar receives the binary variable created by prepareBinaryCheat.

   Returns false, without calling the recorder, when the contingency has fewer
   than two entries; true otherwise. Raises an error if the measure needs more
   than contingencies or if the outer variable is not continuous.
*/
template<class TRecorder>
bool traverseThresholds(TMeasureAttribute *measure, TRecorder &recorder, PVariable &bvar, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass)
{
  if (measure->needs > measure->Contingency_Class)
    raiseError("cannot compute thresholds from contingencies");

  // NOTE(review): var is dereferenced without a null check, while checkDiscrete
  // treats outerVariable as optional -- confirm it is always set here.
  PVariable var = origContingency->outerVariable;
  if (var->varType != TValue::FLOATVAR)
    raiseError("cannot search for thresholds of a non-continuous variable");

  // With fewer than two entries there is nothing to split.
  if (origContingency->continuous->size() < 2)
    return false;

  // Exactly one of the pairs (dis0, dis1) / (con0, con1) is set by
  // prepareBinaryCheat, depending on the type of the class.
  TDiscDistribution *dis0, *dis1;
  TContDistribution *con0, *con1;
  PContingency cont = prepareBinaryCheat(classDistribution, origContingency, bvar, dis0, dis1, con0, con1);
  TDiscDistribution *outerDistribution = cont->outerDistribution.AS(TDiscDistribution);
  
  const TDistributionMap &distr = *(origContingency->continuous);

  // NOTE(review): prepareBinaryCheat stores classDistribution in
  // cont->innerDistribution, so this assignment looks like a no-op -- confirm
  // whether origContingency->innerDistribution was intended.
  TMeasureAttributeFromProbabilities *mp = dynamic_cast<TMeasureAttributeFromProbabilities *>(measure);
  if (mp && (mp->unknownsTreatment == mp->IgnoreUnknowns))
    classDistribution = cont->innerDistribution;

  if (dis0) { // class is discrete
    // Start with everything in the right branch and nothing in the left one.
    *dis0 = TDiscDistribution();
    *dis1 = CAST_TO_DISCDISTRIBUTION(origContingency->innerDistribution);
    // Bound by reference to the abs fields of the two branches, so that the
    // values seen below are meant to follow the updates made in the loop.
    const float &left = dis0->abs, &right = dis1->abs;
  
    const_ITERATE(TDistributionMap, threshi, distr) {
      // Move the examples with this attribute value from the right branch to
      // the left one.
      *dis0 += threshi->second;
      *dis1 -= threshi->second;

      if (!recorder.acceptable(threshi->first, left, right))
        continue;

      // Branch sizes for the binary attribute.
      outerDistribution->distribution[0] = left;
      outerDistribution->distribution[1] = right;

      recorder.record(threshi->first, measure->call(cont, classDistribution, apriorClass), left, right);
    }
  }

  else { // class is continuous
    // Start with everything in the right branch and nothing in the left one.
    *con0 = TContDistribution();
    *con1 = CAST_TO_CONTDISTRIBUTION(origContingency->innerDistribution);
    // Bound by reference to the abs fields of the two branches, so that the
    // values seen below are meant to follow the updates made in the loop.
    const float &left = con0->abs, &right = con1->abs;

    const_ITERATE(TDistributionMap, threshi, distr) {
      // Move the examples with this attribute value from the right branch to
      // the left one.
      *con0 += threshi->second;
      *con1 -= threshi->second;

      if (!recorder.acceptable(threshi->first, left, right))
        continue;

      // Branch sizes for the binary attribute.
      cont->outerDistribution->setint(0, left);
      cont->outerDistribution->setint(1, right);
        
      recorder.record(threshi->first, measure->call(cont, classDistribution, apriorClass), left, right);
    }
  }

  return true;
}


/* Recorder for traverseThresholds that keeps every threshold: it appends
   (threshold, score) pairs to the list given at construction. */
class TRecordThresholds {
public:
  // The list that is being filled; owned by the caller.
  TFloatFloatList &res;

  TRecordThresholds(TFloatFloatList &target)
    : res(target)
  {
  }

  // Every split is acceptable.
  inline bool acceptable(const float &, const float &, const float &)
  {
    return true;
  }

  // Appends (threshold, score). Before that, the threshold of the previously
  // recorded pair is replaced by the midpoint between it and the new one.
  // left and right are not used.
  inline void record(const float &threshold, const float &score, const float &left, const float &right)
  {
    if (res.size() > 0)
      res.back().first = (res.back().first + threshold) / 2.0;

    res.push_back(make_pair(threshold, score));
  }
};


/* Fills res with (threshold, score) pairs for the continuous attribute
   described by origContingency.

   Every entry of the contingency is recorded by TRecordThresholds, which moves
   each recorded threshold to the midpoint between it and the next one. The
   last recorded pair has no successor and is removed again.

   If traverseThresholds finds nothing to traverse, res is left empty.
*/
void TMeasureAttribute::thresholdFunction(TFloatFloatList &res, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass)
{
  PVariable bvar;
  TRecordThresholds recorder(res);

  if (!traverseThresholds(this, recorder, bvar, origContingency, classDistribution, apriorClass)) {
    // Nothing was traversed. The code used to fall through to the erase below
    // and remove an element from the list it had just emptied.
    res.clear();
    return;
  }

  // Drop the last pair; guarded so that an empty list is never erased from.
  if (res.size())
    res.erase(res.end()-1);
}


/* Recorder for traverseThresholds that remembers only the best-scored
   threshold. Ties are broken randomly with rgen.

   All members are initialized by the constructor. Previously fixLast was left
   uninitialized although acceptable() reads it on its very first call, and the
   best* members held indeterminate values until the first record().
*/
class TRecordMaximalThreshold {
public:
  // Minimal size required of both the left and the right subset.
  float minSubset;

  // 0 until something is recorded; afterwards the number of candidates that
  // have shared the current best score.
  int wins;
  // The chosen threshold with its score and subset sizes; meaningful only
  // when wins is non-zero.
  float bestThreshold, bestScore, bestLeft, bestRight;
  // Set when bestThreshold still holds the raw attribute value and has to be
  // moved halfway towards the next threshold that is seen.
  bool fixLast;
  TRandomGenerator &rgen;

  TRecordMaximalThreshold(TRandomGenerator &rg, const float &minSub = -1)
  : minSubset(minSub),
    wins(0),
    bestThreshold(0.0f),
    bestScore(0.0f),
    bestLeft(0.0f),
    bestRight(0.0f),
    fixLast(false),
    rgen(rg)
  {}

  // Called for every threshold, in increasing traversal order. Besides testing
  // the subset sizes it completes a pending midpoint correction of the best
  // threshold.
  inline bool acceptable(const float &threshold, const float &left, const float &right)
  { 
    if (fixLast) {
      bestThreshold = (bestThreshold + threshold) / 2.0;
      fixLast = false;
    }
    return (left >= minSubset) && (right >= minSubset);
  }

  // Takes the candidate if it is the first one or strictly better than the
  // current best; a candidate with the same score takes over when
  // rgen.randbool(++wins) says so.
  void record(const float &threshold, const float &score, const float &left, const float &right)
  {
    bool take;
    if (!wins || (score > bestScore)) {
      wins = 1;
      take = true;
    }
    else
      take = (score == bestScore) && rgen.randbool(++wins);

    if (take) {
      bestThreshold = threshold;
      fixLast = true;
      bestScore = score;
      bestLeft = left;
      bestRight = right;
    }
  }
};


/* Finds the best-scored threshold for the continuous attribute described by
   origContingency.

   subsetSizes - on success, a new discrete distribution over the binary
                 variable, holding the sizes of the left (0) and right (1) subset
   score       - on success, the score of the chosen threshold
   minSubset   - minimal acceptable size of either subset

   Returns the threshold, or ILLEGAL_FLOAT (leaving subsetSizes and score
   untouched) when there is nothing to traverse or no split was recorded.
*/
float TMeasureAttribute::bestThreshold(PDistribution &subsetSizes, float &score, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass, const float &minSubset)
{
  PVariable bvar;

  // The generator used for breaking ties is seeded with classDistribution->abs.
  TRandomGenerator rgen(classDistribution->abs);
  TRecordMaximalThreshold best(rgen, minSubset);

  const bool traversed = traverseThresholds(this, best, bvar, origContingency, classDistribution, apriorClass);
  if (!traversed)
    return ILLEGAL_FLOAT;
  if (!best.wins)
    return ILLEGAL_FLOAT;

  subsetSizes = mlnew TDiscDistribution(bvar);
  subsetSizes->addint(0, best.bestLeft);
  subsetSizes->addint(1, best.bestRight);

  score = best.bestScore;
  return best.bestThreshold;
}


/* Placeholder for the search for the best binarization of a discrete
   attribute described by origContingency.

   Returns NULL when the attribute has fewer than two values; otherwise the
   function reports that it has not been implemented yet. Raises an error if
   the measure needs more than contingencies or if the attribute is not
   discrete.
*/
PIntList TMeasureAttribute::bestBinarization(PDistribution &, float &score, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass, const float &minSubset)
{
  if (needs > Contingency_Class)
    raiseError("cannot compute thresholds from contingencies");

  // The test requires a discrete variable; the message used to complain about
  // a "non-continuous" one, which is the opposite of what is checked.
  PVariable var = origContingency->outerVariable;
  if (var->varType != TValue::INTVAR)
    raiseError("cannot binarize a non-discrete variable");

  // The contingency of a discrete attribute is reached through `discrete`;
  // the code used to read `continuous` here.
  if (origContingency->discrete->size() < 2)
    return NULL;

  raiseError("this has not been implemented yet");
  return NULL;
}


/* Finds the best binarization of var on the examples from gen: builds the
   attribute-class contingency for var and hands it to the contingency-based
   bestBinarization, whose result is returned unchanged.

   subsets     - passed on to the contingency-based overload
   score       - passed on to the contingency-based overload
   apriorClass - apriori class distribution; when empty, the class distribution
                 computed from the data is used instead
   weightID    - id of the weight, handed on to the contingency constructor
   minSubset   - passed on to the contingency-based overload
*/
PIntList TMeasureAttribute::bestBinarization(PDistribution &subsets, float &score, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset)
{
  const bool supported = computesThresholds && (needs <= Contingency_Class);
  if (!supported)
    raiseError("cannot compute binarization");

  if (!gen->domain->classVar)
    raiseError("can't evaluate attributes on class-less domains");

  TContingencyAttrClass varContingency(gen, var, weightID);

  // Class distribution over all examples: those with a known value of var
  // plus those for which it is unknown.
  PDistribution classes = CLONE(TDistribution, varContingency.innerDistribution);
  classes->operator+= (varContingency.innerDistributionUnknown);

  if (!apriorClass)
    apriorClass = classes;

  return bestBinarization(subsets, score, PContingency(varContingency), classes, apriorClass, minSubset);
}


/* Tells whether the measure can work with a class of the given type:
   discrete classes require handlesDiscrete, continuous classes require
   handlesContinuous, and any other type is rejected. */
bool TMeasureAttribute::checkClassType(const int &varType)
{
  if (varType == TValue::INTVAR)
    return handlesDiscrete;

  if (varType == TValue::FLOATVAR)
    return handlesContinuous;

  return false;
}


void TMeasureAttribute::checkClassTypeExc(const int &varType)
{

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -