⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tdidt_split.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 3 页
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include "measures.hpp"
#include "random.hpp"
#include "classfromvar.hpp"
#include "discretize.hpp"
#include "table.hpp"

#include "boolcnt.hpp"
#include "tdidt.hpp"
#include "tdidt_stop.hpp"

#include "tdidt_split.ppp"


TTreeSplitConstructor::TTreeSplitConstructor(const float &aml)
: minSubset(aml>0 ? aml : 1e-20)
{}



TTreeSplitConstructor_Measure::TTreeSplitConstructor_Measure(PMeasureAttribute meas, const float &aworst, const float &aml)
: TTreeSplitConstructor(aml),
  measure(meas),
  worstAcceptable(aworst)
{}



TTreeSplitConstructor_Combined::TTreeSplitConstructor_Combined(PTreeSplitConstructor discreteSplit, PTreeSplitConstructor continuousSplit, const float &aminSubset)
: TTreeSplitConstructor(aminSubset),
  discreteSplitConstructor(discreteSplit),
  continuousSplitConstructor(continuousSplit)
{}



PClassifier TTreeSplitConstructor_Combined::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID ,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier nodeClassifier
                            )
{ checkProperty(discreteSplitConstructor);
  checkProperty(continuousSplitConstructor);

  vector<bool> discrete, continuous;
 
  bool cse = candidates.size()==0;
  vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
  TVarList::const_iterator vi(gen->domain->attributes->begin()), ve(gen->domain->attributes->end());

  for(; (cse || (ci!=ce)) && (vi!=ve); vi++) {
    if (cse || *(ci++))
      if ((*vi)->varType == TValue::INTVAR) {
        discrete.push_back(true);
        continuous.push_back(false);
        continue;
      }
      else if ((*vi)->varType == TValue::FLOATVAR) {
        discrete.push_back(false);
        continuous.push_back(true);
        continue;
      }
    discrete.push_back(false);
    continuous.push_back(false);
  }

  float discQuality;
  PStringList discDescriptions;
  PDiscDistribution discSizes;
  int discSpent;
  PClassifier discSplit = discreteSplitConstructor->call(discDescriptions, discSizes, discQuality, discSpent,
                                                               gen, weightID, dcont, apriorClass, discrete, nodeClassifier);

  float contQuality;
  PStringList contDescriptions;
  PDiscDistribution contSizes;
  int contSpent;
  PClassifier contSplit = continuousSplitConstructor->call(contDescriptions, contSizes, contQuality, contSpent,
                                                                 gen, weightID, dcont, apriorClass, continuous, nodeClassifier);

  int N = gen ? gen->numberOfExamples() : -1;
  if (N<0)
    N = dcont->classes->cases;

  if (   discSplit
      && (   !contSplit
          || (discQuality>contQuality)
          || (discQuality==contQuality) && (N%2>0))) {
    quality = discQuality;
    descriptions = discDescriptions;
    subsetSizes = discSizes;
    spentAttribute = discSpent;
    return discSplit;
  }
  else if (contSplit) {
    quality = contQuality;
    descriptions = contDescriptions;
    subsetSizes = contSizes;
    spentAttribute = contSpent;
    return contSplit;
  }
  else 
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
}



TTreeSplitConstructor_Attribute::TTreeSplitConstructor_Attribute(PMeasureAttribute meas, const float &worst, const float &ms)
: TTreeSplitConstructor_Measure(meas, worst, ms)
{}


// if there are less than two branches with at least minSubset examples, skip the attribute
// also, skip it if all examples are in the same branch
bool checkDistribution(const TDiscDistribution &dist, const float &minSubset)
{
  int nonzero = 0;
  for(TDiscDistribution::const_iterator dvi(dist.begin()), dve(dist.end()); (dvi!=dve) && (nonzero<2); dvi++)
    if ((*dvi > 0) && (*dvi >= minSubset))
      nonzero++;

  return nonzero >= 2;
}



inline bool noCandidates(const vector<bool> &candidates)
{
  vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
  while(ci!=ce && !*ci)
    ci++;
  return ci==ce;
}

PClassifier TTreeSplitConstructor_Attribute::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier nodeClassifier
                            )
{ checkProperty(measure);

  measure->checkClassTypeExc(gen->domain->classVar->varType);

  bool cse = candidates.size()==0;
  vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
  if (!cse) {
    if (noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    ci = candidates.begin();
  }

  int N = gen ? gen->numberOfExamples() : -1;
  if (N<0)
    N = dcont->classes->cases;
  TSimpleRandomGenerator rgen(N);

  int thisAttr = 0, bestAttr = -1, wins = 0;
  quality = 0.0;

  if (measure->needs == TMeasureAttribute::Contingency_Class) {
    vector<bool> myCandidates;
    if (cse) {
      myCandidates.reserve(gen->domain->attributes->size());
      PITERATE(TVarList, vi, gen->domain->attributes)
        myCandidates.push_back((*vi)->varType == TValue::INTVAR);
    }
    else {
      myCandidates.reserve(candidates.size());
      TVarList::const_iterator vi(gen->domain->attributes->begin());
      for(; ci != ce; ci++, vi++)
        myCandidates.push_back(*ci && ((*vi)->varType == TValue::INTVAR));
    }

    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID, myCandidates));

    ci = myCandidates.begin();
    ce = myCandidates.end();
    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (ci != ce) && (dci!=dce); dci++, ci++, thisAttr++)
      if (*ci && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
        float thisMeas = measure->call(thisAttr, dcont, apriorClass);

        if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
            || ((thisMeas==quality) && rgen.randbool(++wins))) {
          quality = thisMeas;
          subsetSizes = (*dci)->outerDistribution;
          bestAttr = thisAttr;
        }
      }
  }

  else if (measure->needs == TMeasureAttribute::DomainContingency) {
    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++)
      if (    (cse || *(ci++))
           && ((*dci)->outerVariable->varType==TValue::INTVAR)
           && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
        float thisMeas = measure->call(thisAttr, dcont, apriorClass);

        if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
            || ((thisMeas==quality) && rgen.randbool(++wins))) {
          quality = thisMeas;
          subsetSizes = (*dci)->outerDistribution;
          bestAttr = thisAttr;
        }
      }
  }

  else {
    TDomainDistributions ddist(gen, weightID);

    TDomainDistributions::iterator ddi(ddist.begin()), dde(ddist.end()-1);
    for(; (cse || (ci!=ce)) && (ddi!=dde); ddi++, thisAttr++)
      if (cse || *(ci++)) {
        TDiscDistribution *discdist = (*ddi).AS(TDiscDistribution);
        if (discdist && checkDistribution(*discdist, minSubset)) {
          float thisMeas = measure->call(thisAttr, gen, apriorClass, weightID);

          if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
              || ((thisMeas==quality) && rgen.randbool(++wins))) {
            quality = thisMeas;
            subsetSizes = PDiscDistribution(*ddi); // not discdist - this would be double wrapping!
            bestAttr = thisAttr;
          }
        }
      }
    
  }

  if (!wins)
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

  if (quality<worstAcceptable)
    return returnNothing(descriptions, subsetSizes, spentAttribute);

  PVariable attribute = gen->domain->attributes->at(bestAttr);
  TEnumVariable *evar = attribute.AS(TEnumVariable);
  if (evar)
    descriptions = mlnew TStringList(evar->values.getReference());
  else
    descriptions = mlnew TStringList(subsetSizes->size(), "");

  spentAttribute = bestAttr;

  return mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
}




TTreeSplitConstructor_ExhaustiveBinary::TTreeSplitConstructor_ExhaustiveBinary(PMeasureAttribute meas, const float &aworst, const float &aml)
: TTreeSplitConstructor_Measure(meas, aworst, aml)
{}



PClassifier TTreeSplitConstructor_ExhaustiveBinary::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID ,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier
                            )
{ 
  checkProperty(measure);
  measure->checkClassTypeExc(gen->domain->classVar->varType);

  PIntList bestMapping;
  int wins, bestAttr;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -