⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tdidt.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include "stladdon.hpp"
#include "random.hpp"

#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "table.hpp"

#include "contingency.hpp"
#include "distvars.hpp"

#include "measures.hpp"

#include "majority.hpp"

#include "tdidt_split.hpp"
#include "tdidt_stop.hpp"

#include "tdidt.ppp"


DEFINE_TOrangeVector_classDescription(PTreeNode, "TTreeNodeList", true, ORANGE_API)


/* Default components for split constructor -- split constructors for
   discrete and continuous attributes and classes, and the corresponding
   attribute measures.

   To avoid problems under gcc, these are initialized by an explicit call
   to tdidt_cpp_gcUnsafeInitialization made from initOrange at the end
   of module initialization.
*/

PTreeSplitConstructor defaultDiscreteTreeSplitConstructor;
PTreeSplitConstructor defaultContinuousTreeSplitConstructor;
PTreeStopCriteria defaultStop;

void tdidt_cpp_gcUnsafeInitialization()
{ PMeasureAttribute defaultDiscreteMeasure = mlnew TMeasureAttribute_gainRatio;
  PMeasureAttribute defaultContinuousMeasure = mlnew TMeasureAttribute_MSE;

  defaultDiscreteTreeSplitConstructor 
    = mlnew TTreeSplitConstructor_Combined(
         mlnew TTreeSplitConstructor_Attribute(defaultDiscreteMeasure, 0.0, 2.0),
         mlnew TTreeSplitConstructor_Threshold(defaultDiscreteMeasure, 0.0, 5.0));

  defaultContinuousTreeSplitConstructor 
    = mlnew TTreeSplitConstructor_Combined(
         mlnew TTreeSplitConstructor_Attribute(defaultContinuousMeasure, 0.0, 2.0),
         mlnew TTreeSplitConstructor_Threshold(defaultContinuousMeasure, 0.0, 5.0));

  defaultStop = mlnew TTreeStopCriteria_common();
}



int TTreeNode::treeSize() const
{ int sum = 1;
  if (branches)
    const_PITERATE(TTreeNodeList, bi, branches)
      if (*bi)
        sum += (*bi)->treeSize();
  return sum;
}


void TTreeNode::removeStoredInfo()
{ distribution = PDistribution();
  contingency = PDomainContingency();
  examples = PExampleGenerator();

  if (branches)
    const_PITERATE(TTreeNodeList, bi, branches)
      if (*bi)
        (*bi)->treeSize();
}



TTreeLearner::TTreeLearner()
: storeExamples(false),
  storeDistributions(true),
  storeContingencies(false),
  storeNodeClassifier(true),
  maxDepth(100)
{}



PClassifier TTreeLearner::operator()(PExampleGenerator ogen, const int &weight)
{ if (!ogen)
    raiseError("invalid example generator");

  PVariable &classVar = ogen->domain->classVar;

  if (!classVar)
    raiseError("class-less domain");

  bool tempSplit = !split;
  if (tempSplit)
    if (classVar->varType == TValue::INTVAR)
      split = defaultDiscreteTreeSplitConstructor;
    else if (classVar->varType == TValue::FLOATVAR)
      split = defaultContinuousTreeSplitConstructor;
    else
      raiseError("invalid class type (discrete or continuous expected)");

  bool tempStop = !stop;
  if (tempStop)
    stop = defaultStop;

  bool tempSplitter = !exampleSplitter;
  if (tempSplitter)
    exampleSplitter = mlnew TTreeExampleSplitter_UnknownsAsSelector;

  try {
    PExampleGenerator examples;
    /* If we don't intend to store them, we'll copy them if they're not in a table. 
       If we must store examples, we'll copy them in any case... */ 
    if (storeExamples)
      examples = mlnew TExampleTable(ogen);
    else
      examples = toExampleTable(ogen);

    PDistribution apriorClass = getClassDistribution (examples);
    if (apriorClass->abs == 0)
      raiseError("no examples");

    vector<bool> candidates(examples->domain->attributes->size(), true);

    PTreeNode root = call(examples, weight, apriorClass, candidates, 0);
    if (storeExamples)
      root->examples = examples;

    if (tempSplit)
      split = PTreeSplitConstructor();
    if (tempStop)
      stop = PTreeSplitConstructor();
    if (tempSplitter)
      exampleSplitter = PTreeExampleSplitter();

    return mlnew TTreeClassifier(examples->domain, root, 
                               descender ? descender : mlnew TTreeDescender_UnknownMergeAsSelector);
  }
  catch (exception) {
    if (tempSplit)
      split = PTreeSplitConstructor();
    if (tempStop)
      stop = PTreeSplitConstructor();
    if (tempSplitter)
      exampleSplitter = PTreeExampleSplitter();
    throw;
  }
}



PTreeNode TTreeLearner::operator()(PExampleGenerator examples, const int &weightID, PDistribution apriorClass, vector<bool> &candidates, const int &depth)
{
  PDomainContingency contingency;
  PDomainDistributions domainDistributions;
  PDistribution classDistribution;

  if (!examples->numberOfExamples())
    return PTreeNode();

  if (contingencyComputer)
    contingency = contingencyComputer->call(examples, weightID);

  if (storeContingencies)
    contingency = mlnew TDomainContingency(examples, weightID);

  if (contingency)
    classDistribution = contingency->classes;
  else if (storeDistributions)
    classDistribution = getClassDistribution(examples, weightID);

  if (classDistribution) {
    if (!classDistribution->abs)
      return PTreeNode();
  }
  else
    if (examples->weightOfExamples() < 1e-10)
      return PTreeNode();
  
  TTreeNode *utreeNode = mlnew TTreeNode();
  PTreeNode treeNode = utreeNode;

  utreeNode->weightID = weightID;

  bool isLeaf = ((maxDepth>=0) && (depth == maxDepth)) || stop->call(examples, weightID, contingency);

  if (isLeaf || storeNodeClassifier) {
    utreeNode->nodeClassifier = nodeLearner
                              ? nodeLearner->smartLearn(examples, weightID, contingency, domainDistributions, classDistribution)
                              : TMajorityLearner().smartLearn(examples, weightID, contingency, domainDistributions, classDistribution);

    if (isLeaf) {
      if (storeContingencies)
        utreeNode->contingency = contingency;
      if (storeDistributions)
        utreeNode->distribution = classDistribution;

      return treeNode;
    }
  }

  utreeNode->contingency = contingency;
  utreeNode->distribution = classDistribution;

  float quality;
  int spentAttribute;
  utreeNode->branchSelector = split->call(utreeNode->branchDescriptions, utreeNode->branchSizes, quality, spentAttribute,
                                          examples, weightID, contingency,
                                          apriorClass ? apriorClass : classDistribution,
                                          candidates, utreeNode->nodeClassifier);

  isLeaf = !utreeNode->branchSelector;
  bool hasNullNodes = false;

  if (!isLeaf) {
    if (spentAttribute>=0)
      if (candidates[spentAttribute])
        candidates[spentAttribute] = false;
      else
        spentAttribute = -1;
    /* BEWARE: If you add an additional 'return' in the code below,
               do not forget to restore the candidate's flag. */

    utreeNode->branches = mlnew TTreeNodeList();

    vector<int> newWeights;

    PExampleGeneratorList subsets = exampleSplitter->call(treeNode, examples, weightID, newWeights);

    if (!utreeNode->branchSizes)
      utreeNode->branchSizes = branchSizesFromSubsets(subsets, weightID, newWeights);

    if (!storeContingencies)
      utreeNode->contingency = PDomainContingency();
    if (!storeDistributions)
      utreeNode->distribution = PDistribution();

    vector<int>::iterator nwi = newWeights.begin(), nwe = newWeights.end();
    PITERATE(TExampleGeneratorList, gi, subsets) {
      if ((*gi)->numberOfExamples()) {
        utreeNode->branches->push_back(call(*gi, (nwi!=nwe) ? *nwi : weightID, apriorClass, candidates, depth+1));
        // Can store pointers to examples: the original is stored in the root
        if (storeExamples && utreeNode->branches->back())
          utreeNode->branches->back()->examples = *gi;
        else if ((nwi!=nwe) && *nwi && (*nwi != weightID))
            examples->removeMetaAttribute(*nwi);
      }
      else {
        utreeNode->branches->push_back(PTreeNode());
        hasNullNodes = true;
      }

      if (nwi!=nwe)
        nwi++;
    }

    /* If I set it to false, it must had been true before
       (otherwise, my TreeSplitConstructor wouldn't be allowed to spend it).
       Hence, I can simply set it back to true... */
    if (spentAttribute>=0)
      candidates[spentAttribute] = true;
  }

  else {  // no split constructed
    if (!utreeNode->contingency)
      utreeNode->contingency = PDomainContingency();
    if (!storeDistributions)
      utreeNode->distribution = PDistribution();
  }


  if (isLeaf || hasNullNodes) {
    if (!utreeNode->nodeClassifier)
      utreeNode->nodeClassifier = nodeLearner
                              ? nodeLearner->smartLearn(examples, weightID, contingency, domainDistributions, classDistribution)
                              : TMajorityLearner().smartLearn(examples, weightID, contingency, domainDistributions, classDistribution);
  }

  return treeNode;
}



PDiscDistribution TTreeLearner::branchSizesFromSubsets(PExampleGeneratorList subsets, const int &weightID, const vector<int> &weights) const
{
  TDiscDistribution *bs = mlnew TDiscDistribution();
  PDiscDistribution branchSizes = bs;

  vector<int>::const_iterator wi (weights.begin());
  bool hasWeights = wi != weights.end();

  PITERATE(TExampleGeneratorList, egli, subsets) {
    float totw = 0.0;
    int wID = hasWeights ? *(wi++) : weightID;

    if (!wID)
      totw = (*egli)->numberOfExamples();

    if (wID || (totw<0))
      PEITERATE(ei, *egli)
        totw += WEIGHT2(*ei, wID);

    bs->push_back(totw);
  }

  return branchSizes;
}


TTreeClassifier::TTreeClassifier(PDomain dom, PTreeNode atree, PTreeDescender adescender)
: TClassifierFD(dom),
  tree(atree), 
  descender(adescender)
{}



/* Classification is somewhat complex due to descender component.

   If descender does not require voting (i.e. always descends to a single
   node), methods simply use descender to find a node and use the
   corresponding nodeClassifier.

   If descender requires a vote, this is indicated by returning
   branch weights. The method responds by calling 'vote'. Voting is
   always done by a weighted sum class probabilities (even when
   the outcome should be single value, not distribution).

   'vote' method thus calls classDistribution for each subnode.
   classDistribution again uses descender to descend from the subnode.
   If it requires vote again, classDistribution will again call
   'vote'. We thus have a mutual (recursive) calls of 'vote'
   and 'classDistribution' - not at all levels of the tree but
   only at those where descender takes a break... */


PDistribution TTreeClassifier::findNodeDistribution(PTreeNode node, const TExample &exam)
{
  if (node->distribution)
    return node->distribution;

  if (node->contingency && node->contingency->classes)
    return node->contingency->classes;

  if (node->nodeClassifier) {
    PDistribution dist = node->nodeClassifier->classDistribution(exam);
    if (dist)
      return dist;
  }

  if (classVar->varType == TValue::INTVAR) {
    const int nval = classVar->noOfValues();
    if (nval)
      return mlnew TDiscDistribution(nval, 1.0/nval);
  }

  return PDistribution();
}


TValue TTreeClassifier::findNodeValue(PTreeNode node, const TExample &exam)
{
  PDistribution dist = findNodeDistribution(node, exam);
  if (dist)
    return dist->highestProbValue(exam);
  else
    return TValue(0.0);
}


TValue TTreeClassifier::operator()(const TExample &exam)
{
  checkProperty(descender);

  const bool convertEx = domain && (exam.domain != domain);
  TExample convex = convertEx ? TExample(domain, exam) : TExample();
  const TExample &refexam = convertEx ? convex : exam;
  PDiscDistribution branchWeights;
  PTreeNode node = descender->call(tree, refexam, branchWeights);
  if (!branchWeights) {
    if (node->nodeClassifier) 
      return node->nodeClassifier->call(refexam);
  }
  else {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -