⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 c4.5.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 2 页
字号:
	    case 't':  
        trials = atoi((*oi).second.c_str());
		    batch = false;
        if ((trials<1) || (trials>10000)) {
          trials=10;
          raiseError("parseCommandLine: invalid argument for -t");
        }
        break;

      case 'w':  
        window = atoi((*oi).second.c_str());
        batch = false;
        if ((window<1) || (window>1000000)) {
          window = 0;
          raiseError("parseCommandLine: invalid argument for -w");
        }
        break;

	    case 'i':
        increment = atoi((*oi).second.c_str());
		    batch = false;
        if ((increment<1) || (increment>1000000)) {
          increment = 0;
          raiseError("parseCommandLine: invalid argument for -i");
        }
        break;

	    case 'g':  
        gainRatio = false;
        break;

	    case 's':  
        subset = true;
        break;

      case 'm':   
        minObjs = atoi((*oi).second.c_str());
        if ((minObjs<1) || (minObjs>1000000)) {
          minObjs = 2;
          raiseError("parseCommandLine: invalid argument for -m");
        }
        break;

      case 'c':   
        cf = atof((*oi).second.c_str());
        if ((cf<=0) || (cf>100)) {
          cf = 0.25;
          raiseError("parseCommandLine: invalid argument for -c");
        }
        break;
    }

  return true;
}


/* Currently a no-op: there is nothing to convert, so the call
   unconditionally reports success by returning true. */
bool TC45Learner::convertParameters()
{
  return true;
}




/* Learns a C4.5 tree from the given examples.

   gen    - example generator; its domain must have a class variable
   weight - weight meta-attribute id; only used when the result is
            converted to an Orange tree classifier

   The examples are handed over through convertGenerator(), c45learn() is run
   with the learner's settings, and the resulting tree is copied into
   TC45TreeNode objects before c45garbage() and clearGenerator() are called
   (presumably these release the C4.5-side data -- confirm). Returns either
   the TC45Classifier itself or, when 'convertToOrange' is set, the result of
   its asTreeClassifier(). */
PClassifier TC45Learner::operator ()(PExampleGenerator gen, const int &weight)
{
  if (!gen->domain->classVar)
    raiseError("class-less domain");

  convertGenerator(gen);
  Tree c45tree = (Tree)c45learn(trials, gainRatio, subset, batch, probThresh, minObjs, window, increment, cf, prune);

  // Copy the tree into our own nodes while the C4.5 structures are still around
  PC45TreeNode root = mlnew TC45TreeNode(c45tree, gen->domain);
  TC45Classifier *c45classifier = mlnew TC45Classifier(gen->domain, root);
  // The wrapper owns the classifier from here on, whichever branch returns below
  PClassifier wrapped = c45classifier;

  c45garbage();
  clearGenerator();

  if (convertToOrange)
    return PClassifier(c45classifier->asTreeClassifier(gen, weight, storeContingencies, storeExamples));

  return wrapped;
}



/* Default constructor: builds a node with no distribution, no tested
   variable, no mapping and no branches; 'items' is -1 and the numeric
   fields are zero. nodeType starts at 4 -- presumably a value outside the
   C4.5 node-type codes, i.e. "not set"; confirm against the C4.5 headers. */
TC45TreeNode::TC45TreeNode()
  : nodeType(4),
    leaf(TValue::INTVAR),
    items(-1),
    classDist(),
    tested(),
    cut(0),
    lower(0),
    upper(0),
    mapping(),
    branch()
{
}


/* Builds a node (and, recursively, its whole subtree) from a C4.5 tree record.

   node   - C4.5 tree record to copy from
   domain - domain whose class variable and attributes the record refers to

   Copies the scalar fields, the per-class counts, the branches (for every
   non-leaf node) and, for Subset nodes, a value-to-branch mapping in which
   values that belong to no branch keep the index -1. */
TC45TreeNode::TC45TreeNode(const Tree &node, PDomain domain)
: nodeType(node->NodeType),
  leaf(TValue(node->Leaf)),
  items(node->Items),
  classDist(mlnew TDiscDistribution(domain->classVar)),
  // NOTE(review): reads the member 'nodeType', so this relies on nodeType being declared before 'tested'
  tested(nodeType != Leaf ? domain->attributes->operator[](node->Tested) : PVariable()),
  cut(node->Cut),
  lower(node->Lower),
  upper(node->Upper),
  mapping(),
  branch()
{ 
  // Per-class counts are read starting at ClassDist[0] (no +1 offset here)
  const int noOfClasses = domain->classVar.AS(TEnumVariable)->values->size();
  float *classCount = node->ClassDist;
  for(int cls = 0; cls != noOfClasses; cls++, classCount++)
    classDist->setint(cls, float(*classCount));

  if (nodeType != Leaf) {
    branch = mlnew TC45TreeNodeList;
    // Branches are read starting at Branch[1]
    Tree *subtree = node->Branch+1;
    for(int remaining = node->Forks; remaining != 0; remaining--, subtree++)
      branch->push_back(mlnew TC45TreeNode(*subtree, domain));
  }

  if (nodeType == Subset) {
    const int noOfValues = tested.AS(TEnumVariable)->values->size();
    mapping = mlnew TIntList(noOfValues, -1);

    // Value sets are read starting at Subset[1]
    char **setPtr = node->Subset+1;
    const int noOfForks = node->Forks;
    for(int fork = 0; fork != noOfForks; setPtr++, fork++) {
      // 'In' is a function-like macro defined outside this block. Pass it a
      // plain variable rather than '*setPtr': if the macro does not
      // parenthesize its set argument, 's[...]' would bind tighter than the
      // dereference and test the wrong set for larger value indices.
      char *valueSet = *setPtr;
      for(int value = 0; value < noOfValues; value++)
        if (In(value+1, valueSet))
          mapping->operator [](value) = fork;
    }
  }
}



/* Combines the predictions of all branches of this node.

   Each branch's class distribution for 'example' is multiplied by that
   branch's 'items', the products are summed, and the sum is scaled by
   1/items of this node. Used by classDistribution() when the example cannot
   be sent down a single branch. */
PDiscDistribution TC45TreeNode::vote(const TExample &example, PVariable classVar)
{
  PDiscDistribution combined = mlnew TDiscDistribution(classVar);

  PITERATE(TC45TreeNodeList, child, branch) {
    PDiscDistribution childDist = (*child)->classDistribution(example, classVar);
    childDist->operator *= ((*child)->items);
    combined->operator += (childDist);
  }

  combined->operator *= (1.0/items);
  return combined;
}


#undef min

/* Returns the class distribution this subtree predicts for 'example'.

   example  - example to classify
   classVar - class variable used for newly constructed distributions

   Leaf:    a copy of classDist scaled by 1/items or, when items is not
            positive, a distribution with 1.0 at the leaf's class.
   Branch:  descends into the branch indexed by the tested value.
   Cut:     descends into branch 0 when the value is <= cut, else branch 1.
   Subset:  descends into the branch the value is mapped to.

   Whenever the tested value is special, lies outside the known range, or is
   mapped to no branch, the result of vote() over all branches is returned.
   An unknown nodeType is reported through raiseError(). */
PDiscDistribution TC45TreeNode::classDistribution(const TExample &example, PVariable classVar)
{
  if (nodeType == Leaf) {
    if (items > 0) {
      PDiscDistribution res = CLONE(TDiscDistribution, classDist);
      res->operator *= (1.0/items);
      return res;
    }
    else {
      PDiscDistribution res = mlnew TDiscDistribution(classVar);
      res->operator[](leaf.intV) = 1.0;
      return res;
    }
  }

  // Take the value straight from the example if its domain has the tested
  // variable; otherwise have the variable compute it from the example
  int varnum = example.domain->getVarNum(tested, false);
  const TValue &val = (varnum != ILLEGAL_INT) ? example[varnum] : tested->computeValue(example);
  if (val.isSpecial())
    return vote(example, classVar);

  switch (nodeType) {
//    case Leaf: - taken care of above

    case Branch:
      if (val.intV >= branch->size())
        return vote(example, classVar);
      else
        return branch->operator[](val.intV)->classDistribution(example, classVar);

    case Cut:
      return branch->operator[](val.floatV <= cut ? 0 : 1)->classDistribution(example, classVar);

    case Subset:
      // val.intV is used as an index into 'mapping', so it has to be strictly
      // below mapping->size(); an index equal to the size is already out of range
      if ((val.intV >= mapping->size()) || (mapping->operator[](val.intV) < 0))
        return vote(example, classVar);
      else
        return branch->operator[](mapping->operator[](val.intV))->classDistribution(example, classVar);

    default:
      raiseError("invalid 'nodeType'");
  }

  return PDiscDistribution();
}


/* Creates a classifier for 'domain' that predicts with the C4.5 tree
   rooted at 'atree'. */
TC45Classifier::TC45Classifier(PDomain domain, PC45TreeNode atree)
  : TClassifierFD(domain),
    tree(atree)
{
}



/* Predicts the class of 'oexample'.

   This is defined separately so that the first class wins in case of a tie:
   a class replaces the current best only when its value is strictly larger.
   An example from another domain is first converted to the classifier's
   domain. */
TValue TC45Classifier::operator ()(const TExample &oexample)
{
  checkProperty(tree);

  PDiscDistribution classDist;
  if (oexample.domain != domain) {
    TExample converted(domain, oexample);
    classDist = tree->classDistribution(converted, classVar);
  }
  else {
    classDist = tree->classDistribution(oexample, classVar);
  }

  const int noOfClasses = classVar.AS(TEnumVariable)->values->size();
  TDiscDistribution::const_iterator probi(classDist->begin());

  int winner = 0;
  float winnerP = -1;
  for(int cls = 0; cls != noOfClasses; cls++, probi++)
    if (*probi > winnerP) {
      winnerP = *probi;
      winner = cls;
    }

  return TValue(winner);
}



/* Returns the normalized class distribution the tree predicts for
   'oexample'. An example from another domain is first converted to the
   classifier's domain. */
PDistribution TC45Classifier::classDistribution(const TExample &oexample)
{
  checkProperty(tree);

  PDiscDistribution predicted;
  if (oexample.domain != domain) {
    TExample converted(domain, oexample);
    predicted = tree->classDistribution(converted, classVar);
  }
  else {
    predicted = tree->classDistribution(oexample, classVar);
  }

  predicted->normalize();
  return predicted;
}


void TC45Classifier::predictionAndDistribution(const TExample &oexample, TValue &value, PDistribution &dist)
{
  checkProperty(tree);

  if (oexample.domain != domain) {
    TExample example(domain, oexample);
    dist = tree->classDistribution(example, classVar);
  }
  else
    dist = tree->classDistribution(oexample, classVar);

  int bestClass = 0;
  float bestP = -1;
  for(int cl = 0, ce = classVar.AS(TEnumVariable)->values->size(); cl!=ce; cl++) {
    float td = dist->atint(cl);
    if (td > bestP) {
      bestP = td;
      bestClass = cl;
    }
  }

  value = TValue(bestClass);
  dist->normalize();
}




#include "tdidt.hpp"
#include "classfromvar.hpp"
#include "discretize.hpp"
#include "tdidt_split.hpp"

/* Converts this C4.5 node and, recursively, its branches into a TTreeNode.

   examples           - examples for this node; their domain supplies the class variable
   weightID           - weight meta-attribute id, handed to the splitter and to the branches
   storeContingencies - forwarded to the branches; like storeExamples, it makes
                        the examples get split among the branches
   storeExamples      - when set, the new node keeps 'examples' and 'weightID'

   Returns the new node. This node itself is left unchanged. */
PTreeNode TC45TreeNode::asTreeNode(PExampleGenerator examples, const int &weightID, bool storeContingencies, bool storeExamples)
{ 
  PTreeNode newNode = mlnew TTreeNode();

  // Normalize a private copy. Normalizing classDist itself (it would be shared
  // with the new node) would leave this node with a distribution that no longer
  // corresponds to 'items', which classDistribution() and vote() scale by.
  PDiscDistribution normalized = CLONE(TDiscDistribution, classDist);
  normalized->normalize();
  newNode->distribution = normalized;

  if (items > 0)
    newNode->nodeClassifier = mlnew TDefaultClassifier(examples->domain->classVar, leaf, normalized);
  else {
    // No items at this node: predict the leaf's class with weight 1.0
    TDiscDistribution *dd = mlnew TDiscDistribution(examples->domain->classVar);
    dd->add(leaf, 1.0);
    newNode->nodeClassifier = mlnew TDefaultClassifier(examples->domain->classVar, leaf, dd);
  }

  if (storeExamples) {
    newNode->examples = examples;
    newNode->weightID = weightID;
  }

  if (nodeType == Leaf)
    return newNode;

  // Branch sizes are the 'items' counts of the C4.5 branches
  PDistribution branchSizes = mlnew TDiscDistribution;
  int i = 0;
  PITERATE(TC45TreeNodeList, li, branch)
    branchSizes->addint(i++, (*li)->items);
  newNode->branchSizes = branchSizes;
    
  // Variable whose values name the branches of Cut and Subset nodes;
  // wdummyVar wraps it, so it is owned even when it ends up unused
  TEnumVariable *dummyVar = mlnew TEnumVariable(tested->name);
  PVariable wdummyVar = dummyVar;

  switch (nodeType) {
    case Branch:
      newNode->branchSelector = mlnew TClassifierFromVar(tested, branchSizes);
      newNode->branchDescriptions = mlnew TStringList(tested.AS(TEnumVariable)->values.getReference());
      break;

    case Cut:
      newNode->branchDescriptions = mlnew TStringList;

      // Labels follow the test in classDistribution(): branch 0 takes values
      // <= cut, branch 1 takes the rest.
      // NOTE(review): assumes TThresholdDiscretizer splits the same way -- confirm.
      char str[128];
      sprintf(str, "<=%3.3f", cut);
      newNode->branchDescriptions->push_back(str);
      dummyVar->values->push_back(str);
      sprintf(str, ">%3.3f", cut);
      newNode->branchDescriptions->push_back(str);
      dummyVar->values->push_back(str);

      newNode->branchSelector = mlnew TClassifierFromVar(wdummyVar, tested, branchSizes, mlnew TThresholdDiscretizer(cut));
      break;

    case Subset: {
      // One (initially empty) description per branch index found in 'mapping'
      int noval = 1 + *max_element(mapping->begin(), mapping->end());
      dummyVar->values = mlnew TStringList(noval, "");

      // Append each value's name to the description of the branch it maps to
      TStringList::const_iterator tvi(tested.AS(TEnumVariable)->values->begin());
      PITERATE(TIntList, ni, mapping) {
        if (*ni >= 0) {
          string &val = dummyVar->values->at(*ni);
          if (val.length())
            val += ", ";
          val += *tvi;
        }
        tvi++;
      }

      // Descriptions listing more than one value are shown as "in [a, b]"
      PITERATE(TStringList, vi, dummyVar->values)
        if ((*vi).find(",") != string::npos)
          *vi = "in [" + *vi + "]";

      newNode->branchSelector = mlnew TClassifierFromVar(wdummyVar, tested, branchSizes,mlnew TMapIntValue(mapping));
      newNode->branchDescriptions = dummyVar->values;
      break;
    }
  }

  // Split the examples among the branches only if the branches need them
  vector<int> newWeights;
  PExampleGeneratorList subsets;
  TExampleGeneratorList::const_iterator si;
  if (storeExamples || storeContingencies) {
    subsets = TTreeExampleSplitter_UnknownsAsBranchSizes()(PTreeNode(newNode.getReference()), examples, weightID, newWeights);
    si = subsets->begin();
  }

  newNode->branches = mlnew TTreeNodeList;
  vector<int>::const_iterator wi(newWeights.begin()), we(newWeights.end());
  PITERATE(TC45TreeNodeList, c45bi, branch) {
    if (storeExamples || storeContingencies) {
      // Use the weight id the splitter produced for this branch, if any; the
      // meta attribute is removed from 'examples' once the branch is converted
      newNode->branches->push_back(*c45bi ? (*c45bi)->asTreeNode(*(si++), wi!=we ? *wi : weightID, storeContingencies, storeExamples) : PTreeNode());
      if (wi!=we) {
        examples->removeMetaAttribute(*wi);
        wi++;
      }
    }
    else
      // just call with 'examples' as argument -- they will only be used to extract examples->domain->classVar
      newNode->branches->push_back(*c45bi ? (*c45bi)->asTreeNode(examples, weightID, false, false) : PTreeNode());
  }

  return newNode;
}


/* Converts the C4.5 tree into an Orange tree classifier.

   examples           - examples handed to the tree conversion; their domain
                        becomes the domain of the resulting classifier
   weightID           - weight meta-attribute id handed to the tree conversion
   storeContingencies - not supported yet; a warning is raised when it is set
   storeExamples      - forwarded to the tree conversion

   Returns a TTreeClassifier that uses TTreeDescender_UnknownMergeAsBranchSizes. */
PTreeClassifier TC45Classifier::asTreeClassifier(PExampleGenerator examples, const int &weightID, bool storeContingencies, bool storeExamples)
{
  // Same guard as in the other methods that dereference 'tree'
  checkProperty(tree);

  if (storeContingencies)
    raiseWarning("'storeContingencies' not supported yet");

  // NOTE(review): 'exampleTable' is never used below -- asTreeNode() gets
  // 'examples'. Either the table was meant to be passed on or this call is
  // redundant; confirm before changing it.
  PExampleTable exampleTable = toExampleTable(examples);

  PTreeNode orangeTree = tree->asTreeNode(examples, weightID, storeContingencies, storeExamples);
  return mlnew TTreeClassifier(examples->domain, orangeTree, mlnew TTreeDescender_UnknownMergeAsBranchSizes());
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -