tdidt.cpp
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
*/
#include "stladdon.hpp"
#include "random.hpp"
#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "table.hpp"
#include "contingency.hpp"
#include "distvars.hpp"
#include "measures.hpp"
#include "majority.hpp"
#include "tdidt_split.hpp"
#include "tdidt_stop.hpp"
#include "tdidt.ppp"
DEFINE_TOrangeVector_classDescription(PTreeNode, "TTreeNodeList", true, ORANGE_API)
/* Default components for split constructor -- split constructors for
discrete and continuous attributes and classes, and the corresponding
attribute measures.
To avoid problems under gcc, these are initialized by an explicit call
to tdidt_cpp_gcUnsafeInitialization made from initOrange at the end
of module initialization.
*/
PTreeSplitConstructor defaultDiscreteTreeSplitConstructor;
PTreeSplitConstructor defaultContinuousTreeSplitConstructor;
PTreeStopCriteria defaultStop;
void tdidt_cpp_gcUnsafeInitialization()
{ PMeasureAttribute defaultDiscreteMeasure = mlnew TMeasureAttribute_gainRatio;
PMeasureAttribute defaultContinuousMeasure = mlnew TMeasureAttribute_MSE;
defaultDiscreteTreeSplitConstructor
= mlnew TTreeSplitConstructor_Combined(
mlnew TTreeSplitConstructor_Attribute(defaultDiscreteMeasure, 0.0, 2.0),
mlnew TTreeSplitConstructor_Threshold(defaultDiscreteMeasure, 0.0, 5.0));
defaultContinuousTreeSplitConstructor
= mlnew TTreeSplitConstructor_Combined(
mlnew TTreeSplitConstructor_Attribute(defaultContinuousMeasure, 0.0, 2.0),
mlnew TTreeSplitConstructor_Threshold(defaultContinuousMeasure, 0.0, 5.0));
defaultStop = mlnew TTreeStopCriteria_common();
}
int TTreeNode::treeSize() const
{ int sum = 1;
if (branches)
const_PITERATE(TTreeNodeList, bi, branches)
if (*bi)
sum += (*bi)->treeSize();
return sum;
}
void TTreeNode::removeStoredInfo()
{ distribution = PDistribution();
contingency = PDomainContingency();
examples = PExampleGenerator();
if (branches)
const_PITERATE(TTreeNodeList, bi, branches)
if (*bi)
(*bi)->removeStoredInfo();
}
TTreeLearner::TTreeLearner()
: storeExamples(false),
storeDistributions(true),
storeContingencies(false),
storeNodeClassifier(true),
maxDepth(100)
{}
PClassifier TTreeLearner::operator()(PExampleGenerator ogen, const int &weight)
{ if (!ogen)
raiseError("invalid example generator");
PVariable &classVar = ogen->domain->classVar;
if (!classVar)
raiseError("class-less domain");
bool tempSplit = !split;
if (tempSplit) {
  if (classVar->varType == TValue::INTVAR)
    split = defaultDiscreteTreeSplitConstructor;
  else if (classVar->varType == TValue::FLOATVAR)
    split = defaultContinuousTreeSplitConstructor;
  else
    raiseError("invalid class type (discrete or continuous expected)");
}
bool tempStop = !stop;
if (tempStop)
stop = defaultStop;
bool tempSplitter = !exampleSplitter;
if (tempSplitter)
exampleSplitter = mlnew TTreeExampleSplitter_UnknownsAsSelector;
try {
PExampleGenerator examples;
/* If the examples must be stored, copy them in any case;
   otherwise, copy them only if they are not already in a table. */
if (storeExamples)
examples = mlnew TExampleTable(ogen);
else
examples = toExampleTable(ogen);
PDistribution apriorClass = getClassDistribution(examples, weight);
if (apriorClass->abs == 0)
raiseError("no examples");
vector<bool> candidates(examples->domain->attributes->size(), true);
PTreeNode root = call(examples, weight, apriorClass, candidates, 0);
if (storeExamples)
root->examples = examples;
if (tempSplit)
split = PTreeSplitConstructor();
if (tempStop)
  stop = PTreeStopCriteria();
if (tempSplitter)
exampleSplitter = PTreeExampleSplitter();
return mlnew TTreeClassifier(examples->domain, root,
descender ? descender : mlnew TTreeDescender_UnknownMergeAsSelector);
}
catch (exception &) {
if (tempSplit)
split = PTreeSplitConstructor();
if (tempStop)
  stop = PTreeStopCriteria();
if (tempSplitter)
exampleSplitter = PTreeExampleSplitter();
throw;
}
}
PTreeNode TTreeLearner::operator()(PExampleGenerator examples, const int &weightID, PDistribution apriorClass, vector<bool> &candidates, const int &depth)
{
PDomainContingency contingency;
PDomainDistributions domainDistributions;
PDistribution classDistribution;
if (!examples->numberOfExamples())
return PTreeNode();
if (contingencyComputer)
contingency = contingencyComputer->call(examples, weightID);
else if (storeContingencies)
contingency = mlnew TDomainContingency(examples, weightID);
if (contingency)
classDistribution = contingency->classes;
else if (storeDistributions)
classDistribution = getClassDistribution(examples, weightID);
if (classDistribution) {
if (!classDistribution->abs)
return PTreeNode();
}
else
if (examples->weightOfExamples() < 1e-10)
return PTreeNode();
TTreeNode *utreeNode = mlnew TTreeNode();
PTreeNode treeNode = utreeNode;
utreeNode->weightID = weightID;
bool isLeaf = ((maxDepth>=0) && (depth == maxDepth)) || stop->call(examples, weightID, contingency);
if (isLeaf || storeNodeClassifier) {
utreeNode->nodeClassifier = nodeLearner
? nodeLearner->smartLearn(examples, weightID, contingency, domainDistributions, classDistribution)
: TMajorityLearner().smartLearn(examples, weightID, contingency, domainDistributions, classDistribution);
if (isLeaf) {
if (storeContingencies)
utreeNode->contingency = contingency;
if (storeDistributions)
utreeNode->distribution = classDistribution;
return treeNode;
}
}
utreeNode->contingency = contingency;
utreeNode->distribution = classDistribution;
float quality;
int spentAttribute;
utreeNode->branchSelector = split->call(utreeNode->branchDescriptions, utreeNode->branchSizes, quality, spentAttribute,
examples, weightID, contingency,
apriorClass ? apriorClass : classDistribution,
candidates, utreeNode->nodeClassifier);
isLeaf = !utreeNode->branchSelector;
bool hasNullNodes = false;
if (!isLeaf) {
if (spentAttribute >= 0) {
  if (candidates[spentAttribute])
    candidates[spentAttribute] = false;
  else
    spentAttribute = -1;
}
/* BEWARE: If you add an additional 'return' in the code below,
do not forget to restore the candidate's flag. */
utreeNode->branches = mlnew TTreeNodeList();
vector<int> newWeights;
PExampleGeneratorList subsets = exampleSplitter->call(treeNode, examples, weightID, newWeights);
if (!utreeNode->branchSizes)
utreeNode->branchSizes = branchSizesFromSubsets(subsets, weightID, newWeights);
if (!storeContingencies)
utreeNode->contingency = PDomainContingency();
if (!storeDistributions)
utreeNode->distribution = PDistribution();
vector<int>::iterator nwi = newWeights.begin(), nwe = newWeights.end();
PITERATE(TExampleGeneratorList, gi, subsets) {
if ((*gi)->numberOfExamples()) {
utreeNode->branches->push_back(call(*gi, (nwi!=nwe) ? *nwi : weightID, apriorClass, candidates, depth+1));
// Can store pointers to examples: the original is stored in the root
if (storeExamples && utreeNode->branches->back())
utreeNode->branches->back()->examples = *gi;
else if ((nwi!=nwe) && *nwi && (*nwi != weightID))
examples->removeMetaAttribute(*nwi);
}
else {
utreeNode->branches->push_back(PTreeNode());
hasNullNodes = true;
}
if (nwi!=nwe)
nwi++;
}
/* If I set it to false, it must have been true before
   (otherwise, my TreeSplitConstructor wouldn't be allowed to spend it).
   Hence, I can simply set it back to true... */
if (spentAttribute>=0)
candidates[spentAttribute] = true;
}
else { // no split constructed
if (!storeContingencies)
utreeNode->contingency = PDomainContingency();
if (!storeDistributions)
utreeNode->distribution = PDistribution();
}
if (isLeaf || hasNullNodes) {
if (!utreeNode->nodeClassifier)
utreeNode->nodeClassifier = nodeLearner
? nodeLearner->smartLearn(examples, weightID, contingency, domainDistributions, classDistribution)
: TMajorityLearner().smartLearn(examples, weightID, contingency, domainDistributions, classDistribution);
}
return treeNode;
}
PDiscDistribution TTreeLearner::branchSizesFromSubsets(PExampleGeneratorList subsets, const int &weightID, const vector<int> &weights) const
{
TDiscDistribution *bs = mlnew TDiscDistribution();
PDiscDistribution branchSizes = bs;
vector<int>::const_iterator wi (weights.begin());
bool hasWeights = wi != weights.end();
PITERATE(TExampleGeneratorList, egli, subsets) {
float totw = 0.0;
int wID = hasWeights ? *(wi++) : weightID;
if (!wID)
totw = (*egli)->numberOfExamples();
if (wID || (totw < 0)) {
  // numberOfExamples may return -1 when the count is unknown; start the sum afresh
  totw = 0.0;
  PEITERATE(ei, *egli)
    totw += WEIGHT2(*ei, wID);
}
bs->push_back(totw);
}
return branchSizes;
}
TTreeClassifier::TTreeClassifier(PDomain dom, PTreeNode atree, PTreeDescender adescender)
: TClassifierFD(dom),
tree(atree),
descender(adescender)
{}
/* Classification is somewhat complex due to the descender component.
   If the descender does not require voting (i.e. always descends to a single
   node), the methods simply use the descender to find a node and use the
   corresponding nodeClassifier.
   If the descender requires a vote, it indicates this by returning
   branch weights. The method responds by calling 'vote'. Voting is
   always done by a weighted sum of class probabilities (even when
   the outcome should be a single value, not a distribution).
   The 'vote' method thus calls classDistribution for each subnode.
   classDistribution again uses the descender to descend from the subnode.
   If it requires a vote again, classDistribution will again call
   'vote'. We thus have mutually recursive calls of 'vote'
   and 'classDistribution' - not at all levels of the tree but
   only at those where the descender takes a break... */
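/* A minimal sketch of the weighting arithmetic behind 'vote', using plain
   std::vector stand-ins for Orange's distribution types; every name in it is
   a hypothetical illustration, not Orange API, and the block is disabled so
   it takes no part in the build. */
#if 0
#include <vector>

// Multiply each branch's class probability vector by its branch weight,
// sum the results, and renormalize into a single distribution.
static std::vector<double> voteSketch(const std::vector< std::vector<double> > &branchDists,
                                      const std::vector<double> &branchWeights)
{
  std::vector<double> summed(branchDists.empty() ? 0 : branchDists.front().size(), 0.0);
  double total = 0.0;
  for (size_t b = 0; b < branchDists.size(); b++) {
    for (size_t c = 0; c < summed.size(); c++)
      summed[c] += branchWeights[b] * branchDists[b][c];
    total += branchWeights[b];
  }
  if (total > 0.0)
    for (size_t c = 0; c < summed.size(); c++)
      summed[c] /= total;
  return summed;
}
#endif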
PDistribution TTreeClassifier::findNodeDistribution(PTreeNode node, const TExample &exam)
{
if (node->distribution)
return node->distribution;
if (node->contingency && node->contingency->classes)
return node->contingency->classes;
if (node->nodeClassifier) {
PDistribution dist = node->nodeClassifier->classDistribution(exam);
if (dist)
return dist;
}
if (classVar->varType == TValue::INTVAR) {
const int nval = classVar->noOfValues();
if (nval)
return mlnew TDiscDistribution(nval, 1.0/nval);
}
return PDistribution();
}
TValue TTreeClassifier::findNodeValue(PTreeNode node, const TExample &exam)
{
PDistribution dist = findNodeDistribution(node, exam);
if (dist)
return dist->highestProbValue(exam);
else
return TValue(0.0);
}
TValue TTreeClassifier::operator()(const TExample &exam)
{
checkProperty(descender);
const bool convertEx = domain && (exam.domain != domain);
TExample convex = convertEx ? TExample(domain, exam) : TExample();
const TExample &refexam = convertEx ? convex : exam;
PDiscDistribution branchWeights;
PTreeNode node = descender->call(tree, refexam, branchWeights);
if (!branchWeights) {
if (node->nodeClassifier)
return node->nodeClassifier->call(refexam);
}
else {
    // Voting required: combine the subtrees' class distributions as described
    // above (the exact signature of 'vote' is assumed here).
    PDistribution dist = vote(node, refexam, branchWeights);
    if (dist)
      return dist->highestProbValue(refexam);
  }
  return findNodeValue(node, refexam);
}