📄 c4.5.cpp
字号:
case 't':
trials = atoi((*oi).second.c_str());
batch = false;
if ((trials<1) || (trials>10000)) {
trials=10;
raiseError("parseCommandLine: invalid argument for -t");
}
break;
case 'w':
window = atoi((*oi).second.c_str());
batch = false;
if ((window<1) || (window>1000000)) {
window = 0;
raiseError("parseCommandLine: invalid argument for -w");
}
break;
case 'i':
increment = atoi((*oi).second.c_str());
batch = false;
if ((increment<1) || (increment>1000000)) {
increment = 0;
raiseError("parseCommandLine: invalid argument for -i");
}
break;
case 'g':
gainRatio = false;
break;
case 's':
subset = true;
break;
case 'm':
minObjs = atoi((*oi).second.c_str());
if ((minObjs<1) || (minObjs>1000000)) {
minObjs = 2;
raiseError("parseCommandLine: invalid argument for -m");
}
break;
case 'c':
cf = atof((*oi).second.c_str());
if ((cf<=0) || (cf>100)) {
cf = 0.25;
raiseError("parseCommandLine: invalid argument for -c");
}
break;
}
return true;
}
/* Parameter conversion hook. This implementation performs no work and
   always reports success. */
bool TC45Learner::convertParameters()
{
  return true;
}
/* Learns a C4.5 tree from 'gen' and returns it as a classifier.

   Raises an error when the domain has no class variable. The examples are
   handed over through convertGenerator(), c45learn() builds the tree with the
   learner's current settings, and the result is copied into a TC45TreeNode
   structure before c45garbage() and clearGenerator() are called.

   When 'convertToOrange' is set, the C4.5 classifier is converted with
   asTreeClassifier() (receiving 'gen', 'weight', 'storeContingencies' and
   'storeExamples') and that conversion is returned; otherwise the
   TC45Classifier itself is returned. */
PClassifier TC45Learner::operator ()(PExampleGenerator gen, const int &weight)
{
  if (!gen->domain->classVar)
    raiseError("class-less domain");

  convertGenerator(gen);
  Tree c45tree = (Tree)c45learn(trials, gainRatio, subset, batch, probThresh, minObjs, window, increment, cf, prune);

  // Copy the learned tree into our own node structure before cleaning up.
  PC45TreeNode rootNode = mlnew TC45TreeNode(c45tree, gen->domain);
  TC45Classifier *classifier = mlnew TC45Classifier(gen->domain, rootNode);
  // The wrapper keeps 'classifier' referenced for the rest of this function.
  PClassifier wrapped = classifier;

  c45garbage();
  clearGenerator();

  if (!convertToOrange)
    return wrapped;

  return PClassifier(classifier->asTreeClassifier(gen, weight, storeContingencies, storeExamples));
}
// Default constructor: builds a node that is not connected to any tree.
// Wrapped members (classDist, tested, mapping, branch) are default-constructed,
// the numeric split parameters are zeroed and 'items' is set to -1.
TC45TreeNode::TC45TreeNode()
// NOTE(review): 4 is presumably a value outside the Leaf/Branch/Cut/Subset
// node types tested elsewhere in this file, i.e. "invalid" -- confirm against
// the enum declaration.
: nodeType(4),
leaf(TValue::INTVAR),
items(-1),
classDist(),
tested(),
cut(0),
lower(0),
upper(0),
mapping(),
branch()
{}
// Builds a TC45TreeNode (recursively, with all its branches) from a C4.5 tree
// record.
//   node   - the C4.5 tree record to copy from
//   domain - domain supplying the class variable and the attribute list that
//            node->Tested indexes into
TC45TreeNode::TC45TreeNode(const Tree &node, PDomain domain)
: nodeType(node->NodeType),
leaf(TValue(node->Leaf)),
items(node->Items),
classDist(mlnew TDiscDistribution(domain->classVar)),
// The initializer below reads the member 'nodeType', so it relies on
// 'nodeType' being declared (and hence initialized) before 'tested' in the
// class -- NOTE(review): confirm in the header.
tested(nodeType != Leaf ? domain->attributes->operator[](node->Tested) : PVariable()),
cut(node->Cut),
lower(node->Lower),
upper(node->Upper),
mapping(),
branch()
{
// Copy the per-class counts; one entry per value of the class variable.
float *cd = node->ClassDist; // no +1
int i, e;
for(i = 0, e = domain->classVar.AS(TEnumVariable)->values->size(); i!=e; i++, cd++)
classDist->setint(i, float(*cd));
if (nodeType != Leaf) {
// Internal node: convert each of the node->Forks subtrees recursively.
branch = mlnew TC45TreeNodeList;
// Element 0 of node->Branch is skipped -- presumably C4.5 stores branches
// starting at index 1; confirm against the C4.5 sources.
Tree *bi = node->Branch+1;
for(i = node->Forks; i--; bi++)
branch->push_back(mlnew TC45TreeNode(*bi, domain));
}
if (nodeType == Subset) {
// Build 'mapping': for each value index of the tested attribute, the index
// of the branch whose subset contains it, or -1 if no subset contains it.
int ve = tested.AS(TEnumVariable)->values->size();
mapping = mlnew TIntList(ve, -1);
// As with Branch, element 0 of node->Subset is skipped.
char **si = node->Subset+1;
for(i = 0, e = node->Forks; i!=e; si++, i++)
for(int vi = 0; vi<ve; vi++)
// In() is queried with vi+1 -- presumably C4.5 numbers attribute values
// from 1; confirm against the In() macro definition.
if (In(vi+1, *si))
mapping->operator [](vi) = i;
}
}
/* Combines the predictions of all branches of this node.

   Each branch's class distribution for 'example' is multiplied by that
   branch's 'items', the products are summed, and the sum is multiplied by
   1.0/items of this node. Called by classDistribution() when the example
   cannot be sent down a single branch. */
PDiscDistribution TC45TreeNode::vote(const TExample &example, PVariable classVar)
{
  PDiscDistribution combined = mlnew TDiscDistribution(classVar);

  PITERATE(TC45TreeNodeList, child, branch) {
    PDiscDistribution childDist = (*child)->classDistribution(example, classVar);
    childDist->operator *= ((*child)->items);
    combined->operator += (childDist);
  }

  combined->operator *= (1.0/items);
  return combined;
}
#undef min
/* Returns the class distribution that the subtree rooted at this node
   predicts for 'example'.

   Leaf:   if the leaf covers any items, a copy of 'classDist' multiplied by
           1.0/items; otherwise a distribution with 1.0 on the leaf's class.
   Branch: descends into the branch indexed by the tested attribute's value.
   Cut:    descends into branch 0 when the value is <= cut, else branch 1.
   Subset: descends into the branch given by 'mapping' for the value.

   When the tested value is special (isSpecial()), or its index does not
   select a branch (negative, past the end, or mapped to -1), the result is
   vote() over all branches instead.

   Raises an error for an unrecognized 'nodeType'. */
PDiscDistribution TC45TreeNode::classDistribution(const TExample &example, PVariable classVar)
{
  if (nodeType == Leaf) {
    if (items > 0) {
      PDiscDistribution res = CLONE(TDiscDistribution, classDist);
      res->operator *= (1.0/items);
      return res;
    }
    else {
      // Nothing was counted at this leaf; put all the mass on its class.
      PDiscDistribution res = mlnew TDiscDistribution(classVar);
      res->operator[](leaf.intV) = 1.0;
      return res;
    }
  }

  // Take the value from the example if its domain has the attribute,
  // otherwise have the attribute compute it.
  int varnum = example.domain->getVarNum(tested, false);
  const TValue &val = (varnum != ILLEGAL_INT) ? example[varnum] : tested->computeValue(example);
  if (val.isSpecial())
    return vote(example, classVar);

  switch (nodeType) {
    // case Leaf: - taken care of above
    case Branch:
      // Negative indices were previously rejected only through the implicit
      // signed-to-unsigned conversion; the check is now explicit.
      if ((val.intV < 0) || (val.intV >= int(branch->size())))
        return vote(example, classVar);
      else
        return branch->operator[](val.intV)->classDistribution(example, classVar);

    case Cut:
      return branch->operator[](val.floatV <= cut ? 0 : 1)->classDistribution(example, classVar);

    case Subset:
      // The bound must be '>=': an index equal to mapping->size() is already
      // one past the last element and must not be used to index 'mapping'.
      if ((val.intV < 0) || (val.intV >= int(mapping->size())) || (mapping->operator[](val.intV) < 0))
        return vote(example, classVar);
      else
        return branch->operator[](mapping->operator[](val.intV))->classDistribution(example, classVar);

    default:
      raiseError("invalid 'nodeType'");
  }

  return PDiscDistribution();
}
// Constructs a classifier over 'domain' (passed on to TClassifierFD) that
// predicts with the C4.5 tree rooted at 'atree'.
TC45Classifier::TC45Classifier(PDomain domain, PC45TreeNode atree)
: TClassifierFD(domain),
tree(atree)
{}
/* Predicts the class of 'oexample'.

   This is defined separately (rather than derived from the distribution by
   generic code) to ensure that the first class is selected in case of a tie:
   a later class replaces the current best only when its value is strictly
   greater.

   If the example comes from a different domain it is first converted to the
   classifier's domain. */
TValue TC45Classifier::operator ()(const TExample &oexample)
{
  checkProperty(tree);

  PDiscDistribution probs;
  if (oexample.domain != domain) {
    TExample converted(domain, oexample);
    probs = tree->classDistribution(converted, classVar);
  }
  else
    probs = tree->classDistribution(oexample, classVar);

  int winner = 0;
  float winnerP = -1;
  TDiscDistribution::const_iterator it(probs->begin());
  const int nClasses = classVar.AS(TEnumVariable)->values->size();
  for(int cl = 0; cl != nClasses; cl++, it++)
    if (*it > winnerP) {
      winnerP = *it;
      winner = cl;
    }

  return TValue(winner);
}
/* Returns the normalized class distribution predicted for 'oexample'.

   An example from a different domain is converted to the classifier's domain
   before the tree is consulted. */
PDistribution TC45Classifier::classDistribution(const TExample &oexample)
{
  checkProperty(tree);

  PDiscDistribution predicted;
  if (oexample.domain != domain) {
    TExample converted(domain, oexample);
    predicted = tree->classDistribution(converted, classVar);
  }
  else
    predicted = tree->classDistribution(oexample, classVar);

  predicted->normalize();
  return predicted;
}
void TC45Classifier::predictionAndDistribution(const TExample &oexample, TValue &value, PDistribution &dist)
{
checkProperty(tree);
if (oexample.domain != domain) {
TExample example(domain, oexample);
dist = tree->classDistribution(example, classVar);
}
else
dist = tree->classDistribution(oexample, classVar);
int bestClass = 0;
float bestP = -1;
for(int cl = 0, ce = classVar.AS(TEnumVariable)->values->size(); cl!=ce; cl++) {
float td = dist->atint(cl);
if (td > bestP) {
bestP = td;
bestClass = cl;
}
}
value = TValue(bestClass);
dist->normalize();
}
#include "tdidt.hpp"
#include "classfromvar.hpp"
#include "discretize.hpp"
#include "tdidt_split.hpp"
// Converts this C4.5 node, and recursively all of its branches, into a
// TTreeNode.
//   examples           - examples reaching this node; always used for
//                        examples->domain->classVar, and split among the
//                        branches when examples or contingencies are stored
//   weightID           - id of the weight meta attribute for 'examples'
//   storeContingencies - together with storeExamples decides whether the
//                        examples are split and passed down to the branches
//   storeExamples      - if true, 'examples' and 'weightID' are stored in the
//                        new node
// Returns the newly built node.
PTreeNode TC45TreeNode::asTreeNode(PExampleGenerator examples, const int &weightID, bool storeContingencies, bool storeExamples)
{
PTreeNode newNode = mlnew TTreeNode();
// NOTE(review): 'classDist' is assigned as is (no clone is made here), so
// the normalize() below presumably also changes this C4.5 node's own
// 'classDist' -- confirm this is intended.
newNode->distribution = classDist;
newNode->distribution->normalize();
// The node classifier predicts 'leaf'; with no items at this node it gets a
// distribution holding only weight 1.0 for 'leaf' instead of 'classDist'.
if (items > 0)
newNode->nodeClassifier = mlnew TDefaultClassifier(examples->domain->classVar, leaf, classDist);
else {
TDiscDistribution *dd = mlnew TDiscDistribution(examples->domain->classVar);
dd->add(leaf, 1.0);
newNode->nodeClassifier = mlnew TDefaultClassifier(examples->domain->classVar, leaf, dd);
}
if (storeExamples) {
newNode->examples = examples;
newNode->weightID = weightID;
}
// Leaves need no selector, descriptions or branches.
if (nodeType == Leaf)
return newNode;
// branchSizes[i] = number of items in the i-th branch.
PDistribution branchSizes = mlnew TDiscDistribution;
int i = 0;
PITERATE(TC45TreeNodeList, li, branch)
branchSizes->addint(i++, (*li)->items);
newNode->branchSizes = branchSizes;
// Enum variable named after the tested attribute; the Cut and Subset cases
// fill in its values and pass it to the branch selector.
TEnumVariable *dummyVar = mlnew TEnumVariable(tested->name);
PVariable wdummyVar = dummyVar;
switch (nodeType) {
case Branch:
// One branch per attribute value: select by the attribute itself and
// describe branches with the attribute's value names.
newNode->branchSelector = mlnew TClassifierFromVar(tested, branchSizes);
newNode->branchDescriptions = mlnew TStringList(tested.AS(TEnumVariable)->values.getReference());
break;
case Cut:
// Two branches split at 'cut'; the two labels double as the values of
// dummyVar.
// NOTE(review): the labels say "<" and ">=", while
// TC45TreeNode::classDistribution sends values <= cut to branch 0 --
// confirm which side the threshold itself belongs to in
// TThresholdDiscretizer.
newNode->branchDescriptions = mlnew TStringList;
char str[128];
sprintf(str, "<%3.3f", cut);
newNode->branchDescriptions->push_back(str);
dummyVar->values->push_back(str);
sprintf(str, ">=%3.3f", cut);
newNode->branchDescriptions->push_back(str);
dummyVar->values->push_back(str);
newNode->branchSelector = mlnew TClassifierFromVar(wdummyVar, tested, branchSizes, mlnew TThresholdDiscretizer(cut));
break;
case Subset:
// Number of distinct targets in 'mapping' (largest branch index + 1).
int noval = 1 + *max_element(mapping->begin(), mapping->end());
dummyVar->values = mlnew TStringList(noval, "");
// Walk the attribute's values in step with 'mapping' and append each value
// name to the label of the branch it maps to, separated by ", ".
TStringList::const_iterator tvi(tested.AS(TEnumVariable)->values->begin());
PITERATE(TIntList, ni, mapping) {
if (*ni >= 0) {
string &val = dummyVar->values->at(*ni);
if (val.length())
val += ", ";
val += *tvi;
}
tvi++;
}
// Labels that contain a comma are wrapped as "in [...]".
PITERATE(TStringList, vi, dummyVar->values)
if ((*vi).find(",") != string::npos)
*vi = "in [" + *vi + "]";
newNode->branchSelector = mlnew TClassifierFromVar(wdummyVar, tested, branchSizes,mlnew TMapIntValue(mapping));
newNode->branchDescriptions = dummyVar->values;
break;
}
// Split the examples among the branches only when they are needed below.
// 'newWeights' is filled by the splitter -- presumably with the ids of weight
// meta attributes it created, one per branch; confirm against
// TTreeExampleSplitter_UnknownsAsBranchSizes.
vector<int> newWeights;
PExampleGeneratorList subsets;
TExampleGeneratorList::const_iterator si;
if (storeExamples || storeContingencies) {
subsets = TTreeExampleSplitter_UnknownsAsBranchSizes()(PTreeNode(newNode.getReference()), examples, weightID, newWeights);
si = subsets->begin();
}
newNode->branches = mlnew TTreeNodeList;
vector<int>::const_iterator wi(newWeights.begin()), we(newWeights.end());
// Convert every branch; a null C4.5 branch becomes a null PTreeNode.
PITERATE(TC45TreeNodeList, c45bi, branch) {
if (storeExamples || storeContingencies) {
// Each branch gets its own subset and, while 'newWeights' has entries
// left, the corresponding weight id; otherwise the original weightID.
newNode->branches->push_back(*c45bi ? (*c45bi)->asTreeNode(*(si++), wi!=we ? *wi : weightID, storeContingencies, storeExamples) : PTreeNode());
if (wi!=we) {
// The weight id has been used by the branch; remove that meta attribute
// from 'examples' and move on to the next id.
examples->removeMetaAttribute(*wi);
wi++;
}
}
else
// just call with 'examples' as argument -- they will only be used to extract examples->domain->classVar
newNode->branches->push_back(*c45bi ? (*c45bi)->asTreeNode(examples, weightID, false, false) : PTreeNode());
}
return newNode;
}
/* Converts this C4.5 classifier into a TTreeClassifier.

   The C4.5 tree is converted node by node with TC45TreeNode::asTreeNode and
   the result is paired with a TTreeDescender_UnknownMergeAsBranchSizes
   descender over examples->domain. A warning is raised when
   'storeContingencies' is requested; the flag is still passed on to
   asTreeNode. */
PTreeClassifier TC45Classifier::asTreeClassifier(PExampleGenerator examples, const int &weightID, bool storeContingencies, bool storeExamples)
{
  if (storeContingencies)
    raiseWarning("'storeContingencies' not supported yet");

  // NOTE(review): the table is built but not used afterwards; 'examples' is
  // what gets passed on. Kept because toExampleTable() may have side effects
  // -- confirm whether 'exampleTable' was meant to be used below.
  PExampleTable exampleTable = toExampleTable(examples);

  PTreeNode convertedRoot = tree->asTreeNode(examples, weightID, storeContingencies, storeExamples);
  return mlnew TTreeClassifier(examples->domain, convertedRoot, mlnew TTreeDescender_UnknownMergeAsBranchSizes());
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -