// hoeffding.cpp
// Destroy a node. This means deleting all the dynamically allocated memory.
Node::~Node()
{
    // Delete the linked list of examples
    Example *example = examples;
    Example *temp;
    while(example) {
        temp = example->next;
        delete example;
        example = temp;
    }
    // Delete all the children nodes
    for(int x = 0; x < num_children; x++)
        delete children[x];
}
// Add an example to this node. A specific example can only inhabit
// one node (leaf) at any one time.
void Node::add_example(Example *example)
{
    example->next = examples;
    examples = example;
    num_examples++;
}
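// Note: examples are pushed onto the head of the singly linked list, so the
// most recently added example is visited first by later traversals.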
// Add an attribute's index to this node. Attributes may inhabit more
// than one node at a time.
void Node::add_attribute(int attr)
{
    attributes[num_attrs++] = attr;
}
// Build the decision subtree starting at this node. If this node is the root
// node, then this means building the entire tree! First we see if
// the examples at this node have more than one class. If not, then no
// splitting is required. If so, then we check all the dimensions
// using a special metric in evaluate() and choose one dimension to split
// on. This split generates new children nodes, and we run the build_tree()
// algorithm on all the children, creating decision subtrees.
//
// default_class is the majority class of the node's parent.
void Node::build_tree(int default_class)
{
    int x, max_class_count, class_count[MAX_CLASSES];
    if (num_examples == 0) { // No examples reached this node
        major_class = default_class;
        return;
    }
    for (x = 0; x < Decision::num_classes; x++) class_count[x] = 0;
    // print_distribution(); // Print the class distribution of this node
    // We do not want to split this node if the examples all have the
    // same class. Further differentiation is not required, because all
    // the "answers" are the same.
    int first_class = examples->get_class();
    int different_classes = 0;   // Assume all examples share one class
    class_count[first_class]++;  // Count the first example as well
    for(Example *example = examples->next; example; example = example->next) {
        class_count[example->get_class()]++;
        if(example->get_class() != first_class)
            different_classes = 1;
    }
    // Return if all examples share the same class
    if(!different_classes) {
        // cout << "Terminal node" << endl;
        major_class = first_class;
        return;
    }
    // Determine the majority class of the examples at this node.
    major_class = 0;
    max_class_count = class_count[0];
    for (x = 1; x < Decision::num_classes; x++)
        if (class_count[x] > max_class_count) {
            major_class = x;
            max_class_count = class_count[x];
        }
    if (num_attrs == 0) return; // No attributes left to split on
    // Find the attribute that maximizes a particular evaluation function
    // implemented by evaluate(). Which dimension is best to split on?
    attribute = splitter();
    // cout << "Dimension chosen: "
    //      << Decision::dimensions[attribute]->get_attribute() << endl;
    partition(); // Break up the node. Allocate all examples to the children.
    for(x = 0; x < num_children; x++) // Now recurse on all the subtrees
        children[x]->build_tree(major_class);
}
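// Illustration: if a later split sends no examples to one of the children,
// that child's build_tree() call sees num_examples == 0 and simply adopts
// default_class, i.e. the majority class computed above for its parent.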
// This node has many examples. We build children from this node by creating
// new nodes, one for each value of the attribute specified in "attribute".
void Node::partition()
{
    int x, y;
    for(int val = 0; val < Decision::dimensions[attribute]->num_values; val++)
        children[num_children++] = new Node; // One new child per value
    Example *temp;
    Example *example = examples;
    while(example) {
        temp = example->next;
        // Route the example to the child whose index matches the example's
        // value on the split attribute.
        children[example->get_value(attribute)]->add_example(example);
        example = temp;
    }
    examples = NULL; // The internal node no longer owns any examples
    // Pass every attribute except the split attribute along to each child
    for (x = 0; x < num_attrs; x++)
        if (attributes[x] != attribute)
            for (y = 0; y < num_children; y++)
                children[y]->add_attribute(attributes[x]);
}
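// Illustration (hypothetical attribute): if the split attribute "outlook"
// takes the values {sunny, overcast, rain}, partition() creates three
// children, and an example whose get_value(attribute) is 1 (overcast) is
// relinked onto children[1].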
// Print out the class distribution of this node
void Node::print_distribution()
{
    cout << "Class distribution is ";
    int class_totals[MAX_CLASSES]; // How many examples are in each class?
    for(int x = 0; x < Decision::num_classes; x++)
        class_totals[x] = 0;
    for(Example *example = examples; example; example = example->next)
        class_totals[example->get_class()]++;
    for(int x = 0; x < Decision::num_classes; x++)
        cout << Decision::classes[x] << ": " << class_totals[x] << " ";
    cout << endl;
}
// There are many dimensions that we could split on at this node.
// Decide which one is most appropriate. Returns the dimension's index.
int Node::splitter()
{
    // Go through the unused dimensions and find the one with the highest
    // information gain. We get this value from the evaluation function.
    double max_value = evaluate(attributes[0]); // Best evaluation seen so far
    int max_dim = attributes[0];                // Dimension with that evaluation
    double value;
    for(int x = 1; x < num_attrs; x++) {
        value = evaluate(attributes[x]);
        if(value > max_value) {
            max_value = value;
            max_dim = attributes[x];
        }
    }
    return max_dim;
}
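// Note: the comparison above is strict (>), so ties between attributes are
// broken in favour of the one that appears first in attributes[].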
// Assuming that we split this node along the given dimension, there
// will be a number of subtrees. Each of these subtrees has a certain
// disorder. Return the information gain: the entropy of this node minus
// the weighted disorder of the subtrees.
double Node::evaluate(int dim)
{
    // Find the class distribution of the examples at this node
    int class_totals[MAX_CLASSES]; // How many examples are in each class?
    int dim_totals[MAX_VALUES];    // How many examples have each value of dim?
    double Entropy;
    for(int x = 0; x < Decision::num_classes; x++)
        class_totals[x] = 0;
    for(Example *example = examples; example; example = example->next)
        class_totals[example->get_class()]++;
    double sum = 0;
    double prob = 0;
    for(int x = 0; x < Decision::num_classes; x++)
        if(class_totals[x] > 0) {
            // Cast to double to avoid integer division
            prob = (double)class_totals[x] / num_examples;
            sum += -prob * (log(prob) / log(2.0));
        }
    Entropy = sum;
    for(int value = 0; value < Decision::dimensions[dim]->num_values; value++)
        dim_totals[value] = 0;
    for(Example *example = examples; example; example = example->next)
        dim_totals[example->get_value(dim)]++;
    // Tally up the overall total. It may not equal num_examples, since some
    // examples may have missing values on this attribute.
    double total = 0;
    for(int value = 0; value < Decision::dimensions[dim]->num_values; value++)
        total += dim_totals[value];
    // Weighted disorder of the subtrees produced by splitting on dim
    double sum1 = 0;
    for(int value = 0; value < Decision::dimensions[dim]->num_values; value++)
        sum1 += disorder(dim, value) * dim_totals[value] / total;
    return Entropy - sum1;
}
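// Worked example (hypothetical numbers): with 8 examples split 4/4 across
// two classes, Entropy = 1.0 bit. If the candidate dimension sends the two
// classes into two pure branches of 4 examples each, every disorder() term
// is 0, so sum1 = 0 and the information gain returned is 1.0 - 0.0 = 1.0.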
// Given a dimension to split on and one of its values, how disordered
// (in bits of entropy) is the subset of this node's examples that has
// that value?
double Node::disorder(int dim, int value)
{
    int class_totals[MAX_CLASSES]; // How many examples are in each class?
    for(int x = 0; x < Decision::num_classes; x++)
        class_totals[x] = 0;
    for(Example *example = examples; example; example = example->next)
        if(example->get_value(dim) == value)
            class_totals[example->get_class()]++;
    double total = 0;
    for(int x = 0; x < Decision::num_classes; x++)
        total += class_totals[x]; // Number of examples with this value of dim
    if(total < 0.1) // Basically total == 0: no examples, no disorder
        return 0;
    double sum = 0;
    double prob = 0;
    for(int x = 0; x < Decision::num_classes; x++)
        if(class_totals[x] > 0) {
            prob = class_totals[x] / total;
            sum += -prob * (log(prob) / log(2.0)); // Entropy in base 2 (bits)
        }
    return sum;
}
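// Worked example (hypothetical numbers): a value whose subset holds 3
// examples of one class and 1 of another has disorder
//     -(3/4)*log2(3/4) - (1/4)*log2(1/4) ≈ 0.811 bits.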
// Print this node with "depth" levels of indentation
void Node::print(int depth)
{
    int x, y;
    if(num_children == 0) {
        for (x = 0; x < depth; x++) cout << " ";
        cout << " Class: " << Decision::classes[major_class] << endl;
        // for (x = 0; x < depth; x++) cout << " ";
        // cout << " Examples:" << endl;
        // for(Example *example = examples; example; example = example->next) {
        //     for(x = 0; x < depth; x++)
        //         cout << " ";
        //     cout << " " << *example << endl;
        // }
    } else {
        for(x = 0; x < depth; x++)
            cout << " ";
        cout << "Split on " << Decision::dimensions[attribute]->get_attribute()
             << ":" << endl;
        for(y = 0; y < num_children; y++) {
            for(x = 0; x < depth; x++)
                cout << " ";
            cout << "---"
                 << Decision::dimensions[attribute]->values[y] << ": " << endl;
            children[y]->print(depth+1);
        }
    }
}
// The main() function is the first function called in this program. It
// creates a Decision object from a hard-coded input data file, runs the
// build_tree() method, and finally prints the completed decision tree.
int main()
{
    Decision decision("G:\\data\\dctree\\long.txt");
    decision.build_tree();
    cout << decision << endl;
    return 0;
}
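// Build note (assumption: the Decision, Node, and Example classes are defined
// in companion sources elsewhere in this project; their file names are not
// shown here): compiling this translation unit together with those sources,
// e.g. with g++, and running the resulting binary reads the data file above,
// builds the tree, and prints it.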