⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hoeffding.cpp

📁 C4.5的简化版本是一个决策树的算法
💻 CPP
📖 第 1 页 / 共 2 页
字号:

// Tear down this node: free the example list it owns, then recursively
// destroy every child subtree. (children[] entries are owned by this node.)
Node::~Node()
{
   // Walk the singly linked list of examples, freeing each element.
   // We must grab the next pointer before deleting the current element.
   Example *example = examples;
   while(example != NULL) {
      Example *next = example->next;
      delete example;
      example = next;
   }

   // Recursively destroy every child node.
   for(int i = 0; i < num_children; i++)
      delete children[i];
}

// Push an example onto the front of this node's example list.
// A given example inhabits exactly one node (a leaf) at any one time.
void Node::add_example(Example *example)
{
   Example *old_head = examples;
   examples = example;          // new example becomes the list head
   examples->next = old_head;   // old head follows it
   ++num_examples;
}

// Record an attribute index as available for splitting at this node.
// Unlike examples, the same attribute may be listed in many nodes at once.
void Node::add_attribute(int attr)
{
  attributes[num_attrs] = attr;
  num_attrs++;
}

// Build the decision subtree rooted at this node (called on the root,
// this builds the whole tree). If all examples here share one class,
// no split is needed. Otherwise evaluate() scores every remaining
// attribute, splitter() picks the best one, partition() creates the
// children, and we recurse into each child.
//
// default_class is the majority class of this node's parent; it is used
// when no examples reach this node.
void Node::build_tree(int default_class)
{
   int x, max_class_count, class_count[MAX_CLASSES];

   if (num_examples == 0) {            // no examples reached this node
      major_class = default_class;     // inherit the parent's majority class
      return;
   }

   for (x = 0; x < Decision::num_classes; x++) class_count[x] = 0;

   // Tally the class of every example and note whether more than one
   // class is present.
   // BUG FIX: the original loop started at examples->next, so the first
   // example was never counted and the majority class could be wrong.
   int first_class = examples->get_class();
   int different_classes = 0;    // assume all examples share one class
   for(Example *example = examples; example; example = example->next) {
      class_count[example->get_class()]++;
      if(example->get_class() != first_class)
         different_classes = 1;
   }

   // Pure node: every example agrees, so no further split is required.
   if(!different_classes) {
      major_class = first_class;
      return;
   }

   // Determine the majority class among the examples at this node.
   major_class = 0;
   max_class_count = class_count[0];
   for (x = 1; x < Decision::num_classes; x++)
      if (class_count[x] > max_class_count) {
         major_class = x;
         max_class_count = class_count[x];
      }

   if (num_attrs == 0) return;  // no attributes left to split on

   // Choose the attribute that maximizes the evaluation function
   // implemented by evaluate(). Which dimension is best to split on?
   attribute = splitter();

   partition(); // Break up the node; hand every example to a child

   for(x = 0; x < num_children; x++)  // Recurse into all the subtrees
      children[x]->build_tree(major_class);
}

// This node has many examples. We build children from this node by creating
// new nodes, one each for a value of the attribute specified in "attribute"
void Node::partition()
{
   int x, y; 

   for(int val = 0; val < Decision::dimensions[attribute]->num_values; val++)
      children[num_children++] = new Node;//for every value ,build a new node

   Example *temp;
   Example *example = examples;
   while(example) {
      temp = example->next;
      children[example->get_value(attribute)]->add_example(example);/////////may be error
      example = temp;
   }
   examples = NULL;//delete the example links on the internal node

   // pass along all unused attributes 
   for (x = 0; x < num_attrs; x++)
      if (attributes[x] != attribute)
         for (y = 0; y < num_children; y++)
            children[y]->add_attribute(attributes[x]);//-------????waste!!!!!!
}

// Print the per-class example counts of this node to stdout.
void Node::print_distribution()
{
  cout << "Class distribution is ";
  int class_totals[MAX_CLASSES];   // How many examples are in each class?
  for(int x = 0; x < Decision::num_classes; x++)
    class_totals[x] = 0;

  for(Example *example = examples; example; example = example->next)
      class_totals[example->get_class()]++;

  // BUG FIX: 'x' was reused here without a declaration, relying on
  // pre-standard (old MSVC) for-loop scoping; ill-formed in ISO C++.
  for(int x = 0; x < Decision::num_classes; x++)
    cout << Decision::classes[x] << ": " << class_totals[x] << " ";

  cout << endl;
}

// Pick the dimension to split this node on: the unused attribute whose
// evaluate() score (information gain) is the LARGEST. Returns that
// attribute's index.
int Node::splitter()
{
   // Seed the search with the first remaining attribute.
   int best_dim = attributes[0];
   double best_value = evaluate(best_dim);

   for(int i = 1; i < num_attrs; i++) {
      double candidate = evaluate(attributes[i]);
      if(candidate > best_value) {
         best_value = candidate;
         best_dim = attributes[i];
      }
   }

   return best_dim;
}


// Information gain of splitting this node on dimension "dim":
// (entropy of this node) minus (disorder of the resulting subtrees,
// weighted by how many examples fall into each branch).
double Node::evaluate(int dim)
{
   // Per-class counts over all examples at this node.
   int class_totals[MAX_CLASSES];
   for(int x = 0; x < Decision::num_classes; x++)
      class_totals[x] = 0;

   for(Example *example = examples; example; example = example->next)
      class_totals[example->get_class()]++;

   // Entropy of this node: -sum p * log2(p).
   double entropy = 0;
   for(int x = 0; x < Decision::num_classes; x++)
      if(class_totals[x] > 0) {
         // BUG FIX: the original divided int by int, so prob truncated to
         // 0 (or 1) and the computed entropy was wrong.
         double prob = (double)class_totals[x] / num_examples;
         entropy += (-1) * prob * (log(prob) / log(2.0));
      }

   // Count how many examples take each value along this dimension.
   int dim_totals[MAX_VALUES];
   for(int value = 0; value < Decision::dimensions[dim]->num_values; value++)
      dim_totals[value] = 0;

   for(Example *example = examples; example; example = example->next)
      dim_totals[example->get_value(dim)]++;

   // Tally the overall total. It may differ from num_examples, because
   // some examples may have missing values on this attribute.
   double total = 0;
   for(int value = 0; value < Decision::dimensions[dim]->num_values; value++)
      total += dim_totals[value];

   // Guard: if every example is missing this attribute there is nothing
   // to weight -- avoid dividing by zero below.
   if(total == 0)
      return entropy;

   // Expected disorder after the split, weighted by branch population.
   double expected = 0;
   for(int value = 0; value < Decision::dimensions[dim]->num_values; value++)
      expected += disorder(dim, value) * dim_totals[value] / total;

   return entropy - expected;
}

// Entropy of the subset of this node's examples that take the given
// value along dimension "dim": -sum p * log2(p) over the classes present.
// Returns 0 when no example has that value.
double Node::disorder(int dim, int value)
{
   int class_totals[MAX_CLASSES];   // per-class counts within the subset
   for(int x = 0; x < Decision::num_classes; x++)
      class_totals[x] = 0;

   for(Example *example = examples; example; example = example->next)
      if(example->get_value(dim) == value)
         class_totals[example->get_class()]++;

   // BUG FIX: the loops below reused 'x' after its for-init scope ended
   // (pre-standard MSVC scoping); each loop now declares its own index.
   double total = 0;
   for(int x = 0; x < Decision::num_classes; x++)
      total += class_totals[x];

   // Empty subset: zero disorder (also avoids 0/0 in the prob below).
   if(total < 0.1)  // basically total == 0
      return 0;

   double sum = 0;
   for(int x = 0; x < Decision::num_classes; x++)
      if(class_totals[x] > 0) {
         double prob = class_totals[x] / total;   // total is double: real division
         sum += (-1) * prob * (log(prob) / log(2.0));
      }
   return sum;
}

// Print this node with "depth" amount of indentation
void Node::print(int depth)
{
   int x, y;
   if(num_children == 0) {
      for (x = 0; x < depth; x++) cout << "   ";
      cout << "  Class: " << Decision::classes[major_class] << endl;
    //  for (x = 0; x < depth; x++) cout << "   ";
   //   cout << "  Examples:" << endl;
   //   for(Example *example = examples; example; example = example->next) {
    //     for(x = 0; x < depth; x++)
      //      cout << "   ";
       //  cout << "    " << *example << endl;
     // }
   } else {
      for(x = 0; x < depth; x++)
         cout << "   ";
      cout << "Split on " << Decision::dimensions[attribute]->get_attribute()
            << ":" << endl;
      for(y = 0; y < num_children; y++) { 
         for(x = 0; x < depth; x++)
            cout << "   ";
         cout << "---"
              << Decision::dimensions[attribute]->values[y] << ": " << endl;
         children[y]->print(depth+1);
      }
   }
}

// Program entry point. Reads the data filename from the command line
// (argv[1]); if none is given, falls back to the original hard-coded
// path so existing usage keeps working. Builds the decision tree via
// the Decision object and prints the finished tree.
// BUG FIX: the old comment promised command-line input but the code
// ignored argv entirely and always used the hard-coded path.
int main(int argc, char *argv[])
{
   const char *filename = (argc > 1) ? argv[1] : "G:\\data\\dctree\\long.txt";

   Decision decision(filename);
   decision.build_tree();
   cout << decision << endl;

   return 0;
}





⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -