📄 id3.txt
字号:
有如下形式的样本,请问用ID3算法生成决策树的时候,怎样构造节点 NODE 的数据结构阿?
field_1 field_2 field_3 level
A1 B1 C1 V1
A2 B1 C1 V1
A1 B2 C1 V3
A3 B1 C2 V2
A3 B1 C3 V2
A3 B1 C3 V3
(field_1 field_2 field_3 都是样本的属性 level 是样本的分类 )
一般用带有指针的结构类型啦,譬如:
typedef struct node {
UINT idno /* 属性的标识号*/
REAL threshold; /* 判断阈值*/
struct node *on; /* 指向通过结点*/
struct node *off; /* 指向不通过结点*/
struct node *parent; /* 父结点*/
} NODE;
我的样本是离散值,还需要阀值吗? “指向通过结点”“指向不通过结点”是什么意思啊? 我的分类取值有v1 v2 v3 三种情况 是不是不适合这样的节点构造呢?
的,针对这种情况,应去掉其中的threshold和on、off字段。因可能是多分支的。
能否这样构造:
typedef struct node {
UINT idno /* 属性的标识号*/
string value; /* 父节点属性值*/
struct node *on; /* 指向第一个左子结点*/
struct node *off; /* 指向右兄弟结点*/
struct node *parent; /* 父结点*/
} NODE;
PROTO.H
NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );
void print_tree ( NODE* , CHAR** );
void free_tree ( NODE * );
NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );
void err_exit ( CHAR* , UINT );
MATRIX *build_matrix ( UINT, UINT );
void free_matrix ( MATRIX * );
void read_matrix ( CHAR *, MATRIX * );
void file_size ( CHAR * , UINT * , UINT * );
CHAR **read_tags ( CHAR * , UINT );
void free_tags ( CHAR **, UINT);
ID3.h
typedef unsigned int UINT;
typedef unsigned long ULONG;
typedef char CHAR;
typedef unsigned char BOOL;
typedef double REAL;
typedef struct node {
UINT idx; /* ID code for attribute */
REAL threshold; /* Numerical threshold for attribute test */
struct node *on; /* Address of 'on' node */
struct node *off; /* Address of 'off' node */
struct node *parent; /* Addess of parent node */
} NODE;
typedef struct ne_struct {
REAL ne;
UINT status;
} NEGENTROPY;
typedef struct matrix {
UINT width;
UINT height;
REAL **data;
} MATRIX;
enum UINT { INACTIVE, OFF, ON };
#define LN_2 0.693147180559945309417
#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)
/*
* FILE: id3.c
*
* Author: Andrew Colin
*
* DISCLAIMER: No liability is assumed by the author for any use made
* of this program.
*
* DISTRIBUTION: Any use may be made of this program, as long as the
* clear acknowledgment is made to the author in code and runtime
* executables
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <string.h>
#include <conio.h>
#include <time.h>
#include "id3.h"
#include "proto.h"
/*-------------------------------------------------------------------*/
MATRIX *build_matrix (UINT width, UINT height)
{
MATRIX *_matrix;
UINT i;
_matrix = (MATRIX*) malloc (sizeof (MATRIX));
if (!_matrix)
err_exit (__FILE__, __LINE__);
_matrix->width = width;
_matrix->height = height;
_matrix->data = (REAL**) malloc (height * sizeof (REAL*));
if (_matrix->data == NULL)
err_exit(__FILE__, __LINE__);
for (i=0; i<height; i++)
{
_matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));
if (_matrix->data[i] == NULL)
err_exit(__FILE__, __LINE__);
}
return _matrix;
}
/*-------------------------------------------------------------------*/
/*
* Standard error handler function
*/
void err_exit (CHAR* file, UINT line)
{
printf("\n Fatal error in file %s, line %u", file, line);
exit(0);
}
/*-------------------------------------------------------------------*/
void file_size (CHAR *file_name, UINT *width, UINT *height)
/*
* Given the name of a file of numeric data, this routine counts
* the numbers of rows and columns. It's assumed that the number
* of entries is the same in each row, and an error is flagged if this
* is not the case.
*
*/
{
FILE *f;
UINT buf_size = 0xFF, _width = 0;
CHAR *buffer, *ptr;
*width = *height = 0;
buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
if (buffer == NULL)
err_exit (__FILE__, __LINE__);
/* Open price file - abort if filename invalid */
f = fopen(file_name, "r");
if (f == NULL)
{
printf("\n File not found : %s\n", file_name);
err_exit (__FILE__, __LINE__);
}
/* Get number of entries in first row */
if (fgets(buffer, buf_size, f) != NULL)
{
++*height;
ptr = strtok (buffer, " ,");
while (ptr != NULL)
{
++*width;
ptr = strtok (NULL, " ,");
}
}
/* Count numbers of subsequent rows */
while (!feof(f))
{
if (fgets(buffer, buf_size, f) != NULL)
{
if (strlen(buffer) > strlen("\n")) /* if line is more than a NL char */
{
++*height;
_width = 0;
ptr = strtok (buffer, " ,");
while (ptr != NULL)
{
++_width;
ptr = strtok (NULL, " ,");
}
if (*width != _width)
{
printf("\n Number of entries in file %s did not agree", file_name);
err_exit (__FILE__, __LINE__);
}
}
}
}
free (buffer);
}
/*-------------------------------------------------------------------*/
void free_matrix (MATRIX *_matrix)
{
UINT i;
for (i=0; i<_matrix->height; i++)
free (_matrix->data[i]);
free (_matrix->data);
free (_matrix);
}
/*-------------------------------------------------------------------*/
void free_tags ( CHAR** varname, UINT width)
{
UINT i;
for (i=0; i<width; i++)
free(varname[i]);
free (varname);
}
/*-------------------------------------------------------------------*/
void free_tree ( NODE *node )
{
/*
* Frees the memory allocated to a tree structure
*/
if (node == NULL)
return;
else
{
free_tree (node->on);
free_tree (node->off);
free(node);
}
}
/*-------------------------------------------------------------------*/
NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)
/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */
{
NEGENTROPY negentropy_struct;
NODE *node;
UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
REAL **data = matrix->data;
REAL best_threshold, min_negentropy, _negentropy;
/* Allocate memory for this node */
node = (NODE*) malloc (sizeof(NODE));
if (!node)
err_exit (__FILE__, __LINE__);
/* Set up links in decision tree */
node->parent = parent; /* Set address of parent node */
if (parent != NULL) /* parent to child; not relevant for root node */
{
/* Pass address of this node to the parent node */
if (state == ON)
parent->on = node;
else
if (state == OFF)
parent->off = node;
}
/*
* Select attribute with lowest negentropy for splitting. Scan through
* ALL attributes (except the target) and ALL data samples. This is
* pretty inefficient for data sets with repeated values, but will do
* for illustrative purposes
*/
min_negentropy = 1.0;
for (i=0; i<n_vars; i++)
{
for (j=0; j<n_samples; j++)
{
if (i != target)
{
/* Set trial values for this node... */
node->idx = i;
node->threshold = data[j][i];
/* ...and calculate the negentropy of this partition */
negentropy_struct = negentropy (data, n_samples, node, target);
_negentropy = negentropy_struct.ne;
/* If this negentropy is lower than any other, retain the
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -