📄 id3_proto.c
字号:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <string.h>
#include <conio.h>
#include <time.h>
#include "id3.h"
#include "proto.h"
MATRIX *build_matrix (UINT width, UINT height)
{
MATRIX *_matrix;
UINT i;
_matrix = (MATRIX*) malloc (sizeof (MATRIX));
if (!_matrix)
err_exit (__FILE__, __LINE__);
_matrix->width = width;
_matrix->height = height;
_matrix->data = (REAL**) malloc (height * sizeof (REAL*));
if (_matrix->data == NULL)
err_exit(__FILE__, __LINE__);
for (i=0; i<height; i++)
{
_matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));
if (_matrix->data[i] == NULL)
err_exit(__FILE__, __LINE__);
}
return _matrix;
}
void err_exit (CHAR* file, UINT line) /* Standard error handler function */
{
printf("\n Fatal error in file %s, line %u", file, line);
exit(0);
}
void file_size (CHAR *file_name, UINT *width, UINT *height)
{
FILE *f;
UINT buf_size = 0xFF, _width = 0;
CHAR *buffer, *ptr;
*width = *height = 0;
buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
if (buffer == NULL)
err_exit (__FILE__, __LINE__);
f = fopen(file_name, "r"); /* Open price file - abort if filename invalid */
if (f == NULL)
{
printf("\n File not found : %s\n", file_name);
err_exit (__FILE__, __LINE__);
}
if (fgets(buffer, buf_size, f) != NULL) /* Get number of entries in first row */
{
++*height;
ptr = strtok (buffer, " ,");
while (ptr != NULL)
{
++*width;
ptr = strtok (NULL, " ,");
}
}
while (!feof(f)) /* Count numbers of subsequent rows */
{
if (fgets(buffer, buf_size, f) != NULL)
{
if (strlen(buffer) > strlen("\n")) /* if line is more than a NL char */
{
++*height;
_width = 0;
ptr = strtok (buffer, " ,");
while (ptr != NULL)
{
++_width;
ptr = strtok (NULL, " ,");
}
if (*width != _width)
{
printf("\n Number of entries in file %s did not agree", file_name);
err_exit (__FILE__, __LINE__);
}
}
}
}
free (buffer);
}
void free_matrix (MATRIX *_matrix)
{
UINT i;
for (i=0; i<_matrix->height; i++)
free (_matrix->data[i]);
free (_matrix->data);
free (_matrix);
}
void free_tags ( CHAR** varname, UINT width)
{
UINT i;
for (i=0; i<width; i++)
free(varname[i]);
free (varname);
}
void free_tree ( NODE *node )
{
if (node == NULL) /* Frees the memory allocated to a tree structure */
return;
else
{
free_tree (node->on);
free_tree (node->off);
free(node);
}
}
NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)
{
NEGENTROPY negentropy_struct; /* Routine to build a decision tree, based on Quinlan's ID3 algorithm */
NODE *node;
UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
REAL **data = matrix->data;
REAL best_threshold, min_negentropy, _negentropy;
node = (NODE*) malloc (sizeof(NODE)); /* Allocate memory for this node */
if (!node)
err_exit (__FILE__, __LINE__);
node->parent = parent; /* Set address of parent node */
if (parent != NULL) /* parent to child; not relevant for root node */
{
if (state == ON) /* Pass address of this node to the parent node */
parent->on = node;
else
if (state == OFF)
parent->off = node;
}
min_negentropy = 1.0;
for (i=0; i<n_vars; i++)
{
for (j=0; j<n_samples; j++)
{
if (i != target)
{
node->idx = i; /* Set trial values for this node */
node->threshold = data[j][i];
negentropy_struct = negentropy (data, n_samples, node, target); /* calculate the negentropy of this partition */
_negentropy = negentropy_struct.ne;
if (_negentropy <min_negentropy) /* If this negentropy is lower than any other, return index and threshold for future use */
{
printf("%d\n",_negentropy);
min_negentropy = _negentropy;
split = i;
best_threshold = data[j][i];
}
}
}
}
node->idx = split; /* Save the combination of best attribute and threshold value */
node->threshold = best_threshold;
if (negentropy_struct.status != INACTIVE)
{
node->on = node->off = NULL;
node->idx = negentropy_struct.status;
}
else
{
node->on = ID3 (matrix, node, target, ON);
node->off = ID3 (matrix, node, target, OFF);
}
return node;
}
void main (int argv, char *argc[])
{
MATRIX *matrix;
NODE *node;
UINT target, n_vars, n_samples;
CHAR data_file[13]="data.txt", tag_file[13]="vag.txt";
CHAR **tag_names;
file_size (data_file, &n_vars, &n_samples); /* Read dimensions of data file */
tag_names = read_tags (tag_file, n_vars); /* Read labels for columns of data */
matrix = build_matrix (n_vars, n_samples); /* Allocate storage for data */
read_matrix (data_file, matrix); /* read it from disk */
target = n_vars - 1; /* Classification target is last column */
node = ID3 ( matrix, NULL, target, 0 ); /* Return root of decision tree - ID3 continues to call itself recursively */
print_tree(node, tag_names);
printf("\n");
free_tags (tag_names, n_vars);
free_matrix(matrix);
free_tree (node);
}
NEGENTROPY negentropy ( REAL **data,
UINT n_samples,
NODE *local,
UINT target)
{
NEGENTROPY ret_val;
NODE *_node, *_parent;
UINT on_ctr, off_ctr, p1, p2, i, _match;
REAL p_on, p_off, negentropy_on, negentropy_off;
on_ctr = off_ctr = p1 = p2 = 0;
for (i=0; i<n_samples; i++) /* Scan through all supplied data samples */
{
_match = 1;
_node = local;
while (_node->parent != NULL)
{
_parent = _node->parent; /* If at the root node, all entries match*/
if (_node == _parent->on)
{
if (data[i][_parent->idx] < _parent->threshold) /* if parent node is ON */
_match = 0;
}
else
if (_node == _parent->off)
{
if (data[i][_parent->idx] >= _parent->threshold) /* if parent node is OFF */
_match = 0;
}
_node = _parent;
}
if (_match)
{
if (data[i][local->idx] >= local->threshold)
{
on_ctr++;
if (data[i][target] >= 0.5)
p1++;
}
else
{
off_ctr++;
if (data[i][target] >= 0.5)
p2++;
}
}
}
if (on_ctr > 0) /* Entropy of subtable with activation ON */
{
p_on = (REAL) p1 / (REAL) on_ctr;
p_off = 1 - p_on;
negentropy_on = -entropy (p_on) - entropy (p_off);
}
else
negentropy_on = 0.0;
if (off_ctr > 0) /* Entropy of subtable with activation OFF */
{
p_on = (REAL) p2 / (REAL) off_ctr;
p_off = 1 - p_on;
negentropy_off = -entropy (p_on) - entropy (p_off);
}
else
negentropy_off = 0.0;
ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);
ret_val.ne /= (on_ctr + off_ctr);
if ((p1 == on_ctr) && (p2 == off_ctr))
ret_val.status = ON;
else if ((p1 == 0) && (p2 == 0))
ret_val.status = OFF;
else
ret_val.status = INACTIVE;
return ret_val;
}
void print_tree (NODE *node, CHAR** tag_names)
{
if (node->on == NULL) /* Displays algebraic equivalent of the decision tree */
{
if (node->idx == ON)
printf("ON");
else if (node->idx == OFF)
printf("OFF");
return;
}
else
printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);
print_tree ( node->on, tag_names );
printf(" else ");
print_tree ( node->off, tag_names );
printf( " } " );
}
void read_matrix (CHAR filename[], MATRIX *matrix)
{
UINT i, j;
UINT height = matrix->height;
UINT width = matrix->width;
REAL **data = matrix->data;
FILE *f;
if ((f = fopen(filename, "r")) == NULL) /* Open price file */
{
printf("\n File not found : %s\n", filename);
err_exit (__FILE__, __LINE__);
}
for (i=0; i<height; i++)
for (j=0; j<width; j++)
{
fscanf(f, "%lf\n", &data[i][j] );
}
fclose(f);
}
CHAR **read_tags (CHAR *tag_file, UINT width)
{
FILE *f;
CHAR **_varname;
UINT i;
CHAR buffer[0xFF];
f = fopen(tag_file, "r");
if (f == NULL)
{
printf("\n File not found : %s\n", tag_file);
err_exit (__FILE__, __LINE__);
}
_varname = (CHAR**) malloc (width * sizeof (CHAR*));
for (i=0; i<width; i++)
_varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));
i = 0;
while (!feof(f))
{
if (fgets(buffer, 0xFF, f) != NULL)
{
if (strlen(buffer) > strlen("\n"))
{
if (i>width-1)
{
printf("\nMore variable names were detected than data items.");
printf("\nPlease correct this problem before proceeding");
exit(0);
}
sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);
i++;
}
}
}
if (i<width)
{
printf("\nFewer variable names than data items were detected.");
printf("\nPlease correct this problem before proceeding");
exit(0);
}
fclose (f);
return _varname;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -