📄 id3algorithm(c).txt
字号:
* found.
*/
if (negentropy_struct.status != INACTIVE)
{
node->on = node->off = NULL;
node->idx = negentropy_struct.status;
}
else
{
node->on = ID3 (matrix, node, target, ON);
node->off = ID3 (matrix, node, target, OFF);
}
return node;
}
/*-------------------------------------------------------------------*/
void main (int argv, char *argc[])
{
MATRIX *matrix;
NODE *node;
UINT target, n_vars, n_samples;
CHAR data_file[13], tag_file[13]; /* Longest file name in DOS */
CHAR **tag_names;
/* Set up file names */
if (argv != 2)
{
printf("\nUsage: id3 [datafile]");
exit(0);
}
else
{
printf("\nWelcome to ID3");
printf("\nLast compiled on %s, %s", __DATE__, __TIME__);
printf("\n");
strcpy(data_file, argc[1]);
strcpy(tag_file, argc[1]);
strcat(data_file, ".dat");
strcat(tag_file, ".tag");
}
/* Read dimensions of data file */
file_size (data_file, &n_vars, &n_samples);
/* Read labels for columns of data */
tag_names = read_tags (tag_file, n_vars);
/* Allocate storage for data... */
matrix = build_matrix (n_vars, n_samples);
/* ...and read it from disk */
read_matrix (data_file, matrix);
/* Classification target is last column */
target = n_vars - 1;
/* Return root of decision tree - ID3 continues to call itself
recursively */
node = ID3 ( matrix, NULL, target, 0 );
print_tree(node, tag_names);
printf("\n");
free_tags (tag_names, n_vars);
free_matrix(matrix);
free_tree (node);
}
/*-------------------------------------------------------------------*/
NEGENTROPY negentropy ( REAL **data,
UINT n_samples,
NODE *local,
UINT target)
{
/*
* Calculates the entropy of classification of an attribute, given
* a table of attributes already used, the attribute on which splitting
* is to be taken, and the target attribute. Entropy is calculated in
* bits, so logs are taken to base 2 by dividing by LN_2.
*
* The returned value always lies in the (closed) range [0, 1].
*/
NEGENTROPY ret_val;
NODE *_node, *_parent;
UINT on_ctr, off_ctr, p1, p2, i, _match;
REAL p_on, p_off, negentropy_on, negentropy_off;
on_ctr = off_ctr = p1 = p2 = 0;
/* Scan through all supplied data samples */
for (i=0; i<n_samples; i++)
{
/*
* If pattern matches the current position in the decision tree,
* then use this vector. The match is determined by passing up
* the decision tree and checking whether 'data[idx] >= threshold'
* matches at each step, where idx and threshold are taken from
* each node in turn.
*/
_match = 1;
_node = local;
while (_node->parent != NULL)
{ /* If at the root node, all entries match*/
_parent = _node->parent;
if (_node == _parent->on)
{ /* if parent node is ON */
if (data[i][_parent->idx] < _parent->threshold)
_match = 0;
}
else
if (_node == _parent->off)
{ /* if parent node is OFF */
if (data[i][_parent->idx] >= _parent->threshold)
_match = 0;
}
_node = _parent;
}
if (_match)
{
if (data[i][local->idx] >= local->threshold)
{
on_ctr++;
if (data[i][target] >= 0.5)
p1++;
}
else
{
off_ctr++;
if (data[i][target] >= 0.5)
p2++;
}
}
} /* for (i=0; i<n_samples; i++) */
/* 1: Entropy of subtable with activation ON */
/*
* We now have the numbers of samples that match this part of the
* decision tree, and the number of samples for which the supplied
* condition are true. From these quantities we can find the negentropy of
* this partition.
*/
if (on_ctr > 0)
{
p_on = (REAL) p1 / (REAL) on_ctr;
p_off = 1 - p_on;
negentropy_on = -entropy (p_on) - entropy (p_off);
}
else
negentropy_on = 0.0;
/* 2: Entropy of subtable with activation OFF */
if (off_ctr > 0)
{
p_on = (REAL) p2 / (REAL) off_ctr;
p_off = 1 - p_on;
negentropy_off = -entropy (p_on) - entropy (p_off);
}
else
negentropy_off = 0.0;
ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);
ret_val.ne /= (on_ctr + off_ctr);
/*
* If all values examined were the same, set 'ret_val.status' to
* the target value since this will be an end-of-branch node
*/
if ((p1 == on_ctr) && (p2 == off_ctr))
ret_val.status = ON;
else if ((p1 == 0) && (p2 == 0))
ret_val.status = OFF;
else
ret_val.status = INACTIVE;
return ret_val;
}
/*-------------------------------------------------------------------*/
void print_tree (NODE *node, CHAR** tag_names)
{
/*
* Displays algebraic equivalent of the decision tree
*/
if (node->on == NULL)
{
if (node->idx == ON)
printf("ON");
else if (node->idx == OFF)
printf("OFF");
return;
}
else
printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);
print_tree ( node->on, tag_names );
printf(" else ");
print_tree ( node->off, tag_names );
printf( " } " );
}
/*-------------------------------------------------------------------*/
void read_matrix (CHAR filename[], MATRIX *matrix)
{
UINT i, j;
UINT height = matrix->height;
UINT width = matrix->width;
REAL **data = matrix->data;
FILE *f;
/* Open price file */
if ((f = fopen(filename, "r")) == NULL)
{
printf("\n File not found : %s\n", filename);
err_exit (__FILE__, __LINE__);
}
for (i=0; i<height; i++)
for (j=0; j<width; j++)
{
fscanf(f, "%lf\n", &data[i][j] );
}
fclose(f);
}
/*-------------------------------------------------------------------*/
CHAR **read_tags (CHAR *tag_file, UINT width)
{
FILE *f;
CHAR **_varname;
UINT i;
CHAR buffer[0xFF];
f = fopen(tag_file, "r");
if (f == NULL)
{
printf("\n File not found : %s\n", tag_file);
err_exit (__FILE__, __LINE__);
}
_varname = (CHAR**) malloc (width * sizeof (CHAR*));
for (i=0; i<width; i++)
_varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));
i = 0;
while (!feof(f))
{
if (fgets(buffer, 0xFF, f) != NULL)
{
if (strlen(buffer) > strlen("\n"))
{
if (i>width-1)
{
printf("\nMore variable names were detected than data items.");
printf("\nPlease correct this problem before proceeding");
exit(0);
}
sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);
i++;
}
}
}
if (i<width)
{
printf("\nFewer variable names than data items were detected.");
printf("\nPlease correct this problem before proceeding");
exit(0);
}
fclose (f);
return _varname;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -