⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 id3_proto.c

📁 商业智能用 决策树 Id3算法 C语言
💻 C
字号:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <string.h>
#include <conio.h>
#include <time.h>

#include "id3.h"
#include "proto.h"

MATRIX *build_matrix (UINT width, UINT height)
{
    MATRIX *_matrix;
    UINT i;

    _matrix = (MATRIX*) malloc (sizeof (MATRIX));
    if (!_matrix)
        err_exit (__FILE__, __LINE__);

    _matrix->width  = width;
    _matrix->height = height;

    _matrix->data = (REAL**) malloc (height * sizeof (REAL*));
    if (_matrix->data == NULL)
        err_exit(__FILE__, __LINE__);

    for (i=0; i<height; i++)
    {
        _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));
        if (_matrix->data[i] == NULL)
            err_exit(__FILE__, __LINE__);
    }
    return _matrix;
}



void err_exit (CHAR* file, UINT line)				/* Standard error handler function */
{
    printf("\n Fatal error in file %s, line %u", file, line);
    exit(0);
}

void file_size (CHAR *file_name, UINT *width, UINT *height)
{
    FILE *f;
    UINT buf_size = 0xFF, _width = 0;
    CHAR *buffer, *ptr;

    *width = *height = 0;

    buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
    if (buffer == NULL)
        err_exit (__FILE__, __LINE__);

    f = fopen(file_name, "r");					/* Open price file - abort if filename invalid */
    if (f == NULL)
    {
        printf("\n File not found : %s\n", file_name);
        err_exit (__FILE__, __LINE__);
    }

    if (fgets(buffer, buf_size, f) != NULL)			/* Get number of entries in first row */
    {
        ++*height;
        ptr = strtok (buffer, " ,");
        while (ptr != NULL)
        {
            ++*width;
            ptr = strtok (NULL, " ,");
        }
    }

    while (!feof(f))						/* Count numbers of subsequent rows */
    {
        if (fgets(buffer, buf_size, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))  		/* if line is more than a NL char */
            {
                ++*height;
                _width = 0;
                ptr = strtok (buffer, " ,");
                while (ptr != NULL)
                {
                    ++_width;
                    ptr = strtok (NULL, " ,");
                }

                if (*width != _width)
                {
                    printf("\n Number of entries in file %s did not agree", file_name);
                    err_exit (__FILE__, __LINE__);
                }
            }
        }
    }
    free (buffer);
}

void free_matrix (MATRIX *_matrix)
{
    UINT i;
    for (i=0; i<_matrix->height; i++)
        free (_matrix->data[i]);

    free (_matrix->data);
    free (_matrix);
}

void free_tags ( CHAR** varname, UINT width)
{
    UINT i;
    for (i=0; i<width; i++)
        free(varname[i]);
    free (varname);
}

void free_tree ( NODE  *node )
{
    if (node == NULL)						/* Frees the memory allocated to a tree structure */
        return;
    else
    {
        free_tree (node->on);
        free_tree (node->off);
        free(node);
    }

}

NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)
{
    NEGENTROPY negentropy_struct;				/* Routine to build a decision tree, based on Quinlan's ID3 algorithm */
    NODE *node;
    UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
    REAL **data = matrix->data;
    REAL best_threshold, min_negentropy, _negentropy;

    node = (NODE*) malloc (sizeof(NODE));			/* Allocate memory for this node */
    if (!node)
        err_exit (__FILE__, __LINE__);

    node->parent = parent;  					/* Set address of parent node */

    if (parent != NULL) 					/* parent to child; not relevant for root node */
    {
       
        if (state == ON)					/* Pass address of this node to the parent node */
            parent->on = node;
        else
            if (state == OFF)
                parent->off = node;
    }

    min_negentropy = 1.0;

    for (i=0; i<n_vars; i++)
    {

        for (j=0; j<n_samples; j++) 
        {

            if (i != target) 
            {
		node->idx = i;					/* Set trial values for this node */
                node->threshold = data[j][i];

                negentropy_struct = negentropy (data, n_samples, node, target); /* calculate the negentropy of this partition */

                _negentropy = negentropy_struct.ne;

                if (_negentropy <min_negentropy) 		/* If this negentropy is lower than any other, return index and threshold for future use */
                {
		    printf("%d\n",_negentropy);
                    min_negentropy = _negentropy;
                    split = i;
                    best_threshold = data[j][i];
                }

            } 							

        } 							

    } 								

    node->idx = split;						/* Save the combination of best attribute and threshold value */
    node->threshold = best_threshold;

    if  (negentropy_struct.status != INACTIVE) 
    {
        node->on = node->off = NULL;
        node->idx = negentropy_struct.status;
    }
    else
    {
        node->on  = ID3 (matrix, node, target, ON);
        node->off = ID3 (matrix, node, target, OFF);
    }

    return node;
}

void main (int argv, char *argc[])
{
    MATRIX	*matrix;
    NODE	*node;
    UINT	target, n_vars, n_samples;
    CHAR	data_file[13]="data.txt", tag_file[13]="vag.txt";
    CHAR	**tag_names;

    file_size (data_file, &n_vars, &n_samples);			/* Read dimensions of data file */

    tag_names = read_tags (tag_file, n_vars);     		/* Read labels for columns of data */ 

    matrix = build_matrix (n_vars, n_samples);			/* Allocate storage for data */

    read_matrix (data_file, matrix);				/* read it from disk */

    target = n_vars - 1;					/* Classification target is last column */

    node = ID3 ( matrix, NULL, target, 0 );			/* Return root of decision tree - ID3 continues to call itself recursively */

    print_tree(node, tag_names);

    printf("\n");

    free_tags (tag_names, n_vars);
    free_matrix(matrix);
    free_tree (node);
}

NEGENTROPY negentropy ( REAL **data,
    UINT   n_samples,
    NODE   *local,
    UINT   target)
{
    NEGENTROPY ret_val;
    NODE *_node, *_parent;
    UINT on_ctr, off_ctr, p1, p2, i, _match;
    REAL p_on, p_off, negentropy_on, negentropy_off;

    on_ctr = off_ctr = p1 = p2 = 0;

    for (i=0; i<n_samples; i++)					/* Scan through all supplied data samples */ 
    {
	 _match = 1;
        _node = local;
        while (_node->parent != NULL) 
        { 
            _parent = _node->parent;				/* If at the root node, all entries match*/
            if (_node == _parent->on) 
            { 
                if (data[i][_parent->idx] < _parent->threshold)	/* if parent node is ON */
                    _match = 0;
            }
            else
                if (_node == _parent->off) 
                { 
                    if (data[i][_parent->idx] >= _parent->threshold) /* if parent node is OFF */
                        _match = 0;
                }
            _node = _parent;
        }

        if (_match) 
        {
            if (data[i][local->idx] >= local->threshold) 
            {
                on_ctr++;
                if (data[i][target] >= 0.5)
                    p1++;
            }
            else 
            {
                off_ctr++;
                if (data[i][target] >= 0.5)
                    p2++;
            }
        }
    }   

   if (on_ctr > 0)						 /* Entropy of subtable with activation ON */
    {
        p_on  = (REAL) p1 / (REAL) on_ctr;
        p_off = 1 - p_on;
        negentropy_on = -entropy (p_on) - entropy (p_off);
    }
    else
        negentropy_on = 0.0;

    if (off_ctr > 0)						/* Entropy of subtable with activation OFF */
    {
        p_on  = (REAL) p2 / (REAL) off_ctr;
        p_off = 1 - p_on;
        negentropy_off = -entropy (p_on) - entropy (p_off);
    }
    else
        negentropy_off = 0.0;

    ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr);
    ret_val.ne /= (on_ctr + off_ctr);

    if ((p1 == on_ctr) && (p2 == off_ctr))
        ret_val.status = ON;
    else if ((p1 == 0) && (p2 == 0))
        ret_val.status = OFF;
    else
        ret_val.status = INACTIVE;

    return ret_val;

}

void print_tree (NODE *node, CHAR** tag_names)
{
    if (node->on == NULL) 					/* Displays algebraic equivalent of the decision tree */
    {
        if (node->idx == ON)
            printf("ON");
        else if (node->idx == OFF)
            printf("OFF");
        return;
    }

    else
        printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold);

    print_tree ( node->on, tag_names );
    printf(" else ");
    print_tree ( node->off, tag_names );
    printf( " } " );
}

void read_matrix (CHAR filename[], MATRIX *matrix)

{
    UINT i, j;
    UINT height = matrix->height;
    UINT width  = matrix->width;
    REAL **data = matrix->data;
    FILE *f;

    if ((f = fopen(filename, "r")) == NULL) 			/* Open price file */
    {
        printf("\n File not found : %s\n", filename);
        err_exit (__FILE__, __LINE__);
    }

    for (i=0; i<height; i++)
        for (j=0; j<width; j++)
        {
            fscanf(f, "%lf\n", &data[i][j] );
        }

    fclose(f);

}

CHAR **read_tags (CHAR *tag_file, UINT width)
{
    FILE *f;
    CHAR **_varname;
    UINT i;
    CHAR buffer[0xFF];

    f = fopen(tag_file, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", tag_file);
        err_exit (__FILE__, __LINE__);
    }

    _varname = (CHAR**) malloc (width * sizeof (CHAR*));
    for (i=0; i<width; i++)
        _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR));

    i = 0;
    while (!feof(f))
    {
        if (fgets(buffer, 0xFF, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))
            {
                if (i>width-1)
                {
                    printf("\nMore variable names were detected than data items.");
                    printf("\nPlease correct this problem before proceeding");
                    exit(0);
                }
                sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]);
                i++;
            }
        }
    }

    if (i<width) 
    {
        printf("\nFewer variable names than data items were detected.");
        printf("\nPlease correct this problem before proceeding");
        exit(0);
    }

    fclose (f);

    return _varname;

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -