⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 id3.txt

📁 掌握目前主流的数据挖掘平台与工具。理解关联规则、频繁集、置信度、支持度的概念; 了解信息熵的概念。
💻 TXT
📖 第 1 页 / 共 2 页
字号:
有如下形式的样本,请问用ID3算法生成决策树的时候,怎样构造节点 NODE 的数据结构阿?

field_1 field_2 field_3 level

A1 B1 C1 V1 

A2 B1 C1 V1

A1 B2 C1 V3

A3 B1 C2 V2 

A3 B1 C3 V2

A3 B1 C3 V3

(field_1 field_2 field_3 都是样本的属性 level 是样本的分类 )
一般用带有指针的结构类型啦,譬如:

typedef struct node {

UINT idno /* 属性的标识号*/

REAL threshold; /* 判断阈值*/

struct node *on; /* 指向通过结点*/

struct node *off; /* 指向不通过结点*/

struct node *parent; /* 父结点*/

} NODE;

我的样本是离散值,还需要阀值吗? “指向通过结点”“指向不通过结点”是什么意思啊? 我的分类取值有v1 v2 v3 三种情况 是不是不适合这样的节点构造呢?

的,针对这种情况,应去掉其中的threshold和on、off字段。因可能是多分支的。

能否这样构造:

typedef struct node {

UINT idno /* 属性的标识号*/

string value; /* 父节点属性值*/

struct node *on; /* 指向第一个左子结点*/

struct node *off; /* 指向右兄弟结点*/

struct node *parent; /* 父结点*/

} NODE;

PROTO.H
NEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );
void print_tree ( NODE* , CHAR** );
void free_tree ( NODE  * );
NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );
void err_exit ( CHAR* , UINT );
MATRIX *build_matrix ( UINT, UINT );
void free_matrix ( MATRIX * );
void read_matrix ( CHAR *, MATRIX * );
void file_size ( CHAR * , UINT * , UINT * );
CHAR **read_tags ( CHAR * , UINT );
void free_tags ( CHAR **, UINT);

ID3.h
typedef unsigned int  UINT;
typedef unsigned long ULONG;
typedef          char CHAR;
typedef unsigned char BOOL;
typedef double        REAL;

typedef struct node {
   UINT idx; /* ID code for attribute */
   REAL threshold; /* Numerical threshold for attribute test */
   struct node *on; /* Address of 'on' node */
   struct node *off; /* Address of 'off' node */
   struct node *parent; /* Addess of parent node */
} NODE;

typedef struct ne_struct {
    REAL ne;
    UINT status;
} NEGENTROPY;

typedef struct matrix {
   UINT width;
   UINT height;
   REAL **data;
} MATRIX;

enum UINT { INACTIVE, OFF, ON };
#define LN_2 0.693147180559945309417
#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)

/*
* FILE: id3.c
*
* Author: Andrew Colin
*
* DISCLAIMER: No liability is assumed by the author for any use made
* of this program.
*
* DISTRIBUTION: Any use may be made of this program, as long as the
* clear acknowledgment is made to the author in code and runtime
* executables
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <string.h>
#include <conio.h>
#include <time.h>

#include "id3.h"
#include "proto.h"

/*-------------------------------------------------------------------*/

MATRIX *build_matrix (UINT width, UINT height)
{
    MATRIX *_matrix;
    UINT i;

    _matrix = (MATRIX*) malloc (sizeof (MATRIX));
    if (!_matrix)
        err_exit (__FILE__, __LINE__);

    _matrix->width  = width;
    _matrix->height = height;

    _matrix->data = (REAL**) malloc (height * sizeof (REAL*));
    if (_matrix->data == NULL)
        err_exit(__FILE__, __LINE__);

    for (i=0; i<height; i++)
    {
        _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL));
        if (_matrix->data[i] == NULL)
            err_exit(__FILE__, __LINE__);
    }
    return _matrix;
}

/*-------------------------------------------------------------------*/

/*
* Standard error handler function
*/

void err_exit (CHAR* file, UINT line)
{
    printf("\n Fatal error in file %s, line %u", file, line);
    exit(0);
}

/*-------------------------------------------------------------------*/

void file_size (CHAR *file_name, UINT *width, UINT *height)
/*
* Given the name of a file of numeric data, this routine counts
* the numbers of rows and columns. It's assumed that the number
* of entries is the same in each row, and an error is flagged if this
* is not the case.
*
*/
{
    FILE *f;
    UINT buf_size = 0xFF, _width = 0;
    CHAR *buffer, *ptr;

    *width = *height = 0;

    buffer = (CHAR*) malloc (buf_size * sizeof (CHAR));
    if (buffer == NULL)
        err_exit (__FILE__, __LINE__);

    /* Open price file - abort if filename invalid */
    f = fopen(file_name, "r");
    if (f == NULL)
    {
        printf("\n File not found : %s\n", file_name);
        err_exit (__FILE__, __LINE__);
    }

    /* Get number of entries in first row */
    if (fgets(buffer, buf_size, f) != NULL)
    {
        ++*height;
        ptr = strtok (buffer, " ,");
        while (ptr != NULL)
        {
            ++*width;
            ptr = strtok (NULL, " ,");
        }
    }

    /* Count numbers of subsequent rows */
    while (!feof(f))
    {
        if (fgets(buffer, buf_size, f) != NULL)
        {
            if (strlen(buffer) > strlen("\n"))  /* if line is more than a NL char */
            {
                ++*height;
                _width = 0;
                ptr = strtok (buffer, " ,");
                while (ptr != NULL)
                {
                    ++_width;
                    ptr = strtok (NULL, " ,");
                }

                if (*width != _width)
                {
                    printf("\n Number of entries in file %s did not agree", file_name);
                    err_exit (__FILE__, __LINE__);
                }
            }
        }
    }
    free (buffer);
}

/*-------------------------------------------------------------------*/

void free_matrix (MATRIX *_matrix)
{
    UINT i;
    for (i=0; i<_matrix->height; i++)
        free (_matrix->data[i]);

    free (_matrix->data);
    free (_matrix);
}

/*-------------------------------------------------------------------*/

void free_tags ( CHAR** varname, UINT width)
{
    UINT i;
    for (i=0; i<width; i++)
        free(varname[i]);
    free (varname);
}

/*-------------------------------------------------------------------*/

void free_tree ( NODE  *node )
{
    /*
     *  Frees the memory allocated to a tree structure
     */

    if (node == NULL)
        return;
    else
    {
        free_tree (node->on);
        free_tree (node->off);
        free(node);
    }

}

/*-------------------------------------------------------------------*/

NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)

/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */
{
    NEGENTROPY negentropy_struct;
    NODE *node;
    UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split;
    REAL **data = matrix->data;
    REAL best_threshold, min_negentropy, _negentropy;

    /* Allocate memory for this node */
    node = (NODE*) malloc (sizeof(NODE));
    if (!node)
        err_exit (__FILE__, __LINE__);

    /* Set up links in decision tree */
    node->parent = parent;  /* Set address of parent node */

    if (parent != NULL) /* parent to child; not relevant for root node */
    {
        /* Pass address of this node to the parent node */
        if (state == ON)
            parent->on = node;
        else
            if (state == OFF)
                parent->off = node;
    }

    /*
     * Select attribute with lowest negentropy for splitting. Scan through
     * ALL attributes (except the target) and ALL data samples. This is
     * pretty inefficient for data sets with repeated values, but will do
     * for illustrative purposes
     */

    min_negentropy = 1.0;

    for (i=0; i<n_vars; i++)
    {

        for (j=0; j<n_samples; j++) 
        {

            if (i != target) 
            {

                /* Set trial values for this node... */
                node->idx = i;
                node->threshold = data[j][i];

                /* ...and calculate the negentropy of this partition */
                negentropy_struct = negentropy (data, n_samples, node, target);

                _negentropy = negentropy_struct.ne;

                /* If this negentropy is lower than any other, retain the

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -