📄 dtree1.c
字号:
/*---------------------------------------------------------------------- File : dtree1.c Contents: decision and regression tree management, base functions Author : Christian Borgelt History : 12.06.1997 file created (as dtree.c) 08.08.1997 first version completed 25.08.1997 multiple child references made possible 30.08.1997 treatment of new values during pruning enabled 25.08.1997 multiple child references replaced by links 02.02.1998 links between children simplified 03.02.1998 adapted to changed interface of module 'table' 27.05.1998 treatment of val == cut in _exec changed 31.05.1998 rule extraction adapted to interface changes 15.10.1998 major revision completed 20.10.1998 output of relative class frequencies added 15.12.1998 minor improvements, assertions added 13.11.1999 bug in parse error reporting removed 02.12.2000 growing and pruning functions moved to dtree2.c 16.12.2000 decision tree parsing simplified 17.12.2000 regression tree functionality added 22.07.2001 all global variables removed 27.08.2001 bug in _aggr concerning node->cls fixed 12.10.2001 parse error recovery corrected/improved 11.01.2002 number of attributes stored in DTREE structure 12.08.2004 adapted to new module parse----------------------------------------------------------------------*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include <assert.h>#include "vecops.h"#include "dtree.h"#ifdef STORAGE#include "storage.h"#endif/*---------------------------------------------------------------------- Preprocessor Definitions----------------------------------------------------------------------*/#define EPSILON 1e-6 /* to handle roundoff errors */#define BS_PATH 32 /* block size for the path vector */#define islink(d,n) (((d)->link >= (n)->vec) && \ ((d)->link < (n)->vec +(n)->size))/*---------------------------------------------------------------------- Type Definitions----------------------------------------------------------------------*/typedef struct { /* --- decision tree output info --- */ FILE *file; /* output file */ ATTSET *attset; /* attribute set of decision tree */ int type; /* target attribute type */ int ind; /* indentation (in characters) */ int pos; /* position in output line */ int max; /* maximal line length */ int mode; /* description mode, e.g. DT_ALIGN */ char buf[4*AS_MAXLEN+68]; /* output buffer */} DTOUT; /* (decision tree output info) */#ifdef DT_RULEStypedef struct { /* --- rule extraction info --- */ RULESET *ruleset; /* rule set to add to */ ATTSET *attset; /* attribute set */ int type; /* target type */ int subset[1]; /* subset of attribute values */} DTRULE; /* (rule extraction info) */#endif/*---------------------------------------------------------------------- Recursive Parts of Main Functions----------------------------------------------------------------------*/static void _delete (DTNODE *node){ /* --- recursively delete a subtree */ int i; /* loop variable */ DTDATA *data; /* to traverse the data vector */ assert(node); /* check the function argument */ if (!(node->flags & DT_LEAF)){/* if the node is no leaf */ for (data = node->vec +(i = node->size); --i >= 0; ) if ((--data)->child && !islink(data, node)) _delete(data->child); /* recursively delete all subtrees, */ } /* avoiding empty subtrees and links */ free(node); /* delete the node itself */} /* _delete() *//*--------------------------------------------------------------------*/static void _aggr (DTNODE *node, int n, double *buf){ /* --- aggregate node data */ int i; /* loop variable */ DTDATA *data, *dmax; /* to traverse data vector */ double *frq, *max; /* to traverse the class frequencies */ double sum; /* sum of the class frequencies */ assert(node && buf); /* check the function arguments */ /* --- leaf node --- */ if (node->flags & DT_LEAF) { /* if the node is a leaf */ if (n < 0) { /* if the target att. is numeric, */ buf[0] += node->frq; /* compute sums over f, f*v, f*v*v */ buf[1] += sum = node->frq *node->trg.f; buf[2] += node->err + sum *node->trg.f; } else { /* if the target att. is symbolic, */ assert(node->size == n); /* check the data vector size */ dmax = data = node->vec; /* get pointers to the data vector */ *buf += sum = data->frq; /* and start summing the frequencies */ while (--n > 0) { /* traverse the data vector */ sum += (++data)->frq; /* sum the class frequencies and */ *++buf += data->frq; /* aggregate them for classes */ if (data->frq > dmax->frq) dmax = data; } /* update the maximal frequency */ node->frq = (float)sum; /* note the node frequency and */ node->err = sum -dmax->frq; /* the number of errors */ node->trg.i = (int)(dmax -node->vec); } /* determine the most freq. class */ return; /* leaves terminate the recursion, */ } /* so abort the function */ /* --- test node --- */ for (frq = buf +abs(n) +(i = abs(n)); --i >= 0; ) *--frq = 0; /* clear the next buffer section */ for (data = node->vec +(i = node->size); --i >= 0; ) { if ((--data)->child && !islink(data, node)) _aggr(data->child, n, frq); } /* aggregate data recursively */ if (n < 0) { /* if the target att. is numeric, */ buf[0] += frq[0]; /* aggregate sums over f, f*v, f*v*v */ buf[1] += frq[1]; /* for the next higher level and */ buf[2] += frq[2]; /* compute the node information */ sum = (frq[0] > 0) ? frq[1] /frq[0] : 0; node->frq = (float)frq[0]; /* note the node frequency and the */ node->err = frq[2] -sum *frq[1]; /* sum of squared errors */ node->trg.f = (float)sum; } /* note the mean value for the pred. */ else { /* if the target att. is symbolic */ max = frq; *buf += sum = *frq; while (--n > 0) { /* traverse the class frequencies */ sum += *++frq; /* sum the class frequencies and */ *++buf += *frq; /* aggregate them for classes */ if (*frq > *max) max = frq; } /* update the maximal frequency */ node->frq = (float)sum; /* note the node frequency */ node->err = sum -*max; /* and the number of errors */ node->trg.i = (int)(max-buf-1); } /* determine the most freq. class */} /* _aggr() *//*--------------------------------------------------------------------*/static void _mark (ATTSET *attset, DTNODE *node){ /* --- mark occurring attributes */ int i; /* loop variable */ DTDATA *data; /* to traverse data vector */ assert(attset && node); /* check the function arguments */ if (node->flags & DT_LEAF) /* if the node is a leaf, abort, */ return; /* otherwise mark the test attribute */ att_setmark(as_att(attset, node->attid), 1); for (data = node->vec +(i = node->size); --i >= 0; ) if ((--data)->child && !islink(data, node)) _mark(attset,data->child);/* recursively mark the attributes, */} /* _mark() */ /* avoiding empty subtrees and links *//*---------------------------------------------------------------------- Main Functions----------------------------------------------------------------------*/DTREE* dt_create (ATTSET *attset, int trgid){ /* --- create a decision tree */ int k; /* size of the buffer */ int colcnt; /* number of table columns */ int clscnt; /* number of classes (sym. target) */ ATT *att; /* target attribute */ int type; /* type of the target attribute */ DTREE *dt; /* created decision tree */ assert(attset); /* check the function argument */ colcnt = as_attcnt(attset); /* get the number of columns */ if ((trgid < 0) || (trgid >= colcnt)) trgid = colcnt -1; /* check and adapt the target id, */ att = as_att(attset, trgid); /* then get the target attribute */ type = att_type(att); /* and its type */ if (type != AT_SYM) { /* if the target att. is numeric, */ clscnt = 0; k = 3; } /* there are no classes */ else { /* if the target att. is symbolic */ clscnt = k = att_valcnt(att); assert(clscnt >= 1); /* get and check */ } /* the number of classes */ dt = (DTREE*)malloc(sizeof(DTREE)); if (!dt) return NULL; /* create a dec./reg. tree body */ dt->buf = (double*)malloc(k *sizeof(double)); if (!dt->buf) { free(dt); return NULL; } /* create a buffer vector */ dt->attset = attset; /* initialize the fields: */ dt->trgid = trgid; /* note the attribute set, */ dt->type = type; /* the target's id and type, */ dt->clscnt = clscnt; /* and number of classes */ dt->attcnt = 1; /* clear the number of attributes, */ dt->size = dt->height = 0; /* the number of nodes, the height, */ dt->total = 0.0F; /* the total of frequencies, */ dt->root = dt->curr = NULL; /* and the node pointers */ dt->newp = &(dt->root); /* (no root node is created) */ dt->path = NULL; /* do not allocate a path */ dt->plen = dt->pvsz = 0; /* until it is needed */ return dt; /* return created decision tree */} /* dt_create() *//*--------------------------------------------------------------------*/void dt_delete (DTREE *dt, int delas){ /* --- delete a decision tree */ assert(dt); /* check argument */ if (dt->root) _delete(dt->root); /* recursively delete nodes */ if (dt->path) free(dt->path); /* delete the path vector */ if (dt->buf) free(dt->buf); /* and the buffer vector */ if (delas) as_delete(dt->attset); /* delete attribute set */ free(dt); /* delete decision tree body */} /* dt_delete() *//*--------------------------------------------------------------------*/void dt_up (DTREE *dt, int root){ /* --- go up in decision tree */ assert(dt); /* check argument */ if (root || (dt->plen <= 0)) { dt->curr = dt->root; /* if to reset to root, */ dt->plen = 0; } /* make the root the current node */ else /* if to go up one level */ dt->curr = dt->path[--dt->plen];} /* dt_up() */ /* get preceding node in path *//*--------------------------------------------------------------------*/int dt_down (DTREE *dt, int index, int check){ /* --- go down in decision tree */ DTNODE **vec; /* new path vector */ int vsz; /* new size of path vector */ assert(dt); /* check arguments */ if (!dt->curr) return -2; /* and current node */ assert((index >= 0) && (index < dt->curr->size)); if (check) /* if only to check for child node */ return (dt->curr->vec[index].child) ? 0 : 1; if (dt->plen >= dt->pvsz) { /* if the path vector is full, */ vsz = dt->pvsz +BS_PATH; /* enlarge the path vector */ vec = (DTNODE**)realloc(dt->path, vsz *sizeof(DTNODE*)); if (!vec) return -1; /* allocate a new path vector */ dt->path = vec; dt->pvsz = vsz;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -