📄 dtree2.orig
字号:
/*---------------------------------------------------------------------- File : dtree2.c Contents: decision and regression tree management, functions for growing and pruning Author : Christian Borgelt History : 12.06.1997 file created (as dtree.c) 08.08.1997 first version completed 15.08.1997 pruning functions completed 25.08.1997 multiple child references made possible 26.08.1997 node counting added (function _count) 28.08.1997 pessimistic pruning added 30.08.1997 treatment of new values during pruning enabled 25.08.1997 multiple child references replaced by links 18.09.1997 bug in measure evaluation removed 25.09.1997 subsets for symbolic attributes added 01.10.1997 bugs in function _pr_tab removed 02.02.1998 links between children simplified 03.02.1998 adapted to changed interface of module 'table' 30.03.1998 bug in _eval_flt conc. unknown values removed 27.08.1998 single class value coll. in _eval_set corrected 21.09.1998 bug in function _adapt (concerning links) removed 15.10.1998 major revision completed 30.10.1998 bug in function _pr_tab removed 15.12.1998 minor improvements, assertions added 24.02.1999 parameters 'params' and 'minval' added to dt_grow 20.03.1999 roundoff error handling improved 29.02.2000 bug in function _eval_num removed 02.12.2000 growing and pruning moved to this file 03.12.2000 decision tree growing partly redesigned 12.12.2000 leaf creation in pruning with a table redesigned 17.12.2000 regression tree functionality added 23.07.2001 global variables removed 26.09.2001 normal distribution table replaced by a function 19.10.2001 conf. level pruning for numeric targets added 01.03.2002 check of largest branch in _pr_tab made optional 21.07.2004 pure attribute evaluation added 22.07.2004 output of cut values added to pure evaluation 13.09.2004 trivial pruning of grown tree made optional----------------------------------------------------------------------*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include <assert.h>#include "vecops.h"#include "dtree.h"#ifdef STORAGE#include "storage.h"#endif/*---------------------------------------------------------------------- Preprocessor Definitions----------------------------------------------------------------------*/#define WORTHLESS -DBL_MAX /* to indicate a worthless attribute */#define EPSILON 1e-6 /* to handle roundoff errors */#define MINERROR 0.1 /* minimum error for division */#define MAXFACT 1e6 /* maximum factor for error estim. *//* --- normal distribution parameters --- */#define NQ_A0 (-0.322232431088) /* coefficients of */#define NQ_A1 (-1) /* the polynomials */#define NQ_A2 (-0.342242088547) /* used to compute */#define NQ_A3 (-0.0204231210245) /* an approximation of */#define NQ_A4 (-0.453642210148e-4) /* the quantiles of the */#define NQ_B0 0.0993484626060 /* normal distribution */#define NQ_B1 0.588581570495#define NQ_B2 0.531103462366#define NQ_B3 0.103537752850#define NQ_B4 0.0038560700634#define NQ_PMIN 1e-9 /* minimum value for the probability */#define NQ_MAX 6 /* maximum value for quantile *//* --- functions --- */#define islink(d,n) (((d)->link >= (n)->vec) && \ ((d)->link < (n)->vec +(n)->size))/*---------------------------------------------------------------------- Type Definitions----------------------------------------------------------------------*/typedef struct { /* --- tuple selection info. --- */ int col; /* column index */ INST val; /* column value */ DTDATA *vec; /* data vector of the current node */} TSEL; /* (tuple selection information) */typedef int GRPFN (TUPLE **tpls, int n, const TSEL *tsel);typedef double ESTFN (double n, double e);typedef struct _grow * PGROW; /* to avoid warning in def. of EVALFN */typedef double EVALFN (PGROW gi, TUPLE **tpls, int n, int attid, float *cut);typedef struct _grow { /* --- tree grow information --- */ DTREE *dtree; /* current decision/regression tree */ ATTSET *attset; /* attribute set of decision tree */ TUPLE **tpls; /* vector of tuples with known class */ EVALFN *eval_sym; /* eval. function for symbolic atts. */ EVALFN *eval_num; /* eval. function for numeric atts. */ void *curr; /* freq./var. table for current att. */ void *best; /* freq./var. table for best att. */ void *numt; /* freq./var. table for numeric att. */ double err; /* number of errors of subtree */ int type; /* type of the target attribute */ int measure; /* evaluation measure */ double *params; /* optional parameters */ double minval; /* minimal value of measure for split */ float mincnt; /* minimal number of tuples */ int maxht; /* maximal height of decision tree */ int flags; /* grow flags, e.g. DT_SUBSET */ double *evals; /* evaluations of attributes */ int sets[1]; /* index vector for value subsets */} GROW; /* (tree grow information) */typedef struct { /* --- tree pruning information --- */ DTREE *dtree; /* current decision/regression tree */ ATTSET *attset; /* attribute set of decision tree */ TUPLE **tpls; /* vector of tuples with known class */ void *tab; /* freq./var. table for leaf creation */ ESTFN *est; /* error estimation function */ double err; /* number of errors of subtree */ int type; /* type of the target attribute */ int maxht; /* maximal height of decision tree */ int chklb; /* whether to check largest branch */} PRUNE; /* (tree pruning information) *//*---------------------------------------------------------------------- Normal Distribution Function----------------------------------------------------------------------*/#ifdef DT_PRUNEstatic double _ndqtl (double prob){ /* --- comp. quantile of normal dist. */ double p, x; /* with mean 0 and variance 1 */ assert((prob >= 0) /* check the function argument */ && (prob <= 1)); /* (a probability must be in [0,1]) */ if (prob == 0.5) return 0; /* treat prob = 0.5 as a special case */ p = (prob > 0.5) ? 1-prob : prob; /* transform to left tail if nec. */ if (p < NQ_PMIN) return (prob < 0.5) ? -NQ_MAX : NQ_MAX; x = sqrt(log(1/(p*p))); /* compute quotient of polynomials */ x += ((((NQ_A4 *x +NQ_A3) *x +NQ_A2) *x +NQ_A1) *x +NQ_A0) / ((((NQ_B4 *x +NQ_B3) *x +NQ_B2) *x +NQ_B1) *x +NQ_B0); return (prob < 0.5) ? -x : x; /* retransform to right tail if nec. */} /* _ndqtl() *//*----------------------------------------------------------------------References: - R.E. Odeh and J.O. Evans. The percentage points of the normal distribution. Applied Statistics 22:96--97, 1974 - J.D. Beasley and S.G. Springer. Algorithm AS 111: The percentage points of the normal distribution. Applied Statistics 26:118--121, 1977 - M.J. Wichura Algorithm AS 241: The percentage points of the normal distribution. Applied Statistics 37:477--484, 1988----------------------------------------------------------------------*/#endif /* #ifdef DT_PRUNE *//*---------------------------------------------------------------------- Grouping Functions----------------------------------------------------------------------*/static int _grp_sym (TUPLE **tpls, int n, const TSEL *tsel){ /* --- group symbolic values */ TUPLE **src, **dst, *t; /* source and destination, buffer */ assert(tpls && tsel && (n >= 0)); /* check the function arguments */ for (src = dst = tpls; --n >= 0; src++) if (tpl_colval(*src, tsel->col)->i == tsel->val.i) { t = *src; *src = *dst; *dst++ = t; } return (int)(dst -tpls); /* group qualifying tuples */} /* _grp_sym() */ /* and return their number *//*--------------------------------------------------------------------*/static int _grp_set (TUPLE **tpls, int n, const TSEL *tsel){ /* --- group a set of values */ TUPLE **src, **dst, *t; /* source and destination, buffer */ register int i; /* check value and link */ assert(tpls && tsel && (n >= 0)); /* check the function arguments */ for (src = dst = tpls; --n >= 0; src++) if (((i = tpl_colval(*src, tsel->col)->i) == tsel->val.i) || (tsel->vec[i].link -tsel->vec == tsel->val.i)) { t = *src; *src = *dst; *dst++ = t; } return (int)(dst -tpls); /* group qualifying tuples */} /* _grp_set() */ /* and return their number *//*--------------------------------------------------------------------*/static int _grp_uvint (TUPLE **tpls, int n, const TSEL *tsel){ /* --- group unknown values (integer) */ TUPLE **src, **dst, *t; /* source and destination, buffer */ assert(tpls && tsel && (n >= 0)); /* check the function arguments */ for (src = dst = tpls; --n >= 0; src++) if (tpl_colval(*src, tsel->col)->i <= UV_INT) { t = *src; *src = *dst; *dst++ = t; } return (int)(dst -tpls); /* group qualifying tuples */} /* _grp_uvint() */ /* and return their number *//*--------------------------------------------------------------------*/static int _grp_int (TUPLE **tpls, int n, const TSEL *tsel){ /* --- group greater than (integer) */ TUPLE **src, **dst, *t; /* source and destination, buffer */ assert(tpls && tsel && (n >= 0)); /* check the function arguments */ for (src = dst = tpls; --n >= 0; src++) if (tpl_colval(*src, tsel->col)->i > tsel->val.i) { t = *src; *src = *dst; *dst++ = t; } return (int)(dst -tpls); /* group qualifying tuples */} /* _grp_int() */ /* and return their number *//*--------------------------------------------------------------------*/static int _grp_uvflt (TUPLE **tpls, int n, const TSEL *tsel){ /* --- group unknown values (float) */ TUPLE **src, **dst, *t; /* source and destination, buffer */ assert(tpls && tsel && (n >= 0)); /* check the function arguments */ for (src = dst = tpls; --n >= 0; src++) if (tpl_colval(*src, tsel->col)->f <= UV_FLT) { t = *src; *src = *dst; *dst++ = t; } return (int)(dst -tpls); /* group qualifying tuples */} /* _grp_uvflt() */ /* and return their number *//*--------------------------------------------------------------------*/static int _grp_flt (TUPLE **tpls, int n, const TSEL *tsel){ /* --- group greater than (float) */ TUPLE **src, **dst, *t; /* source and destination, buffer */ assert(tpls && tsel && (n >= 0)); /* check the function arguments */ for (src = dst = tpls; --n >= 0; src++) if (tpl_colval(*src, tsel->col)->f > tsel->val.f) { t = *src; *src = *dst; *dst++ = t; } return (int)(dst -tpls); /* group qualifying tuples */} /* _grp_flt() */ /* and return their number *//*--------------------------------------------------------------------*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -