📄 dtree2.orig
字号:
att = as_att(gi->attset,i); /* but skip marked attributes */ if (att_getmark(att) < 0) continue; if (att_type(att) == AT_SYM) curr = gi->eval_sym(gi, tpls,n,i,NULL); else curr = gi->eval_num(gi, tpls,n,i,&cut); if (gi->flags & DT_EVAL) { /* if only to evaluate attributes */ gi->evals[i] = curr; /* store the attribute evaluation */ att_info(att)->f = (att_type(att) == AT_SYM) ? 0 : cut; } /* store a possible cut value */ if (curr <= best) continue; /* evaluate attribute (compute worth) */ best = curr; /* if the current worth is better */ tsel.col = i; /* than that of the best attribute, */ tsel.val.f = cut; /* note worth, attribute id, and cut */ tab = gi->best; /* exchange the freq./var. tables, */ gi->best = gi->curr; /* so that the current table */ gi->curr = tab; /* becomes the best table */ } /* (much more efficient than copying) */ if ((gi->flags & DT_LEAF) /* if only to evaluate attributes */ || (tsel.col < 0) /* or no test attribute found */ || (best < gi->minval)) /* or the best attribute is not */ return leaf; /* good enough, abort the function */ /* --- create a test node --- */ att = as_att(gi->attset, tsel.col); i = (att_type(att) == AT_SYM) ? att_valcnt(att) : 2; node = (DTNODE*)malloc(sizeof(DTNODE) +(i-1) *sizeof(DTDATA)); if (!node) { gi->err = -1; return leaf; } node->flags = 0; /* create a test node */ node->attid = tsel.col; /* set attribute id */ node->cut = tsel.val.f; /* and cut value */ node->frq = leaf->frq; /* copy the node frequency, */ node->err = leaf->err; /* the number of errors, */ node->trg = leaf->trg; /* and the target value */ node->size = i; /* set the data vector size */ /* --- create child nodes --- */ for (data = node->vec +(k = node->size); --k >= 0; ) { if (gi->type == AT_SYM) i = ft_dest(gi->best, k); else i = vt_dest(gi->best, k); if (i == k) /* if the value is not linked, */ (--data)->link = NULL; /* clear the link pointer */ else { /* if the value is linked */ (--data)->link = node->vec +i; node->flags |= DT_LINK; /* set a link to the dest. value */ } /* and set the flag for node links */ } /* (init. pointers for cleanup) */ if (gi->type == AT_SYM) { /* if the target is symbolic */ for (data += k = node->size; --k >= 0; ) { if (!(--data)->link /* traverse all uncombined values */ && (ft_frq_x((FRQTAB*)gi->best, k) >= EPSILON)) { data->child = _leaf_sym(gi->dtree, (FRQTAB*)gi->best, k); if (!data->child) { _delete(node); gi->err = -1; return leaf; } } /* create a leaf node */ } /* for all supported values */ frq = ft_known((FRQTAB*)gi->best); } else { /* if the target is numeric */ for (data += k = node->size; --k >= 0; ) { if (!(--data)->link /* traverse all uncombined values */ && (vt_colfrq((VARTAB*)gi->best, k) >= EPSILON)) { data->child = _leaf_num(gi->dtree, (VARTAB*)gi->best, k); if (!data->child) { _delete(node); gi->err = -1; return leaf; } } /* create a leaf node */ } /* for all supported values */ frq = vt_known((VARTAB*)gi->best); } /* note the sum of the weights of */ known = frq; /* tuples with a known att. value */ e_tree = 0; /* clear the number of errors */ gi->maxht--; /* reduce the maximal tree height */ /* --- branch on symbolic attribute --- */ if (att_type(att) == AT_SYM){ /* if the test attribute is symbolic */ mark = att_getmark(att); /* mark attribute (to prevent tests) */ if (!(gi->flags & DT_SUBSET)) att_setmark(att, -1); tsel.val.i = UV_SYM; /* group tuples with */ i = _grp_sym(tpls,n,&tsel); /* an unknown attribute value */ group = (node->flags & DT_LINK) ? _grp_set : _grp_sym; frq = known; /* get the denominator of the weight */ tsel.vec = data; /* traverse the attribute values */ for (data += tsel.val.i = node->size; --tsel.val.i >= 0; ) { if (!(--data)->child /* if an att. value is not supported */ || islink(data, node)) /* or combined with another value, */ continue; /* skip the attribute value */ k = group(tpls+i, n-i, &tsel); /* group tuples with next value */ if (i > 0) { /* if there are unknown values */ _mul_wgt(tpls, i, data->child->cut/frq); frq = data->child->cut; /* weight tuples with unknown value, */ } /* note the denom. for reweighting */ data->child = _grow(gi, data->child, tpls, k+i); if (gi->err < 0) { _delete(node); return leaf; } e_tree += gi->err; /* grow a leaf/subtree for the value */ if (i > 0) /* if there are unknown values, */ group(tpls,k+i, &tsel); /* regroup tuples with current value */ tpls += k; n -= k; /* and skip processed tuples */ } att_setmark(att, mark); /* restore mark of test attribute */ if (i > 0) _mul_wgt(tpls, i, known/frq); } /* reweight tuples with unknowns */ /* --- branch on numeric attribute --- */ else { /* if the test attribute is numeric */ if (att_type(att) == AT_FLT) { group = _grp_flt; } else { tsel.val.i = (int)floor(tsel.val.f); group = _grp_int; } k = group(tpls, n, &tsel); /* group tuples greater than the cut */ group = (att_type(att) == AT_FLT) ? _grp_uvflt : _grp_uvint; i = group(tpls+k,n-k,&tsel);/* group tuples with unknown value */ if (i > 0) { /* if there are unknown values */ _mul_wgt(tpls+k, i, data[0].child->cut/known); frq = data[0].child->cut; /* weight tuples with unknown value */ } /* and note freq. for reweighting */ data[0].child = _grow(gi, data[0].child, tpls+k, n-k); if (gi->err < 0) { _delete(node); return leaf; } e_tree += gi->err; /* grow a leaf/subtree for <= cut */ if (i > 0) { /* if there are unknown values, */ group(tpls+k,n-k, &tsel); /* regroup tuples with unknown value */ _mul_wgt(tpls+k, i, data[1].child->cut/frq); frq = data[1].child->cut; /* weight tuples with unknown value */ } /* and note freq. for reweighting */ data[1].child = _grow(gi, data[1].child, tpls, k+i); if (gi->err < 0) { _delete(node); return leaf; } e_tree += gi->err; /* grow a leaf/subtree for > cut */ if (i > 0) { /* if there are unknown values, */ group(tpls, k+i, &tsel); /* regroup tuples with unknowns */ _mul_wgt(tpls, i, known/frq); } /* reweight tuples with unknown value */ } gi->maxht++; /* restore maximal (sub)tree height */ /* --- test subtree against leaf --- */ if (!(gi->flags & DT_NOPRUNE) && (leaf->err < e_tree *(1+EPSILON))) { _delete(node); /* if the leaf is not worse */ gi->err = leaf->err; /* than the subtree, */ return leaf; /* discard the subtree */ } /* and return the leaf */ free(leaf); /* if the subtree is better, */ gi->err = e_tree; /* discard the leaf and */ return node; /* return the subtree */} /* _grow() *//*--------------------------------------------------------------------*/static DTREE* _cleanup (GROW *gi, int err){ /* --- clean up after an error */ if (gi->type == AT_SYM) { /* if the target is symbolic */ if (gi->curr) ft_delete(gi->curr); if (gi->best) ft_delete(gi->best); if (gi->numt) ft_delete(gi->numt); } else { /* if the target is numeric */ if (gi->curr) vt_delete(gi->curr); if (gi->best) vt_delete(gi->best); if (gi->numt) vt_delete(gi->numt); } /* delete frequency/variation tables */ if (gi->tpls) free(gi->tpls); /* and the tuple vector */ if (err) { /* if to clean up after an error */ if (gi->dtree) dt_delete(gi->dtree, 0); if (gi->attset && (gi->flags & DT_DUPAS)) as_delete(gi->attset); /* delete grown decision tree and */ } /* corresponding attribute set */ free(gi); return NULL; /* delete tree grow information */} /* _cleanup() *//*--------------------------------------------------------------------*/DTREE* dt_grow (TABLE *table, int trgid, int measure, double *params, double minval, int maxht, float mincnt, int flags){ /* --- grow dec./reg. tree from table */ int i, m; /* loop variable, flagless measure */ TUPLE **tp; /* to traverse the tuple vector */ int tplcnt; /* number of tuples */ int valcnt; /* number of possible values */ int maxcnt; /* maximal number of values */ ATTSET *attset; /* attribute set */ ATT *att; /* target attribute */ int mark; /* mark of the target attribute */ INST *inst; /* instance of the target attribute */ double val; /* value of the target attribute */ GROW *gi; /* tree grow information */ DTREE *dt; /* created decision tree */ assert(table); /* check the function argument */ /* --- create an empty decision/regression tree --- */ attset = tab_attset(table); /* get the attribute set */ for (maxcnt = 2, i = as_attcnt(attset); --i >= 0; ) { if (i == trgid) continue; /* traverse attributes except target */ valcnt = att_valcnt(as_att(attset, i)); if (valcnt > maxcnt) maxcnt = valcnt; } /* determine maximum number of values */ i = (flags & DT_SUBSET) ? maxcnt : 1; gi = (GROW*)calloc(1, sizeof(GROW) +(i-1) *sizeof(int)); if (!gi) return NULL; /* create the tree grow information */ gi->flags = flags; /* and store the induction flags */ if (flags & DT_DUPAS) attset = as_dup(attset); if (!attset) return _cleanup(gi, 1); gi->attset = attset; /* duplicate att. set if requested */ gi->type = att_type(as_att(attset, trgid)); if (gi->type == AT_SYM) { /* if the target att. is symbolic */ m = measure & ~FEF_WGTD; /* remove the flags from measure code */ if ((m <= FEM_NONE) || (m >= FEM_UNKNOWN)) measure = m = FEM_NONE; } /* check the selection measure code */ else { /* if the target att. is numeric */ m = measure & ~VEF_WGTD; /* remove the flags from measure code */ if ((m <= VEM_NONE) || (m >= VEM_UNKNOWN)) measure = m = VEM_NONE; /* check the selection measure code */ } gi->measure = measure; /* note measure, parameters, */ gi->params = params; /* minimal value for a split, */ gi->minval = minval; /* and minimal number of cases */ gi->mincnt = (mincnt <= 0) ? (float)EPSILON : mincnt; gi->maxht = (maxht <= 0) ? INT_MAX : (maxht -1); gi->flags = flags; /* note maximal tree height and flags */ if (flags & DT_EVAL) { /* if only to evaluate attributes */ gi->evals = (double*)malloc(as_attcnt(attset) *sizeof(double)); if (!gi->evals) return _cleanup(gi, 1); att_info(as_att(attset, trgid))->p = gi->evals; } /* create an evaluation vector */ /* --- create decision/regression tree --- */ dt = dt_create(attset, trgid); if (!dt) return _cleanup(gi, 1); gi->dtree = dt; /* create an empty decision tree */ tp = _tuples(table, trgid, &tplcnt); if (!tp) return _cleanup(gi, 1); gi->tpls = tp; /* collect tuples with known class */ /* --- create a root node --- */ if (gi->type == AT_SYM) { /* if the target att. is symbolic */ gi->numt = ft_create(2, dt->clscnt); if (!gi->numt) return _cleanup(gi, 1); ft_init((FRQTAB*)gi->numt, 1, dt->clscnt); /* create and init. a freq. table */ for (tp += i = tplcnt; --i >= 0;) { inst = tpl_colval(*--tp, trgid); ft_add((FRQTAB*)gi->numt, 0, inst->i, tpl_info(*tp)->f); } /* aggregate the tuple weight and */ ft_marg(gi->numt); /* marginalize the frequency table */ if (ft_known((FRQTAB*)gi->numt) > 0) { /* if there are tuples */ dt->root = _leaf_sym(dt, gi->numt, 0); if (!dt->root) return _cleanup(gi, 1); } } /* create a leaf node as the root */ else { /* if the target att. is numeric
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -