📄 dtree2.orig
字号:
static DTNODE* _collapse (DTREE *dt, DTNODE *node){ /* --- collapse subtree into a leaf */ int i; /* loop variable */ DTDATA *data; /* to traverse the data vector */ double *frq, *max; /* to traverse the class frequencies */ double sum; /* sum of the class frequencies */ assert(dt && node); /* check the function arguments */ if (!node || (node->flags & DT_LEAF)) return node; /* if the node is a leaf, abort */ /* --- numeric target attribute --- */ if (dt->type != AT_SYM) { /* if the target att. is numeric */ for (data = node->vec +(i = node->size); --i >= 0; ) if ((--data)->child && !islink(data, node)) _delete(data->child); /* delete all subtrees of the node */ node->flags = DT_LEAF; /* turn the node into a leaf, */ node->attid = dt->trgid; /* store the target identifier, */ node->size = 0; /* and clear the vector size */ return (DTNODE*)realloc(node, sizeof(DTNODE) -sizeof(DTDATA)); } /* shrink the node to leaf size */ /* --- symbolic target attribute --- */ for (frq = dt->buf +(i = dt->clscnt); --i >= 0; ) *--frq = 0; /* clear the class frequencies */ node = _coll_rec(node, frq); /* collapse the subtree into a leaf */ if (!node) return NULL; /* if no result leaf found, abort */ max = frq = dt->buf; sum = *frq; for (i = dt->clscnt; --i > 0; ) { sum += *++frq; /* traverse the class frequencies, */ if (*frq > *max) max = frq; /* compute the frequency sum, and */ } /* determine the most frequent class */ frq = dt->buf +node->size; /* traverse the counter vector */ for (data = node->vec +(i = node->size); --i >= 0; ) (--data)->frq = (float)*--frq; /* copy the class frequencies */ node->frq = (float)sum; /* set the node frequency, */ node->err = sum -*max; /* the number of errors, and */ node->trg.i = (int)(max-frq); /* the most frequent class */ return node; /* return the modified leaf node */} /* _collapse() */#endif /* #ifdef DT_PRUNE *//*--------------------------------------------------------------------*/static int _count (DTREE *dt, DTNODE *node, int height){ /* --- count the number of nodes */ int i, n = 1; /* loop variable, node counter */ DTDATA *data; /* to traverse the data vector */ assert(dt && node && (height >= 0)); /* check the function args. */ if (++height > dt->height) /* if beyond the curr. max. height, */ dt->height = height; /* update the maximal height */ if (!(node->flags & DT_LEAF)){/* if the current node is no leaf */ for (data = node->vec +(i = node->size); --i >= 0; ) if ((--data)->child && !islink(data, node)) n += _count(dt, data->child, height); } /* recursively count the nodes */ return n; /* return the number of nodes */} /* _count() *//*---------------------------------------------------------------------- Evaluation Functions for Symbolic Target Attributes----------------------------------------------------------------------*/#ifdef DT_GROWstatic double _sym_sym (GROW *gi, TUPLE **tpls, int n, int attid, float *cut){ /* --- evaluate a symbolic attribute */ int i, k; /* class attribute id, loop variable */ int cls, val; /* class and attribute value */ assert(gi && tpls && (n > 0));/* check the function arguments */ k = att_valcnt(as_att(gi->attset, attid)); ft_init((FRQTAB*)gi->curr, k, gi->dtree->clscnt); i = gi->dtree->trgid; /* initialize the frequency table */ for (tpls += n; --n >= 0; ) { /* and traverse the tuples */ cls = tpl_colval(*--tpls, i)->i; val = tpl_colval(*tpls, attid)->i; ft_add((FRQTAB*)gi->curr, val, cls, tpl_info(*tpls)->f); } /* fill the frequency table */ ft_marg(gi->curr); /* and marginalize it */ if (gi->flags & DT_SUBSET) /* if only called to compute the */ return 0; /* initial table, abort the function */ for (i = 0; --k >= 0; ) /* count reasonable tuple sets */ if (ft_frq_x((FRQTAB*)gi->curr, k) >= gi->mincnt) i++; /* there must be at least two values */ if (i < 2) return WORTHLESS; /* with at least _mincnt cases each */ return ft_eval((FRQTAB*)gi->curr, gi->measure, gi->params);} /* _sym_sym() */ /* evaluate the frequency table *//*--------------------------------------------------------------------*/static double _sym_set (GROW *gi, TUPLE **tpls, int n, int attid, float *cut){ /* -- evaluate subsets of sym. att. */ int i, k, cls; /* loop variables */ int valcnt; /* number of values/subsets */ int rtscnt; /* number of reasonable tuple sets */ int *sets; /* to traverse subset index vector */ int isrc, idst, imax; /* indices of sets to combine */ double fsrc, fdst, fmax; /* frequencies of sets to combine */ double curr, prev, best; /* current/previous/best worth */ assert(gi && tpls && (n > 0)); /* check the function arguments */ _sym_sym(gi, tpls, n, attid, cut); /* compute initial freq. table */ /* --- remove unsupported and merge single class values --- */ valcnt = att_valcnt(as_att(gi->attset, attid)); imax = rtscnt = 0; /* initialize the variables and */ fmax = 0; sets = gi->sets; /* traverse the set index vector */ for (i = 0; i < valcnt; i++){ /* and the attribute values */ if (ft_dest((FRQTAB*)gi->curr, i) != i) continue; /* skip linked/combined values */ fdst = ft_frq_x((FRQTAB*)gi->curr, i); if (fdst <= 0) continue; /* skip unsupported attribute values */ *sets++ = i; /* note value index in index vector */ for (cls = gi->dtree->clscnt; --cls >= 0; ) { fsrc = ft_frq_xy((FRQTAB*)gi->curr, i, cls); if (fsrc < fdst *(1-EPSILON)) { /* if no single class set, */ if (fsrc > EPSILON) break; /* if class is supp., abort */ else continue; /* otherwise continue */ } /* with the next class */ for (k = i; ++k < valcnt; ) { /* if single class tuple set, */ fsrc = ft_frq_x((FRQTAB*)gi->curr, k); if (fsrc <= 0) continue; /* traverse remaining values */ if (ft_frq_xy((FRQTAB*)gi->curr, k, cls) >= fsrc *(1-EPSILON)) { ft_comb((FRQTAB*)gi->curr, k, i); fdst += fsrc; } } break; /* if second single class set with */ } /* same class found, combine values */ if (fdst > fmax) { fmax = fdst; imax = i; } if (fdst >= gi->mincnt) rtscnt++; } /* count reasonable tuple sets */ valcnt = (int)(sets -gi->sets); sets = gi->sets; /* compute new number of values */ /* --- find best pair mergers --- */ prev = ft_eval((FRQTAB*)gi->curr, gi->measure, gi->params); if (valcnt <= 2) /* check whether a merge is possible */ return (rtscnt >= 2) ? prev : WORTHLESS; while (1) { /* select and merge loop */ best = WORTHLESS; /* initialize worth of partition */ isrc = idst = 0; /* and value set indices */ for (i = 0; i < valcnt; i++) { /* traverse the value sets */ if ((rtscnt <= 2) /* if less than two reasonable sets, */ && (i == imax)) continue; /* skip mergers with largest set */ for (k = i+1; k < valcnt; k++) { /* traverse remaining sets */ if ((rtscnt <= 2) /* if less than two reasonable sets, */ && (k == imax)) continue; /* skip mergers with largest set */ ft_comb((FRQTAB*)gi->curr, sets[k], sets[i]);/* combine sets */ curr = ft_eval((FRQTAB*)gi->curr, gi->measure, gi->params); ft_uncomb((FRQTAB*)gi->curr, sets[k]); /* evaluate partition, */ if (curr <= best) continue; /* then uncombine sets */ best = curr; /* if the current partition is better */ isrc = k; /* than the best one found up to now, */ idst = i; /* note worth and value set indices */ } /* of the current pair merger */ } if ((best <= WORTHLESS) /* if no useful partition found */ || (best < prev)) /* or best partition found is worse */ break; /* than the previous one, abort loop */ prev = best; /* note the best worth */ fsrc = ft_frq_x((FRQTAB*)gi->curr, sets[isrc]); if (fsrc >= gi->mincnt) /* get the source frequency */ rtscnt--; /* and update reasonable tuple sets */ fdst = ft_frq_x((FRQTAB*)gi->curr, sets[idst]); if (fdst >= gi->mincnt) /* get the destination frequency */ rtscnt--; /* and update reasonable tuple sets */ ft_comb(gi->curr, sets[isrc], sets[idst]); fdst = ft_frq_x((FRQTAB*)gi->curr, sets[idst]); if (fdst >= gi->mincnt) /* combine sets, get new frequency, */ rtscnt++; /* and update reasonable tuple sets */ if (fdst >= fmax) { fmax = fdst; imax = idst; } if (imax > isrc) imax--; /* adapt vector index of largest set */ for (i = isrc; i < valcnt-1; i++) sets[i] = sets[i+1]; /* remove source set from vector */ valcnt--; /* (shift to preserve value order) */ } /* and reduce number of values */ if (rtscnt < 2) return WORTHLESS; return prev; /* return the evaluation result */} /* _sym_set() *//*--------------------------------------------------------------------*/static double _sym_num (GROW *gi, TUPLE **tpls, int n, int attid, float *cut){ /* --- evaluate a numeric attribute */ int i, k; /* loop variable, class att. index */ int type; /* type of the test attribute */ int cls; /* class of current tuple */ INST *val, *pval; /* value of current and prec. tuple */ double wgt, sum = 0; /* tuple weight, sum of tuple weights */ double curr, best; /* current and best cut worth */ assert(gi && tpls && (n > 0) && cut); /* check the function args. */ /* --- build initial frequency table --- */ type = att_type(as_att(gi->attset, attid)); v_sort(tpls, n, (type == AT_FLT) ? _cmp_flt : _cmp_int, (void*)attid); ft_init((FRQTAB*)gi->numt, 2, gi->dtree->clscnt); k = gi->dtree->trgid; /* sort tuples and initialize table */ while (1) { /* traverse tuples with unknown value */ if (n <= 1) /* if there is at most one tuple */ return WORTHLESS; /* with a known att. value, abort */ val = tpl_colval(*tpls, attid); /* get the next value */ if ((type == AT_INT) ? (val->i > UV_INT) : (val->f > UV_FLT)) break; /* if the value is known, abort loop */ n--; /* go to the next tuple */ cls = tpl_colval(*tpls, k)->i; /* get the class */ wgt = tpl_info (*tpls++)->f; /* and the tuple weight */ ft_add((FRQTAB*)gi->numt, -1, cls, wgt); } /* store the tuple weight */ do { /* traverse tuples with known value */ if (--n <= 0) /* if there is no tuple left for */ return WORTHLESS; /* the other side of the cut, abort */ cls = tpl_colval(*tpls, k)->i; /* get the class */ wgt = tpl_info (*tpls++)->f; /* and the tuple weight */ ft_add((FRQTAB*)gi->numt, 0, cls, wgt); sum += wgt; /* store the tuple weight */ } while (sum < gi->mincnt); /* while minimal size not reached */ for (i = n; --i >= 0; ) { /* traverse the remaining tuples */ cls = tpl_colval(tpls[i], k)->i; /* get the class */ wgt = tpl_info (tpls[i])->f; /* and the tuple weight */ ft_add((FRQTAB*)gi->numt, 1, cls, wgt); } /* store the tuple weight */ ft_marg(gi->numt); /* marginalize frequency table and */ sum = ft_frq_x((FRQTAB*)gi->numt,1); /* get weight of upper part */ if (sum < gi->mincnt) return WORTHLESS; /* --- find the best cut value --- */ pval = tpl_colval(*(tpls-1), attid); best = WORTHLESS; /* note the class and the value and */ *cut = UV_FLT; /* init. the worth and the cut value */ while (1) { /* traverse the tuples above the cut */ val = tpl_colval(*tpls, attid); /* if next value differs */ if ((type == AT_INT) ? (val->i > pval->i) : (0.5F *(pval->f +val->f) > pval->f)) { curr = ft_eval((FRQTAB*)gi->numt, gi->measure, gi->params); if (curr >= best) { /* evaluate the frequency table, */ best = curr; /* update the best cut worth and */ ft_copy((FRQTAB*)gi->curr, gi->numt); /* copy the table */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -