📄 dtree2.orig
字号:
*cut = (type == AT_INT) /* depending on the attribute type */ ? (0.5F *((float)val->i +pval->i)) : (0.5F *( val->f +pval->f)); } /* compute the cut value as the */ } /* average of the adjacent values */ if (--n <= 0) break; /* if on the last tuple, abort loop */ sum -= wgt = tpl_info(*tpls)->f; /* get and sum the tuple weight */ if (sum < gi->mincnt) /* if the weight of the remaining */ break; /* tuples is too small, abort loop */ cls = tpl_colval(*tpls, k)->i; ft_move((FRQTAB*)gi->numt, 1, 0, cls, wgt); pval = val; tpls++; /* move weight in frequency table, */ } /* note the value, and get next tuple */ return best; /* return worth of best cut */} /* _sym_num() *//*---------------------------------------------------------------------- Evaluation Functions for Numeric Target Attributes----------------------------------------------------------------------*/static double _num_sym (GROW *gi, TUPLE **tpls, int n, int attid, float *cut){ /* --- evaluate a symbolic attribute */ int i, k; /* class attribute id, loop variable */ INST *inst; /* instance of the target attribute */ double val; /* value of candidate test attribute */ int vid; /* identifier of attribute value */ assert(gi && tpls && (n > 0));/* check the function arguments */ k = att_valcnt(as_att(gi->attset, attid)); vt_init((VARTAB*)gi->curr, k); i = gi->dtree->trgid; /* initialize the variation table */ for (tpls += n; --n >= 0; ) { /* and traverse the tuples */ inst = tpl_colval(*--tpls, i); val = (double)((gi->type == AT_INT) ? inst->i : inst->f); vid = tpl_colval(*tpls, attid)->i; vt_add((VARTAB*)gi->curr, vid, val, tpl_info(*tpls)->f); } /* fill the variation table */ vt_calc(gi->curr); /* and calculate aggregates */ if (gi->flags & DT_SUBSET) /* if only called to compute the */ return 0; /* initial table, abort the function */ for (i = 0; --k >= 0; ) /* count reasonable tuple sets */ if (vt_colfrq((VARTAB*)gi->curr, k) >= gi->mincnt) i++; /* there must be at least two values */ if (i < 2) return WORTHLESS; /* with at least _mincnt cases each */ return vt_eval((VARTAB*)gi->curr, gi->measure, gi->params);} /* _num_sym() */ /* evaluate the variation table *//*--------------------------------------------------------------------*/static double _num_set (GROW *gi, TUPLE **tpls, int n, int attid, float *cut){ /* -- evaluate subsets of sym. att. */ int i, k; /* loop variables */ int valcnt; /* number of values/subsets */ int rtscnt; /* number of reasonable tuple sets */ int *sets; /* to traverse subset index vector */ int isrc, idst, imax; /* indices of sets to combine */ double fsrc, fdst, fmax; /* frequencies of sets to combine */ double curr, prev, best; /* current/previous/best worth */ assert(gi && tpls && (n > 0)); /* check the function arguments */ _num_sym(gi, tpls, n, attid, cut); /* compute initial freq. table */ /* --- collect supported values --- */ valcnt = att_valcnt(as_att(gi->attset, attid)); imax = rtscnt = 0; /* initialize the variables and */ fmax = 0; sets = gi->sets; /* traverse the set index vector */ for (i = 0; i < valcnt; i++){ /* and the attribute values */ if (vt_dest((VARTAB*)gi->curr, i) != i) continue; /* skip linked/combined values */ fdst = vt_colfrq((VARTAB*)gi->curr, i); if (fdst <= 0) continue; /* skip unsupported attribute values */ *sets++ = i; /* note value index in index vector */ if (fdst > fmax) { fmax = fdst; imax = i; } if (fdst >= gi->mincnt) rtscnt++; } /* count reasonable tuple sets */ valcnt = (int)(sets -gi->sets); sets = gi->sets; /* compute new number of values */ /* --- find best pair mergers --- */ prev = vt_eval((VARTAB*)gi->curr, gi->measure, gi->params); if (valcnt <= 2) /* check whether a merge is possible */ return (rtscnt >= 2) ? prev : WORTHLESS; while (1) { /* select and merge loop */ best = WORTHLESS; /* initialize worth of partition */ isrc = idst = 0; /* and value set indices */ for (i = 0; i < valcnt; i++) { /* traverse the value sets */ if ((rtscnt <= 2) /* if less than two reasonable sets, */ && (i == imax)) continue; /* skip mergers with largest set */ for (k = i+1; k < valcnt; k++) { /* traverse remaining sets */ if ((rtscnt <= 2) /* if less than two reasonable sets, */ && (k == imax)) continue; /* skip mergers with largest set */ vt_comb((VARTAB*)gi->curr, sets[k], sets[i]);/* combine sets */ curr = vt_eval(gi->curr, gi->measure, gi->params); vt_uncomb((VARTAB*)gi->curr, sets[k]); /* evaluate partition, */ if (curr <= best) continue; /* then uncombine sets */ best = curr; /* if the current partition is better */ isrc = k; /* then the best one found up to now, */ idst = i; /* note worth and value set indices */ } /* of the current pair merger */ } if ((best <= WORTHLESS) /* if no useful partition found */ || (best < prev)) /* or best partition found is worse */ break; /* than the previous one, abort loop */ prev = best; /* note the best worth */ fsrc = vt_colfrq((VARTAB*)gi->curr, sets[isrc]); if (fsrc >= gi->mincnt) /* get the source frequencies */ rtscnt--; /* and update reasonable tuple sets */ fdst = vt_colfrq((VARTAB*)gi->curr, sets[idst]); if (fdst >= gi->mincnt) /* get the destination frequency */ rtscnt--; /* and update reasonable tuple sets */ vt_comb(gi->curr, sets[isrc], sets[idst]); fdst = vt_colfrq((VARTAB*)gi->curr, sets[idst]); if (fdst >= gi->mincnt) /* combine sets, get new frequency, */ rtscnt++; /* and update reasonable tuple sets */ if (fdst > fmax) { fmax = fdst; imax = idst; } if (imax > isrc) imax--; /* adapt vector index of largest set */ for (i = isrc; i < valcnt-1; i++) sets[i] = sets[i+1]; /* remove source set from vector */ valcnt--; /* (shift to preserve value order) */ } /* and reduce number of values */ if (rtscnt < 2) return WORTHLESS; return prev; /* return the evaluation result */} /* _num_set() *//*--------------------------------------------------------------------*/static double _num_num (GROW *gi, TUPLE **tpls, int n, int attid, float *cut){ /* --- evaluate a numeric attribute */ int i, k; /* loop variable, target att. index */ int type, t; /* type of the test/target attribute */ INST *inst; /* target instance of current tuple */ double trg; /* target value of current tuple */ INST *val, *pval; /* value of current and prec. tuple */ double wgt, sum = 0; /* tuple weight, sum of tuple weights */ double curr, best; /* current and best cut worth */ assert(gi && tpls && (n > 0) && cut); /* check the function args. */ /* --- build initial variation table --- */ type = att_type(as_att(gi->attset, attid)); v_sort(tpls, n, (type == AT_FLT) ? _cmp_flt : _cmp_int, (void*)attid); vt_init((VARTAB*)gi->numt,2); /* sort tuples and initialize table */ k = gi->dtree->trgid; /* note the target attribute index */ t = gi->dtree->type; /* and the target type */ while (1) { /* traverse tuples with unknown value */ if (n <= 1) /* if there is at most one tuple */ return WORTHLESS; /* with a known att. value, abort */ val = tpl_colval(*tpls, attid); /* get the next value */ if ((type == AT_INT) ? (val->i > UV_INT) : (val->f > UV_FLT)) break; /* if the value is known, abort loop */ n--; /* go to the next tuple */ inst = tpl_colval(*tpls, k); trg = (double)((t == AT_INT) ? inst->i : inst->f); wgt = tpl_info(*tpls++)->f;/* get the target value */ vt_add((VARTAB*)gi->numt, -1, trg, wgt); } /* store the tuple weight */ do { /* traverse tuples with known value */ if (--n <= 0) /* if there is no tuple left for */ return WORTHLESS; /* the other side of the cut, abort */ inst = tpl_colval(*tpls, k); trg = (double)((t == AT_INT) ? inst->i : inst->f); wgt = tpl_info(*tpls++)->f; /* get the target value and */ vt_add(gi->numt, 0, trg, wgt); /* store the tuple weight */ sum += wgt; /* in the variation table */ } while (sum < gi->mincnt); /* while minimal size not reached */ for (i = n; --i >= 0; ) { /* traverse the remaining tuples */ inst = tpl_colval(tpls[i], k); trg = (double)((t == AT_INT) ? inst->i : inst->f); wgt = tpl_info(tpls[i])->f;/* get the target value */ vt_add((VARTAB*)gi->numt, 1, trg, wgt); } /* store the tuple weight */ vt_calc((VARTAB*)gi->numt); /* calculate aggregates and get */ sum = vt_colfrq((VARTAB*)gi->numt,1); /* weight of upper part */ if (sum < gi->mincnt) return WORTHLESS; /* --- find the best cut value --- */ pval = tpl_colval(*(tpls-1), attid); best = WORTHLESS; /* note the class and the value and */ *cut = UV_FLT; /* init. the worth and the cut value */ while (1) { /* traverse the tuples above the cut */ val = tpl_colval(*tpls, attid); /* if next value differs */ if ((type == AT_INT) ? (val->i > pval->i) : (0.5F *(pval->f +val->f) > pval->f)) { curr = vt_eval((VARTAB*)gi->numt, gi->measure, gi->params); if (curr >= best) { /* evaluate the variation table, */ best = curr; /* update the best cut worth, and */ vt_copy((VARTAB*)gi->curr, gi->numt); /* copy the table */ *cut = (type == AT_INT) /* depending on the attribute type */ ? (0.5F *((float)val->i +pval->i)) : (0.5F *( val->f +pval->f)); } /* compute the cut value as the */ } /* average of the adjacent values */ if (--n <= 0) break; /* if on the last tuple, abort loop */ sum -= wgt = tpl_info(*tpls)->f; /* get and sum the tuple weight */ if (sum < gi->mincnt) /* if the weight of the remaining */ break; /* tuples is too small, abort loop */ inst = tpl_colval(*tpls, k); trg = (double)((t == AT_INT) ? inst->i : inst->f); vt_move((VARTAB*)gi->numt, 1, 0, trg, wgt); pval = val; tpls++; /* move weight in variation table, */ } /* note the value, and get next tuple */ return best; /* return worth of best cut */} /* _num_num() *//*---------------------------------------------------------------------- Growing Functions----------------------------------------------------------------------*/static DTNODE* _grow (GROW *gi, DTNODE *leaf, TUPLE **tpls, int n){ /* --- recursively grow tree */ int i, k; /* loop variables, buffers */ int mark; /* buffer for attribute mark */ ATT *att; /* current/test attribute */ double curr, best; /* current and best worth */ float cut; /* cut value for best attribute */ void *tab; /* exchange buffer for eval. tables */ DTNODE *node; /* created test node (subtree) */ DTDATA *data; /* to traverse the data vector */ TSEL tsel; /* tuple selection information */ GRPFN *group; /* tuple grouping function */ double frq, known; /* tuple frequencies for weighting */ double e_tree; /* sum of subtree errors */ assert(leaf && tpls && (n > 0)); /* check the function arguments */ /* --- check the leaf node --- */ gi->err = leaf->err; /* set the number of leaf errors */ if ((leaf->frq < 2*gi->mincnt)/* if too few tuples to divide */ || (leaf->err < MINERROR) /* or all cases have the same value */ || (gi->maxht <= 0)) /* or maximal tree height reached, */ return leaf; /* simply return the leaf node */ /* --- search for a test attribute --- */ best = WORTHLESS; /* clear worth of best attribute, */ tsel.col = -1; /* identifier of best attribute, */ tsel.val.f = cut = UV_FLT; /* and cut value (numeric atts.) */ k = as_attcnt(gi->attset); /* get the number of attributes */ for (i = 0; i < k; i++) { /* traverse the attributes, */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -