📄 nbayes.c
字号:
/*--------------------------------------------------------------------*/

/*
 * nbc_clear -- reset all counters of a naive Bayes classifier.
 *
 * Zeroes the total case weight, the class frequency vector and, for
 * every attribute other than the class attribute (type == 0), either
 * the per-class value frequencies (nominal attributes) or the per-class
 * case counter, value sum and sum of squares (numeric attributes).
 * Vector sizes are left as they are; only the contents are cleared.
 *
 * nbc: classifier to clear (must not be NULL; checked by assert only)
 */
void nbc_clear (NBC *nbc)
{                               /* --- clear a naive Bayes classifier */
  int    i, k, n;               /* loop variables */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  NORMD  *normd;                /* to traverse the normal distribs. */
  double *frq;                  /* to traverse the frequency vectors */

  assert(nbc);                  /* check the function argument */
  nbc->total = 0;               /* clear the total number of cases */
  for (frq = nbc->frqs +(i = nbc->clscnt); --i >= 0; )
    *--frq = 0;                 /* clear the frequency distribution */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0)    /* traverse all attributes */
      continue;                 /* except the class attribute */
    if (dvec->type == AT_NOM) { /* if the attribute is nominal */
      for (discd = dvec->discds +(k = nbc->clscnt); --k >= 0; ) {
        (--discd)->cnt = 0;     /* traverse the distributions */
        for (frq = discd->frqs +(n = dvec->valcnt); --n >= 0; )
          *--frq = 0;           /* traverse the frequency vector */
      }                         /* and clear the value frequencies */
    }
    else {                      /* if the attribute is numeric */
      for (normd = dvec->normds +(k = nbc->clscnt); --k >= 0; ) {
        (--normd)->cnt = 0;     /* clear the sums from which expected */
        normd->sv = normd->sv2 = 0;  /* value and variance are computed */
      }
    }
  }
}  /* nbc_clear() */

/*--------------------------------------------------------------------*/
#ifdef NBC_INDUCE

/*
 * nbc_add -- add one instantiation (case) to the classifier's counters.
 *
 * nbc: classifier to update (must not be NULL; checked by assert only)
 * tpl: tuple to read class, weight and attribute values from; if NULL,
 *      they are read from the classifier's attribute set instead
 *
 * returns  0 on success, and also when the class value is negative
 *            (null class), in which case nothing is changed;
 *         -1 if resizing the class dependent vectors (_clsrsz) or a
 *            value frequency vector (_valrsz) reports a failure.
 *
 * Note: when the value resize fails, the class frequency, the total and
 * the attributes processed before the failing one have already been
 * updated; the function does not roll these changes back.
 */
int nbc_add (NBC *nbc, const TUPLE *tpl)
{                               /* --- add an instantiation */
  int        i;                 /* loop variable */
  int        cls;               /* value of class attribute */
  float      wgt;               /* instantiation weight */
  const INST *inst;             /* to traverse the instances */
  DVEC       *dvec;             /* to traverse the distrib. vectors */
  NORMD      *normd;            /* to access normal distributions */
  DISCD      *discd;            /* to access discrete distributions */
  double     v;                 /* buffer (for an attribute value) */

  assert(nbc);                  /* check the function argument */

  /* --- get class and weight --- */
  if (tpl) {                    /* if a tuple is given */
    cls = tpl_colval(tpl, nbc->clsid)->i;
    wgt = tpl_getwgt(tpl);      /* get the class and the tuple weight */
  }
  else {                        /* if no tuple is given */
    cls = att_inst(as_att(nbc->attset, nbc->clsid))->i;
    wgt = as_getwgt(nbc->attset);
  }                             /* get the class and the inst. weight */
  if (cls < 0) return 0;        /* if the class is null, abort */
  assert(wgt >= 0.0F);          /* check the tuple weight */

  /* --- update class distribution --- */
  if ((cls >= nbc->clscnt)      /* if the class is a new one, */
  &&  (_clsrsz(nbc, cls+1) != 0))  /* resize the class dependent */
    return -1;                  /* vectors (freqs. and distributions) */
  nbc->frqs[cls] += wgt;        /* update the class frequency */
  nbc->total     += wgt;        /* and the total frequency */

  /* --- update conditional distributions --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0)    /* traverse all attributes */
      continue;                 /* except the class attribute */
    inst = (tpl)                /* get the attribute instantiation */
         ? tpl_colval(tpl, i)   /* from the tuple or the att. set */
         : att_inst(as_att(nbc->attset, i));
    if (dvec->type == AT_NOM) { /* -- if the attribute is nominal */
      if (inst->i < 0)          /* if the attribute value is null, */
        continue;               /* there is nothing to do */
      if ((inst->i >= dvec->valcnt)
      &&  (_valrsz(dvec, nbc->clscnt, inst->i+1) != 0))
        return -1;              /* resize the value freq. vectors */
      discd = dvec->discds +cls;     /* get the proper distribution */
      discd->frqs[inst->i] += wgt;   /* and update the value frequency */
      discd->cnt           += wgt;   /* and the total frequency */
    }
    else {                      /* -- if the attribute is numeric */
      if (dvec->type == AT_REAL) {   /* if the att. is real valued */
        if (inst->f <= NV_REAL) continue;
        v = (double)inst->f;    /* check and get the attribute value */
      }
      else {                    /* if the attribute is integer valued */
        if (inst->i <= NV_INT) continue;
        v = (double)inst->i;    /* check and get the attribute value */
      }                         /* (convert it to double) */
      normd = dvec->normds +cls;/* get the proper distribution */
      normd->cnt += wgt;        /* update the case counter */
      normd->sv  += wgt *v;     /* the sum of the values, and */
      normd->sv2 += wgt *v*v;   /* the sum of their squares */
    }                           /* (expected value and variance */
  }                             /* are computed in nbc_setup) */
  return 0;                     /* return 'ok' */
}  /* nbc_add() */

/*--------------------------------------------------------------------*/

/*
 * nbc_induce -- induce a naive Bayes classifier from a data table.
 *
 * table: table to learn from (must not be NULL)
 * clsid: column index of the class attribute; must be a valid column
 *        of the table and refer to a nominal attribute (assert only)
 * mode:  flag word; the flags evaluated here are
 *          NBC_CLONE   work on a clone of the table's attribute set
 *          NBC_ADD     greedily add attributes, starting from none
 *          NBC_REMOVE  greedily remove attributes, starting from all
 *        the whole word (or'ed with NBC_ALL) is passed to nbc_setup()
 *        and the unmodified word to _eval()
 * lcorr: Laplace correction value, passed on to nbc_setup()
 *
 * returns the created classifier, or NULL if cloning the attribute set,
 * creating the classifier, adding a tuple, allocating the work vector
 * or evaluating the candidates (_eval() < 0) fails. On every failure
 * after creation the classifier is released with nbc_delete(), using
 * (mode & NBC_CLONE) as its second argument.
 *
 * If neither NBC_ADD nor NBC_REMOVE is set, the full classifier is
 * returned without any attribute selection. Otherwise attributes are
 * selected one at a time for as long as the weight of misclassified
 * tuples does not get worse (NBC_REMOVE) or strictly improves (NBC_ADD).
 */
NBC* nbc_induce (TABLE *table, int clsid, int mode, double lcorr)
{                               /* --- induce a naive Bayes class. */
  int    i, r = 0;              /* loop variable, buffer */
  int    cnt;                   /* number of selectable attributes */
  int    cls;                   /* predicted class */
  NBC    *nbc;                  /* created classifier */
  ATTSET *attset;               /* attribute set of the classifier */
  SELATT *savec;                /* vector of selectable attributes */
  SELATT *sa, *best;            /* to traverse the selectable atts. */
  TUPLE  *tpl;                  /* to traverse the tuples */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  double *p;                    /* to traverse the class probs. */
  double max;                   /* maximum of class probabilities */
  double errs;                  /* weight sum of misclassified tuples */

  assert(table                  /* check the function arguments */
      && (clsid >= 0) && (clsid < tab_colcnt(table))
      && (att_type(as_att(tab_attset(table), clsid)) == AT_NOM));

  /* --- create a classifier --- */
  attset = tab_attset(table);   /* get the attribute set of the table */
  if (mode & NBC_CLONE) {       /* if the corresp. flag is set, */
    attset = as_clone(attset);  /* clone the attribute set */
    if (!attset) return NULL;   /* of the given data table, */
  }                             /* then create a classifier */
  nbc = nbc_create(attset, clsid);
  if (!nbc) {                   /* on failure a cloned attribute set */
    if (mode & NBC_CLONE) as_delete(attset);  /* is still owned here */
    return NULL;
  }

  /* --- build initial classifier --- */
  for (i = tab_tplcnt(table); --i >= 0; ) {
    if (nbc_add(nbc, tab_tpl(table, i)) != 0) {
      /* nbc_add() fails only if a vector could not be resized; */
      /* do not continue with incompletely counted data */
      nbc_delete(nbc, mode & NBC_CLONE);
      return NULL;
    }
  }                             /* start with a full classifier */
  nbc_setup(nbc, mode|NBC_ALL, lcorr);
  if (!(mode & (NBC_ADD|NBC_REMOVE)))
    return nbc;                 /* if no simp. is requested, abort */

  /* --- evaluate initial classifier --- */
  if (mode & NBC_ADD) {         /* if to add attributes, */
    p = nbc->frqs; errs = max = *p;
    for (i = nbc->clscnt; --i > 0; ) {
      errs += *++p;             /* traverse the class frequencies, */
      if (*p > max) max = *p;   /* sum them, and determine their */
    }                           /* maximum (find the majority class) */
    errs -= max;                /* compute the number of prior errors */
  }
  else {                        /* if to remove attributes */
    for (errs = 0, i = tab_tplcnt(table); --i >= 0; ) {
      tpl = tab_tpl(table, i);  /* traverse the tuples in the table */
      cls = tpl_colval(tpl, clsid)->i;
      if (cls < 0) continue;    /* skip tuples with a null class */
      assert(cls < nbc->clscnt);/* check the class value */
      if (nbc_exec(nbc, tpl, NULL) != cls)
        errs += tpl_getwgt(tpl);/* classify the current tuple */
    }                           /* and determine the number */
  }                             /* of misclassifications */
  #ifndef NDEBUG
  fprintf(stderr, "\n%8g errors initially\n", errs);
  #endif                        /* print a counter for debugging */

  /* --- collect selectable attributes --- */
  savec = malloc(nbc->attcnt *sizeof(SELATT));
  if (!savec) { nbc_delete(nbc, mode & NBC_CLONE); return NULL; }
  sa = savec;                   /* create vector of selectable atts. */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    (--dvec)->mark = -1;        /* traverse all attributes */
    if ( (dvec->type == 0)      /* except the class attribute */
    ||  ((dvec->type == AT_NOM) /* and all nominal attributes */
    &&   (dvec->valcnt <= 0)))  /* that do not have any values */
      continue;
    if (mode & NBC_REMOVE)      /* if to remove attributes, */
      dvec->mark = i;           /* mark all selectable attributes */
    sa->attid = i;              /* note selectable attributes and */
    sa->errs  = 0; sa++;        /* initialize the number of errors */
  }
  cnt = (int)(sa -savec);       /* compute the number of attributes */
  nbc->dvecs[nbc->clsid].mark = nbc->clsid;  /* and mark the class */

  /* --- select attributes --- */
  while ((cnt  > 0)             /* while there are selectable atts. */
  &&     (errs > 0)) {          /* and the classifier is not perfect */
    for (sa = savec +(i = cnt); --i >= 0; )
      (--sa)->errs = 0;         /* clear the numbers of errors */
    r = _eval(nbc, table, mode, savec, cnt);
    if (r < 0) break;           /* evaluate selectable attributes */
    best = sa = savec;          /* traverse the selectable attributes */
    for (i = cnt; --i > 0; ) {  /* in order to find the best */
      if (((++sa)->errs <  best->errs)
      ||  ((  sa ->errs <= best->errs) && (mode & NBC_ADD)))
        best = sa;              /* find least number of errors and */
    }                           /* note the corresponding attribute */
    /* here sa points to the last entry of the current candidate list */
    if ( (best->errs >  errs)   /* if more tuples were misclassified */
    ||  ((best->errs >= errs) && (mode & NBC_ADD)))
      break;                    /* abort the selection loop */
    errs = best->errs;          /* note the new number of errors */
    #ifndef NDEBUG
    fprintf(stderr, "%8g errors %s\n", best->errs,
            att_name(as_att(attset, best->attid)));
    #endif                      /* print a counter for debugging */
    nbc->dvecs[best->attid].mark = (mode & NBC_ADD) ? best->attid : -1;
                                /* mark/unmark the selected attribute */
    /* close the gap left by the selected entry; only the attribute */
    /* ids are shifted, the error sums are reset in the next round  */
    for (--cnt; best < sa; best++)
      best->attid = best[1].attid;
  }
  free(savec);                  /* delete the selectable atts. vector */
  if (r < 0) {                  /* if an error occurred, abort */
    nbc_delete(nbc, mode & NBC_CLONE); return NULL; }
  return nbc;                   /* return the created classifier */
}  /* nbc_induce() */

/*--------------------------------------------------------------------*/

/*
 * nbc_mark -- transfer the classifier's attribute markers to the
 *             attributes of its attribute set.
 *
 * Every attribute whose distribution vector carries a negative marker
 * gets the mark -1, every other attribute the mark 1; afterwards the
 * class attribute's mark is overwritten with 0.
 *
 * nbc: classifier to read the markers from (must not be NULL)
 *
 * returns the number of distribution vectors with a non-negative
 * marker. The class attribute's vector is counted as well if its
 * marker is non-negative, even though its attribute mark ends up as 0.
 */
int nbc_mark (NBC *nbc)
{                               /* --- mark selected attributes */
  int  i, m;                    /* loop variable, buffer for marker */
  DVEC *dvec;                   /* to traverse the distrib. vectors */
  int  cnt = 0;                 /* attribute counter */

  assert(nbc);                  /* check the function argument */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->mark < 0) m = -1;
    else { cnt++;           m =  1; }
    att_setmark(as_att(nbc->attset, i), m);
  }                             /* transfer marker to attribute set */
  att_setmark(as_att(nbc->attset, nbc->clsid), 0);
  return cnt;                   /* return number of marked atts. */
}  /* nbc_mark() */

#endif
/*--------------------------------------------------------------------*/

void nbc_setup (NBC *nbc, int mode, double lcorr)
{                               /* --- set up a naive Bayes class. */
  int    i, k, n;               /* loop variables */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  NORMD  *normd;                /* to traverse the normal distribs. */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  double *frq, *prb;            /* to traverse the value frqs./probs. */
  double cnt, sp;               /* number of cases, sum of priors */
  double add;                   /* Laplace corr. + distributed weight */

  assert(nbc && (lcorr >= 0));  /* check the function arguments */
  nbc->mode  = mode & (NBC_DWNULL|NBC_MAXLLH);
  nbc->lcorr = lcorr;           /* note estimation parameters */

  /* --- estimate class probabilities --- */
  cnt = nbc->total +lcorr *nbc->clscnt;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -