📄 nbayes.c
字号:
n = nbc->clscnt; /* get the number of classes and */ prb = nbc->priors +n; /* traverse the class probabilities */ if (cnt <= 0) /* if the denominator is invalid, */ while (--n >= 0) *--prb = 0; /* clear all probabilities */ else { /* if the denominator is valid, */ frq = nbc->frqs +n; /* traverse the class frequencies */ while (--n >= 0) *--prb = (*--frq +lcorr) /cnt; } /* estimate the class probabilities */ /* --- estimate conditional probabilities --- */ for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) { if ((--dvec)->type == 0) { /* traverse all attributes */ dvec->mark = 0; continue;}/* except the class attribute */ if (mode & NBC_ALL) /* if to use all attributes, */ dvec->mark = 1; /* mark attribute as used */ else if (mode & NBC_MARKED) /* if to use only marked atts. */ dvec->mark = (att_getmark(as_att(nbc->attset, i)) >= 0) ? 1 : -1; /* get the attribute mark */ if (dvec->mark < 0) /* otherwise keep the */ continue; /* selection of attributes */ if (dvec->type == AT_NOM) { /* -- if the attribute is nominal */ if (dvec->valcnt <= 0) { /* if the attribute has no values, */ dvec->mark = -1; continue; } /* there is nothing to do */ sp = dvec->valcnt *lcorr; /* compute the sum of the priors */ for (discd = dvec->discds +(k = nbc->clscnt); --k >= 0; ) { --discd; /* traverse the distributions */ n = dvec->valcnt; /* get the number of att. values */ prb = discd->probs +n; /* and the probability vector */ add = (mode & NBC_DWNULL) ? nbc->frqs[k] : discd->cnt; cnt = sp +add; /* compute denominator of estimator */ if (cnt <= 0) /* if the estimator is invalid, */ while (--n >= 0) *--prb = 0; /* clear all probs. */ else { /* if the estimator is valid */ add = lcorr +(add -discd->cnt) /n; for (frq = discd->frqs +n; --n >= 0; ) *--prb = (*--frq +add) /cnt; } /* traverse the value frequencies */ } } /* and estimate the probabilities */ else { /* -- if the attribute is numeric */ for (normd = dvec->normds +(k = nbc->clscnt); --k >= 0; ) { cnt = (--normd)->cnt; /* traverse the distributions */ normd->exp = (cnt > 0) ? normd->sv /cnt : 0; if (!(mode & NBC_MAXLLH)) cnt -= 1; normd->var = (cnt > 0) ? (normd->sv2 -normd->exp *normd->sv) /cnt : 0; } /* estimate the expected value and */ } /* the variance (either with unbiased */ } /* or with max. likelihood estimator) */ nbc->dvecs[nbc->clsid].mark = 1; /* mark the class attribute */} /* nbc_setup() *//*----------------------------------------------------------------------Point estimator for the expected value of a normal distribution: \hat{\mu} = \frac{1}{n} \sum_{i=1}^n x_i(unbiased, consistent, and efficient)Point estimator for the variance of a normal distribution: \hat{\sigma}^2 = \frac{1}{n-1} \sum_{i=1}^n (x_i - \hat{\mu})^2 = \frac{1}{n-1} (\sum_{i=1}^n x_i^2 - n\hat{\mu}^2)(unbiased and consistent)Maximum likelihood estimator for the variance of a normal distribution: \hat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \hat{\mu})^2 = \frac{1}{n} (\sum_{i=1}^n x_i^2 - n\hat{\mu}^2)(consistent)Source: R.L. Larsen and M.L. Marx. An Introduction to Mathematical Statistics and Its Applications. Prentice Hall, Englewood Cliffs, NJ, USA 1986, pp. 312 & 314----------------------------------------------------------------------*/int nbc_exec (NBC *nbc, const TUPLE *tpl, double *conf){ /* --- execute a naive Bayes class. */ int i, k; /* loop variables */ DVEC *dvec; /* to traverse the distrib. vectors */ const INST *inst; /* to traverse the instances */ double *s, *d; /* to traverse the probabilities */ double sum; /* sum of class probabilities */ assert(nbc); /* check the function argument */ /* --- initialize --- */ s = nbc->priors +nbc->clscnt; /* init. the posterior distribution */ d = nbc->posts +nbc->clscnt; /* with the prior distribution */ for (k = nbc->clscnt; --k >= 0; ) *--d = *--s; /* --- process attribute values --- */ for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) { if (((--dvec)->type == 0) /* traverse all attributes */ || ( dvec ->mark < 0)) /* except the class attribute */ continue; /* and all unmarked attributes */ inst = (tpl) /* get the attribute instantiation */ ? tpl_colval(tpl, i) /* from the tuple or the att. set */ : att_inst(as_att(nbc->attset, i)); if (_exec(nbc, i, inst) < 0)/* execute the classifier */ continue; /* for the current attribute */ s = nbc->cond +nbc->clscnt; /* traverse the cond. probabilities */ d = nbc->posts+nbc->clscnt; /* and the posterior distribution */ for (sum = 0, k = nbc->clscnt; --k >= 0; ) sum += *--d *= *--s; /* multiply with cond. probabilities */ if ((sum > 1e-24) && (sum < 1e24)) continue; /* if the sum is ok, continue */ if (sum <= 0) break; /* if the sum is fubar, abort */ for (d += k = nbc->clscnt; --k >= 0; ) *--d /= sum; /* otherwise renormalize in order */ } /* to avoid an over- or underflow */ s = d = nbc->posts; /* traverse the final distribution */ for (sum = *s, k = nbc->clscnt; --k > 0; ) { if (*++s > *d) d = s; /* find the most probable class */ sum += *s; /* and sum all probabilities */ } /* (for the normalization) */ sum = (sum > 0) ? 1/sum : 1; /* compute normalization factor */ for (k = nbc->clscnt; --k >= 0; ) nbc->posts[k] *= sum; /* normalize probabilities */ if (conf) *conf = *d; /* get the confidence value and */ return (int)(d -nbc->posts); /* return the classification result */} /* nbc_exec() *//*--------------------------------------------------------------------*/void nbc_rand (NBC *nbc, double drand (void)){ /* --- generate a random tuple */ int i, k, n; /* loop variables */ double t, sum; /* random number, sum of probs. */ double *p; /* to access the probabilities */ DVEC *dvec; /* to traverse the distrib. vectors */ INST *inst; /* to traverse the instances */ NORMD *nd; /* normal distribution */ p = nbc->priors; /* get the prior probabilities */ t = drand(); /* generate a random number */ for (sum = i = 0; i < nbc->clscnt; i++) { sum += p[i]; if (sum >= t) break; } if (i >= nbc->clscnt) /* find the class that corresponds */ i = nbc->clscnt -1; /* to the generated random number */ att_inst(as_att(nbc->attset, nbc->clsid))->i = i; for (dvec = nbc->dvecs +(n = nbc->attcnt); --n >= 0; ) { if (((--dvec)->type == 0) /* traverse all attributes */ || ( dvec ->mark < 0)) /* except the class attribute */ continue; /* and all unmarked attributes */ inst = att_inst(as_att(nbc->attset, n)); if (dvec->type == AT_NOM) { /* --- if the attribute is nominal */ p = dvec->discds[i].probs;/* get the conditional distribution */ t = drand(); /* generate a random number */ for (sum = k = 0; k < dvec->valcnt; k++) { sum += p[k]; if (sum >= t) break; } if (k >= dvec->valcnt) /* find the value that corresponds */ k = dvec->valcnt -1; /* to the generated random number */ inst->i = k; } /* and set the attribute instance */ else { /* --- if the attribute is numeric */ nd = dvec->normds +i; /* get the conditional distribution */ t = sqrt(nd->var) *_normd(drand) +nd->exp; if (dvec->type == AT_REAL) inst->f = (float)t; else inst->i = (int)(t +0.5); } /* sample from the normal distrib. */ } /* and transform the result */} /* nbc_rand() *//*--------------------------------------------------------------------*/int nbc_desc (NBC *nbc, FILE *file, int mode, int maxlen){ /* --- describe a naive Bayes class. */ int i, k, n; /* loop variables */ int pos, ind; /* current position and indentation */ int len, l; /* length of class/value name/number */ const char *clsname; /* name of class attribute */ ATT *att, *clsatt; /* to traverse the attributes */ const DVEC *dvec; /* to traverse the distrib. vectors */ const NORMD *normd; /* to traverse the normal distribs. */ const DISCD *discd; /* to traverse the discrete distribs. */ char name[4*AS_MAXLEN+4]; /* output buffer for names */ char num[64]; /* output buffer for numbers */ assert(nbc && file); /* check the function arguments */ /* --- print a header (as a comment) --- */ if (mode & NBC_TITLE) { /* if the title flag is set */ i = k = (maxlen > 0) ? maxlen -2 : 70; fputs("/*", file); while (--i >= 0) fputc('-', file); fputs("\n naive Bayes classifier\n", file); while (--k >= 0) fputc('-', file); fputs("*/\n", file); } /* print a title header */ if (maxlen <= 0) maxlen = INT_MAX; /* --- start description --- */ clsatt = as_att(nbc->attset, nbc->clsid); clsname = att_name(clsatt); /* note the class attribute name */ fputs("nbc(", file); /* (is needed repeatedly below) */ sc_format(name, clsname, 0); /* format and print */ fputs(name, file); /* the class attribute name */ fputs(") = {\n", file); /* and start the description */ if ((nbc->lcorr > 0) /* if estimation parameters */ || nbc->mode) { /* differ from default values */ fprintf(file, " params = %g", nbc->lcorr); if (nbc->mode & NBC_DWNULL) fputs(", dwnull", file); if (nbc->mode & NBC_MAXLLH) fputs(", maxllh", file); fputs(";\n", file); /* print Laplace correction */ } /* and estimation mode */ /* --- print the class distribution --- */ fputs(" prob(", file); /* print a distribution indicator */ fputs(name, file); /* and the class attribute name and */ fputs(") = {\n ", file); /* start the class distribution */ pos = 4; /* initialize the output position */ ind = att_valwd(clsatt,0) +2; /* compute position and indentation */ for (i = 0; i < nbc->clscnt; i++) { /* traverse the classes */ if (i > 0) /* if this is not the first class, */ fputs(",\n ", file); /* start a new output line */ len = sc_format(name, att_valname(clsatt, i), 0); fputs(name, file); /* get and print the class name */ for (pos = len+2; pos < ind; pos++) putc(' ', file); /* pad with blanks to equal width */ fprintf(file, ": %g", nbc->frqs[i]); if (mode & NBC_REL) /* print the absolute class frequency */ fprintf(file, " (%.1f%%)", nbc->priors[i] *100); } /* print the relative class frequency */ fputs(" };\n", file); /* terminate the class distribution */ /* --- print the (conditional) distributions --- */ for (dvec = nbc->dvecs, n = 0; n < nbc->attcnt; dvec++, n++) { if ((dvec->type == 0) /* traverse all attributes, */ || ((mode & NBC_MARKED) /* but skip the class attribute */ && (dvec->mark < 0))) /* and in marked mode also */ continue; /* all unmarked attributes */ fputs(" prob(", file); /* print a distribution indicator */ att = as_att(nbc->attset,n);/* and get the next attribute */ sc_format(name, att_name(att), 0); fputs(name, file); /* print the attribute name */ putc('|', file); /* and the condition separator */ sc_format(name, clsname,0); /* format and print */ fputs(name, file); /* the class attribute name */ fputs(") = {\n ", file); /* and start the cond. distribution */ if (dvec->discds) { /* if the attribute is nominal, */ discd = dvec->discds; /* traverse the discrete distribs. */ for (i = 0; i < nbc->clscnt; discd++, i++) { if (i > 0) /* if this is not the first class, */ fputs(",\n ", file); /* start a new output line */ len = sc_format(name, att_valname(clsatt, i), 0); fputs(name, file); /* get and print the class name */ for (pos = len+2; pos < ind; pos++) putc(' ', file); /* pad with blanks to equal width */ fputs(":{", file); /* start the value distribution and */ pos += 3; /* traverse the attribute values */ for (k = 0; k < dvec->valcnt; k++) { if (k > 0) { /* if this is not the first value, */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -