⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 nbayes.c

📁 数据挖掘中的bayes算法,很好的代码
💻 C
📖 第 1 页 / 共 5 页
字号:
/*--------------------------------------------------------------------*/

/* Reset the accumulated statistics of a naive Bayes classifier.
 *
 * Clears the total case weight, the class frequency vector and, for
 * every distribution vector whose type is non-zero, the per-class
 * value frequencies (nominal attributes) or the per-class counter,
 * value sum and sum of squares (all other attribute types).
 * Nothing is allocated or released here.
 */
void nbc_clear (NBC *nbc)
{                               /* --- clear a naive Bayes classifier */
  int    i, k, n;               /* loop variables */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  NORMD  *normd;                /* to traverse the normal   distribs. */
  double *frq;                  /* to traverse the frequency vectors */

  assert(nbc);                  /* check the function argument */
  nbc->total = 0;               /* clear the total number of cases */
  for (frq = nbc->frqs +(i = nbc->clscnt); --i >= 0; )
    *--frq = 0;                 /* clear the frequency distribution */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0)    /* traverse all attributes */
      continue;                 /* except the class attribute */
    if (dvec->type == AT_NOM) { /* if the attribute is nominal */
      for (discd = dvec->discds +(k = nbc->clscnt); --k >= 0; ) {
        (--discd)->cnt = 0;     /* traverse the distributions */
        for (frq = discd->frqs +(n = dvec->valcnt); --n >= 0; )
          *--frq = 0;           /* traverse the frequency vector */
      } }                       /* and clear the value frequencies */
    else {                      /* if the attribute is numeric */
      for (normd = dvec->normds +(k = nbc->clscnt); --k >= 0; ) {
        (--normd)->cnt = 0; normd->sv = normd->sv2 = 0; }
    }                           /* clear the sums from which expected */
  }                             /* value and variance are computed */
}  /* nbc_clear() */

/*--------------------------------------------------------------------*/
#ifdef NBC_INDUCE

/* Add one weighted instantiation to the classifier's statistics.
 *
 * nbc  classifier to update (must not be NULL)
 * tpl  tuple to read class, weight and attribute values from; if NULL,
 *      they are read from the classifier's attribute set instead
 *
 * Returns  0 on success, and also when the class value is negative
 *            (null class), in which case nothing is updated;
 *         -1 if _clsrsz() or _valrsz() reports a failure while the
 *            class or value vectors are being enlarged.
 *
 * Nominal values < 0, real values <= NV_REAL and integer values
 * <= NV_INT are treated as null and skipped for that attribute.
 */
int nbc_add (NBC *nbc, const TUPLE *tpl)
{                               /* --- add an instantiation */
  int    i;                     /* loop variable */
  int    cls;                   /* value of class attribute */
  float  wgt;                   /* instantiation weight */
  const  INST *inst;            /* to traverse the instances */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  NORMD  *normd;                /* to access normal   distributions */
  DISCD  *discd;                /* to access discrete distributions */
  double v;                     /* buffer (for an attribute value) */

  assert(nbc);                  /* check the function argument */

  /* --- get class and weight --- */
  if (tpl) {                    /* if a tuple is given */
    cls = tpl_colval(tpl, nbc->clsid)->i;
    wgt = tpl_getwgt(tpl); }    /* get the class and the tuple weight */
  else {                        /* if no tuple is given */
    cls = att_inst(as_att(nbc->attset, nbc->clsid))->i;
    wgt = as_getwgt(nbc->attset);
  }                             /* get the class and the inst. weight */
  if (cls < 0) return 0;        /* if the class is null, abort */
  assert(wgt >= 0.0F);          /* check the tuple weight */

  /* --- update class distribution --- */
  if ((cls >= nbc->clscnt)      /* if the class is a new one, */
  &&  (_clsrsz(nbc,cls+1) != 0))/* resize the class dependent vectors */
    return -1;                  /* (frequencies and distributions) */
  nbc->frqs[cls] += wgt;        /* update the class frequency */
  nbc->total     += wgt;        /* and the total frequency */

  /* --- update conditional distributions --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0)    /* traverse all attributes */
      continue;                 /* except the class attribute */
    inst = (tpl)                /* get the attribute instantiation */
         ? tpl_colval(tpl, i)   /* from the tuple or the att. set */
         : att_inst(as_att(nbc->attset, i));
    if (dvec->type == AT_NOM) { /* -- if the attribute is nominal */
      if (inst->i < 0)          /* if the attribute value is null, */
        continue;               /* there is nothing to do */
      if ((inst->i >= dvec->valcnt)
      &&  (_valrsz(dvec, nbc->clscnt, inst->i+1) != 0))
        return -1;              /* resize the value freq. vectors */
      discd = dvec->discds +cls;   /* get the proper distribution */
      discd->frqs[inst->i] += wgt; /* and update the value frequency */
      discd->cnt += wgt; }         /* and the total frequency */
    else {                      /* -- if the attribute is numeric */
      if (dvec->type == AT_REAL){  /* if the attribute is real valued */
        if (inst->f <= NV_REAL) continue;
        v = (double)inst->f;}   /* check and get the attribute value */
      else {                    /* if the attribute is integer valued */
        if (inst->i <= NV_INT)  continue;
        v = (double)inst->i;    /* check and get the attribute value */
      }                         /* (convert it to double) */
      normd = dvec->normds +cls;/* get the proper distribution */
      normd->cnt += wgt;        /* update the case counter */
      normd->sv  += wgt *v;     /* the sum of the values, and */
      normd->sv2 += wgt *v*v;   /* the sum of their squares */
    }                           /* (expected value and variance */
  }                             /*  are computed in nbc_setup) */
  return 0;                     /* return 'ok' */
}  /* nbc_add() */

/*--------------------------------------------------------------------*/

/* Induce a naive Bayes classifier from a data table.
 *
 * table  table with the training tuples (must not be NULL)
 * clsid  column index of the class attribute; asserted to be in
 *        range and of type AT_NOM
 * mode   flag set: NBC_CLONE makes the classifier work on a clone of
 *        the table's attribute set; NBC_ADD / NBC_REMOVE request
 *        greedy attribute selection by adding / removing attributes;
 *        the flags are also forwarded (or'ed with NBC_ALL) to
 *        nbc_setup()
 * lcorr  value forwarded to nbc_setup()
 *
 * Returns the created classifier, or NULL if cloning the attribute
 * set, creating the classifier, adding a tuple, allocating the
 * selection vector, or _eval() fails. On every failure after creation
 * the classifier is released with nbc_delete(nbc, mode & NBC_CLONE).
 *
 * Selection loop: in each round _eval() fills in the error weight per
 * selectable attribute; the attribute with the fewest errors is taken
 * (ties go to the later entry when adding). The loop stops as soon as
 * the best candidate is worse than the current error weight (when
 * adding: not strictly better), when no candidates remain, or when the
 * error weight reaches zero.
 */
NBC* nbc_induce (TABLE *table, int clsid, int mode, double lcorr)
{                               /* --- induce a naive Bayes class. */
  int    i, r = 0;              /* loop variable, buffer */
  int    cnt;                   /* number of selectable attributes */
  int    cls;                   /* predicted class */
  NBC    *nbc;                  /* created classifier */
  ATTSET *attset;               /* attribute set of the classifier */
  SELATT *savec;                /* vector of selectable attributes */
  SELATT *sa, *best;            /* to traverse the selectable atts. */
  TUPLE  *tpl;                  /* to traverse the tuples */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  double *p;                    /* to traverse the class probs. */
  double max;                   /* maximum of class probabilities */
  double errs;                  /* weight sum of misclassified tuples */

  assert(table                  /* check the function arguments */
      && (clsid >= 0) && (clsid < tab_colcnt(table))
      && (att_type(as_att(tab_attset(table), clsid)) == AT_NOM));

  /* --- create a classifier --- */
  attset = tab_attset(table);   /* get the attribute set of the table */
  if (mode & NBC_CLONE) {       /* if the corresp. flag is set, */
    attset = as_clone(attset);  /* clone the attribute set */
    if (!attset) return NULL;   /* of the given data table, */
  }                             /* then create a classifier */
  nbc = nbc_create(attset, clsid);
  if (!nbc) { if (mode & NBC_CLONE) as_delete(attset); return NULL; }

  /* --- build initial classifier --- */
  for (i = tab_tplcnt(table); --i >= 0; ) {
    /* nbc_add() returns -1 when enlarging its vectors fails; do not */
    /* continue with (and return) a partially built classifier */
    if (nbc_add(nbc, tab_tpl(table, i)) != 0) {
      nbc_delete(nbc, mode & NBC_CLONE); return NULL; }
  }                                     /* start with a */
  nbc_setup(nbc, mode|NBC_ALL, lcorr);  /* full classifier */
  if (!(mode & (NBC_ADD|NBC_REMOVE)))
    return nbc;                 /* if no simp. is requested, abort */

  /* --- evaluate initial classifier --- */
  if (mode & NBC_ADD) {         /* if to add attributes, */
    p = nbc->frqs; errs = max = *p;
    for (i = nbc->clscnt; --i > 0; ) {
      errs += *++p;             /* traverse the class frequencies, */
      if (*p > max) max = *p;   /* sum them, and determine their */
    }                           /* maximum (find the majority class) */
    errs -= max; }              /* compute the number of prior errors */
  else {                        /* if to remove attributes */
    for (errs = 0, i = tab_tplcnt(table); --i >= 0; ) {
      tpl = tab_tpl(table,i);   /* traverse the tuples in the table */
      cls = tpl_colval(tpl, clsid)->i;
      if (cls < 0) continue;    /* skip tuples with an null class */
      assert(cls < nbc->clscnt);/* check the class value */
      if (nbc_exec(nbc, tpl, NULL) != cls)
        errs += tpl_getwgt(tpl);/* classify the current tuple */
    }                           /* and determine the number */
  }                             /* of misclassifications */
  #ifndef NDEBUG
  fprintf(stderr, "\n%8g errors initially\n", errs);
  #endif                        /* print a counter for debugging */

  /* --- collect selectable attributes --- */
  savec = malloc(nbc->attcnt *sizeof *savec);
  if (!savec) { nbc_delete(nbc, mode & NBC_CLONE); return NULL; }
  sa = savec;                   /* create vector of selectable atts. */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    (--dvec)->mark = -1;        /* traverse all attributes */
    if ( (dvec->type == 0)      /* except the class attribute */
    ||  ((dvec->type == AT_NOM) /* and all nominal attributes */
    &&   (dvec->valcnt <= 0)))  /* that do not have any values */
      continue;
    if (mode & NBC_REMOVE)      /* if to remove attributes, */
      dvec->mark = i;           /* mark all selectable attributes */
    sa->attid = i;              /* note selectable attributes and */
    sa->errs  = 0; sa++;        /* initialize the number of errors */
  }
  cnt = (int)(sa -savec);       /* compute the number of attributes */
  nbc->dvecs[nbc->clsid].mark = nbc->clsid;   /* and mark the class */

  /* --- select attributes --- */
  while ((cnt  > 0)             /* while there are selectable atts. */
  &&     (errs > 0)) {          /* and the classifier is not perfect */
    for (sa = savec +(i = cnt); --i >= 0; )
      (--sa)->errs = 0;         /* clear the numbers of errors */
    r = _eval(nbc, table, mode, savec, cnt);
    if (r < 0) break;           /* evaluate selectable attributes */
    best = sa = savec;          /* traverse the selectable attributes */
    for (i = cnt; --i > 0; ) {  /* in order to find the best */
      if (((++sa)->errs <  best->errs)
      ||  ((  sa ->errs <= best->errs) && (mode & NBC_ADD)))
        best = sa;              /* find least number of errors and */
    }                           /* note the corresponding attribute */
    if ( (best->errs >  errs)   /* if more tuples were misclassified */
    ||  ((best->errs >= errs) && (mode & NBC_ADD)))
      break;                    /* abort the selection loop */
    errs = best->errs;          /* note the new number of errors */
    #ifndef NDEBUG
    fprintf(stderr, "%8g errors %s\n", best->errs,
            att_name(as_att(attset, best->attid)));
    #endif                      /* print a counter for debugging */
    nbc->dvecs[best->attid].mark = (mode & NBC_ADD)
      ? best->attid : -1;       /* mark/unmark the selected attribute */
    /* sa still points to the last of the cnt entries here, so the */
    /* shift below never reads beyond the used part of the vector  */
    for (--cnt; best < sa; best++)  /* remove the selected */
      best->attid = best[1].attid;  /* attribute from the  */
  }                                 /* list of attributes  */
  free(savec);                  /* delete the selectable atts. vector */
  if (r < 0) {                  /* if an error occurred, abort */
    nbc_delete(nbc, mode & NBC_CLONE); return NULL; }
  return nbc;                   /* return the created classifier */
}  /* nbc_induce() */

/*--------------------------------------------------------------------*/

/* Transfer the attribute markers of the classifier to its attribute
 * set.
 *
 * Every attribute whose distribution vector has a negative mark gets
 * the marker -1 in the attribute set, every other attribute gets 1;
 * afterwards the marker of the class attribute is set to 0.
 *
 * Returns the number of distribution vectors with a non-negative mark
 * (counted before the class attribute's marker is overwritten).
 */
int nbc_mark (NBC *nbc)
{                               /* --- mark selected attributes */
  int  i, m;                    /* loop variable, buffer for marker */
  DVEC *dvec;                   /* to traverse the distrib. vectors */
  int  cnt = 0;                 /* attribute counter */

  assert(nbc);                  /* check the function argument */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->mark < 0) m = -1;
    else           { cnt++; m =  1; }
    att_setmark(as_att(nbc->attset, i), m);
  }                             /* transfer marker to attribute set */
  att_setmark(as_att(nbc->attset, nbc->clsid), 0);
  return cnt;                   /* return number of marked atts. */
}  /* nbc_mark() */

#endif
/*--------------------------------------------------------------------*/

void nbc_setup (NBC *nbc, int mode, double lcorr)
{                               /* --- set up a naive Bayes class. */
  int    i, k, n;               /* loop variables */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  NORMD  *normd;                /* to traverse the normal   distribs. */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  double *frq, *prb;            /* to traverse the value frqs./probs. */
  double cnt, sp;               /* number of cases, sum of priors */
  double add;                   /* Laplace corr. + distributed weight */

  assert(nbc && (lcorr >= 0));  /* check the function arguments */
  nbc->mode  = mode & (NBC_DWNULL|NBC_MAXLLH);
  nbc->lcorr = lcorr;           /* note estimation parameters */

  /* --- estimate class probabilities --- */
  cnt = nbc->total +lcorr *nbc->clscnt;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -