📄 ptree3.c

📁 数据挖掘中的一算法 ines算法 c下实现的。适合初学习数据挖掘者借鉴
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/*----------------------------------------------------------------------  File    : ptree3.c  Contents: probability/possibility tree management            (conditional) dependence measurement functions  Author  : Christian Borgelt  History : 22.10.1995 file created as ctree.c            25.03.1997 function pt_depend added            02.01.2002 switched to sorting function from vecops            08.01.2002 symmetric Gini index added            12.01.2002 normalized chi^2 measure added            22.01.2002 adapted to redesigned node structure            04.02.2002 quadratic information measures added            17.01.2003 normalization of possibilistic measures added----------------------------------------------------------------------*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include <assert.h>#include "vecops.h"#include "ptree.h"#ifdef STORAGE#include "storage.h"#endif/*----------------------------------------------------------------------  Preprocessor Definitions----------------------------------------------------------------------*/#define LN_2   0.693147180559945   /* ln(2) *//*----------------------------------------------------------------------  Type Definitions----------------------------------------------------------------------*/typedef struct {                /* --- measure information --- */  char *name;                   /* name of the measure */  int  flags;                   /* flags, e.g. PT_SYM */} MINFO;                        /* (measure information) */typedef float  MARGFN (PTNODE *node, PTLVL *lvl);typedef double DEPFN  (PTNODE *node, PTLVL *lvl,                       int measure, double *params);/*----------------------------------------------------------------------  Constants----------------------------------------------------------------------*/static const MINFO mi_prob[PT_UNKNOWN+1] = {  /* PT_NONE     0 */ { "no measure",                               1 },  /* PT_INFGAIN  1 */ { "information gain",                    PT_SYM },  /* PT_INFGBAL  2 */ { "balanced information gain",                1 },  /* PT_INFGR    3 */ { "information gain ratio",                   1 },  /* PT_INFSGR1  4 */ { "symmetric information gain ratio 1",  PT_SYM },  /* PT_INFSGR2  5 */ { "symmetric information gain ratio 2",  PT_SYM },  /* PT_QIGAIN   6 */ { "quadratic information gain",          PT_SYM },  /* PT_QIGBAL   7 */ { "balanced quadratic information gain",      1 },  /* PT_QIGR     8 */ { "quadratic information gain ratio",         1 },  /* PT_QISGR1   9 */ { "symmetric quad. info. gain ratio 1",  PT_SYM },  /* PT_QISGR2  10 */ { "symmetric quad. info. gain ratio 2",  PT_SYM },  /* PT_GINI    11 */ { "Gini index",                               1 },  /* PT_GINISYM 12 */ { "symmetric Gini index",                PT_SYM },  /* PT_GINIMOD 13 */ { "modified Gini index",                      1 },  /* PT_RELIEF  14 */ { "relief measure",                           1 },  /* PT_WDIFF   15 */ { "sum of weighted differences",         PT_SYM },  /* PT_CHI2    16 */ { "chi^2 measure",                       PT_SYM },  /* PT_CHI2NRM 17 */ { "normalized chi^2 measure",            PT_SYM },  /* PT_WEVID   18 */ { "weight of evidence",                       1 },  /* PT_RELEV   19 */ { "relevance",                                1 },  /* PT_BDM     20 */ { "Bayesian-Dirichlet / K2 metric",           1 },  /* PT_BDMOD   21 */ { "modified Bayesian-Dirichlet / K2 metric",  1 },  /* PT_RDLREL  22 */ { "reduction of description length "                          "(rel. freq.)", 1 },  /* PT_RDLABS  23 */ { "reduction of description length "                          "(abs. freq.)", 1 },  /* PT_STOCO   24 */ { "stochastic complexity",                    1 },  /* PT_UNKNOWN 25 */ { "<unknown>",                                0 }};                              /* measures for possibility trees */static const MINFO mi_poss[PT_UNKNOWN+1] = {  /* PT_NONE     0 */ { "no measure",                               1 },  /* PT_SPCGAIN  1 */ { "specificity gain",                    PT_SYM },  /* PT_SPCGBAL  2 */ { "balanced specificity gain",                1 },  /* PT_SPCGR    3 */ { "specificity gain ratio",                   1 },  /* PT_SPCSGR1  4 */ { "symmetric specificity gain ratio 1",  PT_SYM },  /* PT_SPCSGR2  5 */ { "symmetric specificity gain ratio 2",  PT_SYM },  /*             6 */ { "<unknown>",                                0 },  /*             7 */ { "<unknown>",                                0 },  /*             8 */ { "<unknown>",                                0 },  /*             9 */ { "<unknown>",                                0 },  /*            10 */ { "<unknown>",                                0 },  /*            11 */ { "<unknown>",                                0 },  /*            12 */ { "<unknown>",                                0 },  /*            13 */ { "<unknown>",                                0 },  /*            14 */ { "<unknown>",                                0 },  /* PT_WDIFF   15 */ { "sum of weighted differences",         PT_SYM },  /* PT_CHI2    16 */ { "possibilistic chi^2 measure",         PT_SYM },  /*            17 */ { "<unknown>",                                0 },  /*            18 */ { "<unknown>",                                0 },  /*            19 */ { "<unknown>",                                0 },  /* PT_MUTSPC  20 */ { "mutual specificity",                  PT_SYM },  /*            21 */ { "<unknown>",                                0 },  /*            22 */ { "<unknown>",                                0 },  /*            23 */ { "<unknown>",                                0 },  /*            24 */ { "<unknown>",                                0 },  /* PT_UNKNOWN 25 */ { "<unknown>",                                0 }};                              /* measures for probability trees *//*----------------------------------------------------------------------  Functions to Compute Marginals----------------------------------------------------------------------*/static float _marg_sum (PTNODE *node, PTLVL *lvl){                               /* --- marginalize by summing */  int    i, j;                  /* loop variables */  PTNODE **p;                   /* to traverse the child vector */  float  *bp, *bc, *c;          /* to traverse the buffers/counters */  float  sum = 0;               /* sum of frequencies */  assert(node && lvl            /* check the function arguments */      && (lvl[1].index < 0));   /* (next level must be leaves) */  for (bc = lvl[1].buf +(i = lvl[1].cnt); --i >= 0; )    *--bc = 0;                  /* clear child marginals */  p = node->children +lvl->cnt; /* traverse the child vector */  for (bp = lvl[0].buf +(j = lvl[0].cnt); --j >= 0; ) {    *--bp = 0;                  /* clear parent marginals */    if (!*--p) continue;        /* if there is no child, skip value */    c = ((PTLEAF*)*p)->cnts +lvl[1].cnt;    for (bc = lvl[1].buf +(i = lvl[1].cnt); --i >= 0; ) {      *  bp += *--c;            /* aggregate the values */      *--bc += *  c;            /* of the joint distribution */    }                           /* in the parent/child buffers */    sum += *bp;                 /* compute the global total */  }  return lvl[0].total = sum;    /* store the total frequency */}  /* _marg_sum() *//*--------------------------------------------------------------------*/static float _marg_max (PTNODE *node, PTLVL *lvl){                               /* --- marginalize by maximizing */  int    i, j;                  /* loop variables */  PTNODE **p;                   /* to traverse the child vector */  float  *bp, *bc, *c;          /* to traverse the buffers/counters */  float  max = 0;               /* maximum of frequencies */  assert(node && lvl            /* check the function arguments */      && (lvl[1].index < 0));   /* (next level must be leaves) */  for (bc = lvl[1].buf +(i = lvl[1].cnt); --i >= 0; )    *--bc = 0;                  /* clear child marginals */  p = node->children +lvl->cnt; /* traverse the child vector */  for (bp = lvl[0].buf +(j = lvl[0].cnt); --j >= 0; ) {    *--bp = 0;                  /* clear parent marginals */    if (!*--p) continue;        /* if there is no child, skip value */    c = ((PTLEAF*)*p)->cnts +lvl[1].cnt;    for (bc = lvl[1].buf +(i = lvl[1].cnt); --i >= 0; ) {      if (*--c > *bp) *bp = *c; /* aggregate the values */      if (*c > *--bc) *bc = *c; /* of the joint distribution */    }                           /* in the parent/child buffers */    if (*bp > max) max = *bp;   /* compute the global maximum */  }  return lvl[0].total = max;    /* store the global maximum */}  /* _marg_max() *//*----------------------------------------------------------------------  Probabilistic Conditional Dependence Measures----------------------------------------------------------------------*/static double _info (PTNODE *node, PTLVL *lvl,                     int measure, double *params){                               /* --- Shannon information measures */  int    i, j;                  /* loop variables */  PTNODE **p;                   /* to traverse the child vector */  float  *c;                    /* to traverse the buffers/counters */  double s_i, s_j, s_ij;        /* sums for entropy computation */  double info, t;               /* information gain (ratio), buffer */  assert(node && lvl);          /* check the function arguments */  s_i = s_j = s_ij = 0;         /* and initialize the sums */  for (c = lvl[1].buf +(i = lvl[1].cnt); --i >= 0; )    if (*--c > 0) s_i += *c *log(*c);  for (c = lvl[0].buf +(j = lvl[0].cnt); --j >= 0; )    if (*--c > 0) s_j += *c *log(*c);  for (p = node->children +(j = lvl[0].cnt); --j >= 0; ) {    if (!*--p) continue;        /* traverse the existing children */    t = 0;                      /* process the joint distribution */    for (c = ((PTLEAF*)*p)->cnts +(i = lvl[1].cnt); --i >= 0; )      if (*--c > 0) t += *c *log(*c);    s_ij += t;                  /* process leaves individually and */  }                             /* sum the results (higher accuracy) */  t = lvl[0].total; t *= log(t);/* compute N *log(N) only once */  s_i  = t -s_i;                /* N H_i  = -N sum_i  p_i  *log(p_i)  */  s_j  = t -s_j;                /* N H_j  = -N sum_j  p_j  *log(p_j)  */  s_ij = t -s_ij;               /* N H_ij = -N sum_ij p_ij *log(p_ij) */  info = s_i +s_j -s_ij;        /* compute information gain *N *ln(2) */  switch (measure) {            /* evaluate measure code */    case PT_INFSGR1: if (s_ij     <= 0) return 0;                     info /= s_ij;               break;    case PT_INFSGR2: if (s_i +s_j <= 0) return 0;                     info /= s_i +s_j;           break;    default        : info /= LN_2 *lvl[0].total; break;  }                             /* form requested entropy ratio */  return (info < 0) ? 0 : info; /* return the information measure */}  /* _info() */
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -