📄 frqtab.c
字号:
/*---------------------------------------------------------------------- File : frqtab.c Contents: frequency table management Author : Christian Borgelt History : 26.06.1997 file created 29.07.1997 first version completed 11.08.1997 some functions changed to #define 25.08.1997 functions ft_comb, ft_uncomb, and ft_dest added 18.09.1997 bug in measure evaluation removed 24.09.1997 function ft_alldst added 29.09.1997 bug in function ft_comb removed 30.09.1997 bug in evaluation with combined columns removed 09.02.1998 order of evaluation measures changed 24.02.1998 bug in function _wevid fixed 23.03.1998 parameters added to evaluation functions 20.03.1999 all 'float' fields/variables changed to 'double' 25.10.1999 evaluation function _wdiff added 15.09.2000 some assertions added 02.12.2000 memory alloc. improved, function ft_copy added 03.12.2000 table access optimized (concerning index -1) 02.03.2001 evaluation measure FEM_INFGBAL added 26.05.2001 computation of ln(\Gamma(n)) improved 26.09.2001 bug in clean up in ft_create removed 02.01.2002 measure FEM_SPCGBAL added, FEM_INFGBAL corrected 06.01.2002 switched to sorting functions from vecops 11.01.2002 measure FEM_CHI2NRM added 22.01.2002 computations in _wdiff improved 31.01.2002 computation of Gini index measures improved 02.02.2002 BD and description length measures reprogrammed 04.02.2002 quadratic information measures added 04.07.2002 bug in function _bdm fixed (equiv. sample size)----------------------------------------------------------------------*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include <assert.h>#include "vecops.h"#include "gamma.h"#include "frqtab.h"#ifdef STORAGE#include "storage.h"#endif/*---------------------------------------------------------------------- Preprocessor Definitions----------------------------------------------------------------------*/#ifdef FT_EVAL#define M_PI 3.14159265358979323846 /* \pi */#define LN_2 0.69314718055994530942 /* ln(2) */#define EPSILON 1e-12 /* to handle roundoff errors *//*---------------------------------------------------------------------- Type Definitions----------------------------------------------------------------------*/typedef double EVALFN (FRQTAB* ftab, int measure, double *params);/*---------------------------------------------------------------------- Constants----------------------------------------------------------------------*/static const char* mnames[FEM_UNKNOWN+1] = { /* FEM_NONE 0 */ "no measure", /* FEM_INFGAIN 1 */ "information gain", /* FEM_INFGBAL 2 */ "balanced information gain", /* FEM_INFGR 3 */ "information gain ratio", /* FEM_INFSGR1 4 */ "symmetric information gain ratio 1", /* FEM_INFSGR2 5 */ "symmetric information gain ratio 2", /* FEM_QIGAIN 1 */ "quadratic information gain", /* FEM_QIGBAL 2 */ "balanced quadratic information gain", /* FEM_QIGR 3 */ "quadratic information gain ratio", /* FEM_QISGR1 4 */ "symmetric quadratic information gain ratio 1", /* FEM_QISGR2 5 */ "symmetric quadratic information gain ratio 2", /* FEM_GINI 6 */ "Gini index", /* FEM_GINISYM 7 */ "symmetric Gini index", /* FEM_GINIMOD 8 */ "modified Gini index", /* FEM_RELIEF 9 */ "relief measure", /* FEM_WDIFF 10 */ "sum of weighted differences", /* FEM_CHI2 11 */ "chi^2 measure", /* FEM_CHI2NRM 12 */ "normalized chi^2 measure", /* FEM_WEVID 13 */ "weight of evidence", /* FEM_RELEV 14 */ "relevance", /* FEM_BDM 15 */ "Bayesian-Dirichlet / K2 metric", /* FEM_BDMOD 16 */ "modified Bayesian-Dirichlet / K2 metric", /* FEM_RDLREL 17 */ "reduction of description length (rel. freq.)", /* FEM_RDLABS 18 */ "reduction of description length (abs. freq.)", /* FEM_STOCO 19 */ "stochastic complexity", /* FEM_SPCGAIN 20 */ "specificity gain", /* FEM_SPCGAIN 21 */ "balanced specificity gain", /* FEM_SPCGR 22 */ "specificity gain ratio", /* FEM_SPCSGR1 23 */ "symmetric specificity gain ratio 1", /* FEM_SPCSGR2 24 */ "symmetric specificity gain ratio 2", /* FEM_UNKNOWN 25 */ "<unknown measure>",}; /* names of evaluation measures *//*---------------------------------------------------------------------- Auxiliary Functions----------------------------------------------------------------------*/static double _nsp (double *dist, int cnt){ /* --- compute nonspecificity */ double nsp = 0; /* nonspecificity */ double prec = 0; /* preceding frequency */ double t; /* temporary buffer */ assert(dist && (cnt >= 0)); /* check the function arguments */ v_dblsort(dist, cnt); /* sort the frequencies */ for ( ; cnt > 1; cnt--) { /* and then traverse them */ t = *dist -prec; prec = *dist++; if (t > 0) nsp += t *log(cnt); } /* calculate and return the */ return nsp; /* nonspecificity of the distribution */} /* _nsp() *//*---------------------------------------------------------------------- Evaluation Functions----------------------------------------------------------------------*/static double _info (FRQTAB *ftab, int measure, double *params){ /* --- Shannon information measures */ int x, y; /* loop variables */ double **c; /* to traverse the table columns */ double *fx, *fy, *fxy; /* to traverse the frequencies */ double s_x, s_y, s_xy; /* sums for entropy computation */ double info, t; /* information gain (ratio), buffer */ assert(ftab); /* check the function argument */ if (ftab->known < EPSILON) return 0; s_x = s_y = s_xy = 0; /* process the row distribution */ for (fy = ftab->frq_y +(y = ftab->ycnt); --y >= 0; ) if (*--fy > 0) s_y += *fy *log(*fy); c = ftab->frq_xy +ftab->xcnt; /* process the column distribution */ for (fx = ftab->frq_x +(x = ftab->xcnt); --x >= 0; --c) { if (*--fx <= 0) continue; /* skip empty and combined columns */ s_x += *fx *log(*fx); /* process the column distribution */ t = 0; /* and a conditional distribution */ for (fxy = *c +(y = ftab->ycnt); --y >= 0; ) if (*--fxy > 0) t += *fxy *log(*fxy); s_xy += t; /* process columns individually and */ } /* sum the results (higher accuracy) */ t = ftab->known; t *= log(t); /* compute N *log(N) only once */ s_x = t -s_x; /* N H_x = -N sum_x p_x *log(p_x) */ s_y = t -s_y; /* N H_y = -N sum_y p_y *log(p_y) */ s_xy = t -s_xy; /* N H_xy = -N sum_xy p_xy *log(p_xy) */ info = s_x +s_y -s_xy; /* compute information gain *N *ln(2) */ switch (measure & 0xff) { /* evaluate the measure code */ case FEM_INFGBAL: info /= log(ftab->xcnt) *ftab->known; break; case FEM_INFGR : if (s_x <= 0) return 0; info /= s_x; break; case FEM_INFSGR1: if (s_xy <= 0) return 0; info /= s_xy; break; case FEM_INFSGR2: if (s_x +s_y <= 0) return 0; info /= s_x +s_y; break; default: info /= LN_2 *ftab->known; break; } /* form requested entropy ratio */ if (measure & FEF_WGTD) return info *(ftab->known/ftab->frq); return info; /* return the information measure */} /* _info() *//*--------------------------------------------------------------------*/static double _quad (FRQTAB *ftab, int measure, double *params){ /* --- quadratic information measures */ int x, y; /* loop variables */ double **c; /* to traverse the table columns */ double *fx, *fy, *fxy; /* to traverse the frequencies */ double s_x, s_y, s_xy; /* sum of squared frequencies */ double quad, t; /* information gain (ratio), buffer */ assert(ftab); /* check the function argument */ if (ftab->known < EPSILON) return 0; s_y = s_x = s_xy = 0; /* process the row distribution */ for (fy = ftab->frq_y +(y = ftab->ycnt); --y >= 0; ) { --fy; s_y += *fy * *fy; } /* compute sum_y N(y)^2 */ c = ftab->frq_xy +ftab->xcnt; /* process the joint distribution */ for (fx = ftab->frq_x +(x = ftab->xcnt); --x >= 0; --c) { if (*--fx <= 0) continue; /* skip empty and combined columns */ s_x += *fx * *fx; /* process the column distribution */ t = 0; /* and a conditional distribution */ for (fxy = *c +(y = ftab->ycnt); --y >= 0; ) { --fxy; t += *fxy * *fxy; } s_xy += t; /* compute sum_xy N(x,y)^2 */ } t = ftab->known; t *= t; /* compute N^2 only once */ s_x = t -s_x; /* N^2/2 H^2_i = N^2 -sum_i N_i^2 */ s_y = t -s_y; /* N^2/2 H^2_j = N^2 -sum_j N_j^2 */ s_xy = t -s_xy; /* N^2/2 H^2_ij = N^2 -sum_ij N_ij^2 */ quad = s_x +s_y -s_xy; /* compute information gain *N *ln(2) */ switch (measure & 0xff) { /* evaluate the measure code */ case FEM_QIGBAL: quad /= t *(1 -1/ftab->xcnt); break; case FEM_QIGR : if (s_x <= 0) return 0; quad /= s_x; break; case FEM_QISGR1: if (s_xy <= 0) return 0; quad /= s_xy; break; case FEM_QISGR2: if (s_x +s_y <= 0) return 0; quad /= s_x +s_y; break; default: quad /= 0.5 *t; break; } /* form requested entropy ratio */ if (measure & FEF_WGTD) return quad *(ftab->known/ftab->frq); return quad; /* return the information measure */} /* _quad() *//*--------------------------------------------------------------------*/static double _gini (FRQTAB *ftab, int measure, double *params){ /* --- Gini index/relief measure */ int x, y; /* loop variables */ double **c; /* to traverse the table columns */ double *fx, *fy, *fxy; /* to traverse the frequencies */ double s_x, s_y, s_xy; /* sum of squared frequencies */ double w_xy, w_yx; /* weighted sum of squared freq. */ double gini, t; /* Gini index / relief measure */ assert(ftab); /* check the function argument */ if (ftab->known < EPSILON) return 0; s_y = s_xy = w_yx = 0; /* process the row distribution */ for (fy = ftab->frq_y +(y = ftab->ycnt); --y >= 0; ) { --fy; s_y += *fy * *fy; } /* compute sum_y N(y)^2 */ c = ftab->frq_xy +ftab->xcnt; /* process the joint distribution */ for (fx = ftab->frq_x +(x = ftab->xcnt); --x >= 0; --c) { if (*--fx <= 0) continue; /* skip empty and combined columns */ t = 0; /* process a conditional distribution */ for (fxy = *c +(y = ftab->ycnt); --y >= 0; ) { --fxy; t += *fxy * *fxy; } s_xy += t; /* compute sum_xy N(x,y)^2 and */ w_yx += t / *fx; /* sum_x 1/N(x) sum_y N(x,y)^2 */ } if ((measure & 0xff) == FEM_GINI) { return (w_yx -s_y /ftab->known) / ((measure & FEF_WGTD) ? ftab->frq : ftab->known); } /* compute and return the Gini index */ s_x = w_xy = 0; /* process the column distribution */ for (fx = ftab->frq_x +(x = ftab->xcnt); --x >= 0; ) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -