⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 entropia.c

📁 source codes for ltr
💻 C
字号:
/*********************************************************************//*        LINEAR TREE for Supervised Learning                        *//*        Versao 1.0 (10/12/1997)                                    *//*        Developed by: Joao Gama                                    *//*                LIACC - Uni.do Porto                               *//*                jgama@ncc.up.pt                                    *//*-------------------------------------------------------------------*//*   File: Entropia.c                                                *//*********************************************************************/#include <stdio.h>#include <stdlib.h>#include <math.h>#include "Ci_instances.h"#include "entropia.h"#include "externs.i"#define LOG2    0.6931471806#define EPSILON 0.0000001#define MYLOG(x) ((x) == 0.0 ? 0.0 : log(x) / LOG2)static double Hy = 0.0;/****************************************//*   Prototyps for Private FUNCTIONS    *//****************************************/static EntAtt *MakeEntAtt();static void InitEntAtt(EntAtt *entatt,  int k);static double entropia_atributo_discreto(CiDs *ds, double ***STATIS, int k, double nr_exs, double *Hx);static double entropia_atributo_continuo(CiDs *ds, double *class_freq, long int Low, long int High, int k, double *Hx, double *split);static void best_attribute(EntAtt *Best, int k, double Gain, double GainRatio, double split, int Flag);static void set_best_attribute(EntAtt *Best, int k, double Gain, double GainRatio, double split);/****************************************//*   Public FUNCTIONS                   *//****************************************/EntAtt *entropia\(CiDs *ds, double ***STATIS, double *class_freq, int nr_att, long int Low, long int High, double *Mean){  register int i, Att, nr = 0;  double p, Hx, H, split, Gain_Mean = 0.0;  EntAtt *Best = MakeEntAtt();    Hy = 0.0;  for(i = 1; i <= Ci_NrClasses(CI_Domain(ds)); i++)     if (class_freq[i]) {      p = (double) class_freq[i] / class_freq[0];      Hy -= p * MYLOG(p);    }    for(Att = 1; Att <= nr_att; Att++) {    H = Hx = split = 0.0;    switch(CiTypeAttr(ds->domain, Att)) {    case integer:    case ordered:    case nominal:      if ((H = entropia_atributo_discreto(ds, STATIS, Att, class_freq[0], &Hx)) != -1) {	best_attribute(Best, Att, H, Hx, 0.0, GAIN_RATIO);	Gain_Mean = (Gain_Mean * nr + H) / ++nr;      }      break;    case continuous:      CiQuickSort(ds, Att, Low, High);      if ((H = entropia_atributo_continuo(ds, class_freq, Low, High, Att, &Hx, &split)) != -1) {	best_attribute(Best, Att, H, Hx, split, GAIN_RATIO);	Gain_Mean = (Gain_Mean * nr + H) / ++nr;      }    }  }  *Mean = Gain_Mean;  return Best;}/****************************************//*   Private FUNCTIONS                  *//****************************************/static void InitEntAtt(EntAtt *entatt,  int k){  if (entatt) {    entatt->attribute = k;    entatt->cut_point = 0.0;    entatt->gain = 0.0;    entatt->info = 1.0;  }}static EntAtt *MakeEntAtt(){  EntAtt *entatt = (EntAtt *) malloc(sizeof(EntAtt));  if (entatt) {    entatt->attribute = -1;    entatt->cut_point = 0.0;    entatt->gain = 0.0;    entatt->info = 1.0;  }  return entatt;}static void best_attribute\(EntAtt *Best, int k, double Gain, double Info, double split, int Flag_GAIN){  if (Best->attribute == -1)     set_best_attribute(Best, k, Gain, Info, split);  else     if (Flag_GAIN) {      if (Gain >= 0.0 || Info > EPSILON)	if (Gain / Info > Best->gain / Best->info) 	  set_best_attribute(Best, k, Gain, Info, split);    }    else {      if (Gain > Best->gain)	set_best_attribute(Best, k, Gain, Info, split);    }}static void set_best_attribute\(EntAtt *Best, int k, double Gain, double Info, double split){  Best->attribute = k;  Best->cut_point = split;  Best->gain = Gain;  Best->info = Info ? Info : EPSILON;}static double entropia_atributo_discreto\(CiDs *ds, double ***STATIS, int Att, double nr_exs, double *Hx){  register int i, j, nr_splits = 0;  double p, H = 0.0;  *Hx = 0.0;  for(i = 1; i <= NValsAttr(ds->domain, Att); i++) {    if(STATIS[Att][0][i]) {      p = (double) STATIS[Att][0][i] / ((double) nr_exs - STATIS[Att][0][0]);      *Hx -= p * MYLOG(p);      ++nr_splits;    }        for(j = 1; j <= Ci_NrClasses(ds->domain); j++)      if(STATIS[Att][j][i]) {        p = (double) STATIS[Att][j][i] / (double) STATIS[Att][0][i];        H -= (STATIS[Att][0][i] / ((double) nr_exs - STATIS[Att][0][0])) * p * MYLOG(p);      }  }  p = Hy - H;    /*    Correction due to unknown values  */  if (STATIS[Att][0][0]) {    p = (double) (nr_exs - STATIS[Att][0][0]) / (double) nr_exs;    p = Hy - H + p * MYLOG(p);  }  if (nr_splits > 1) return p;  return -1.0;}static double entropia_atributo_continuo\(CiDs *ds, double *class_freq, long int Low, long int High, int Att, double *Hx, double *split){  register int i, j, z, classe, change_class, nr_unk = 0, nr_splits = 0;  double minsplit;  double p, p1, H, split_value, mean = 0.0;  double inf = 0.0;  EntAtt entatt;  double nr_exs = class_freq[0], COUNTER_CLASS[Ci_NrClasses(ds->domain) + 1];  for(z = 0; z <= Ci_NrClasses(ds->domain); z++) COUNTER_CLASS[z] = 0;  InitEntAtt(&entatt, Att);  if (nr_exs <= 2 * MINIMUM_EXAMPLES)    return -1;  i = Low;  minsplit = 0.10 * class_freq[0] / (Ci_NrClasses(ds->domain) +1);  if (minsplit < MINIMUM_EXAMPLES) minsplit = MINIMUM_EXAMPLES;  if (minsplit > 25.0) minsplit = 25.0;      /* Skip Unknowns */  while (i < High && TypeOfVal(Ci_AttVal(ds, i)[Att]) != normal) {    nr_unk += Ci_Weight(Ci_Example(ds, i));    ++i;  }  ++i;      for(; i < High; i++) {    --i;    inf = 0.0;    change_class = FALSE;         classe = Ci_Classe(Ci_Example(ds, i));    COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i));    COUNTER_CLASS[classe] += Ci_Weight(Ci_Example(ds, i));    ++i;    /* Find Boundaries Cut Points */    while (i < High && COUNTER_CLASS[0] < minsplit) {      COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i));      COUNTER_CLASS[Ci_Classe(Ci_Example(ds, i))] += Ci_Weight(Ci_Example(ds, i));      ++i;      nr_splits = 1;    }    split_value = (CValAttEx(Ci_AttVal(ds, i),Att) + CValAttEx(Ci_AttVal(ds, i-1),Att)) / 2.0;    while (i < High && split_value >= CValAttEx(Ci_AttVal(ds, i),Att)) {      COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i));      if (classe != Ci_Classe(Ci_Example(ds, i))) 	change_class = TRUE;      COUNTER_CLASS[Ci_Classe(Ci_Example(ds, i))] += Ci_Weight(Ci_Example(ds, i));      ++i;    }    ++nr_splits;     split_value = (CValAttEx(Ci_AttVal(ds, i-1),Att) + CValAttEx(Ci_AttVal(ds, i),Att) ) / 2.0;    classe = Ci_Classe(Ci_Example(ds, i));    if (COUNTER_CLASS[0] >= minsplit && nr_exs - COUNTER_CLASS[0] >= minsplit) {      p = (double) COUNTER_CLASS[0] / (double) nr_exs;       p1 = (double)(nr_exs - COUNTER_CLASS[0]) / (double) nr_exs;      inf -=  p * MYLOG(p) + p1 * MYLOG(p1);            H = 0.0;      for(j = 1; j <= Ci_NrClasses(ds->domain); j++) {	if (COUNTER_CLASS[j]) {	  p = (double) COUNTER_CLASS[j] / (double) COUNTER_CLASS[0];	  H -= (COUNTER_CLASS[0] / (double) nr_exs) * p * MYLOG(p);	}	if (class_freq[j] - COUNTER_CLASS[j]){	  p1 = (double) (class_freq[j] - COUNTER_CLASS[j]) / (double) (nr_exs - COUNTER_CLASS[0]);	  H -= ((nr_exs - COUNTER_CLASS[0]) / (double) nr_exs) * p1 * MYLOG(p1);	}      }            mean += (Hy - H);      best_attribute(&entatt, Att, Hy-H, inf, split_value, FALSE);    }    else      --nr_splits;  }  *split = entatt.cut_point;  *Hx = entatt.info;  mean = (nr_splits ? mean / nr_splits : 1E-6);  if (nr_unk) {        p = (double) (nr_exs - nr_unk)  / (double) nr_exs;    entatt.gain += p * MYLOG(p);  }  if (nr_splits <= 0 ||       (entatt.gain  <= MYLOG((double) nr_splits) / (double) nr_exs - 1E-3) ||  mean < 1E-5) {    return -1.0;  }  return(entatt.gain - MYLOG((double) nr_splits) / nr_exs);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -