📄 entropia.c
字号:
/*********************************************************************//* LINEAR TREE for Supervised Learning *//* Versao 1.0 (10/12/1997) *//* Developed by: Joao Gama *//* LIACC - Uni.do Porto *//* jgama@ncc.up.pt *//*-------------------------------------------------------------------*//* File: Entropia.c *//*********************************************************************/#include <stdio.h>#include <stdlib.h>#include <math.h>#include "Ci_instances.h"#include "entropia.h"#include "externs.i"#define LOG2 0.6931471806#define EPSILON 0.0000001#define MYLOG(x) ((x) == 0.0 ? 0.0 : log(x) / LOG2)static double Hy = 0.0;/****************************************//* Prototyps for Private FUNCTIONS *//****************************************/static EntAtt *MakeEntAtt();static void InitEntAtt(EntAtt *entatt, int k);static double entropia_atributo_discreto(CiDs *ds, double ***STATIS, int k, double nr_exs, double *Hx);static double entropia_atributo_continuo(CiDs *ds, double *class_freq, long int Low, long int High, int k, double *Hx, double *split);static void best_attribute(EntAtt *Best, int k, double Gain, double GainRatio, double split, int Flag);static void set_best_attribute(EntAtt *Best, int k, double Gain, double GainRatio, double split);/****************************************//* Public FUNCTIONS *//****************************************/EntAtt *entropia\(CiDs *ds, double ***STATIS, double *class_freq, int nr_att, long int Low, long int High, double *Mean){ register int i, Att, nr = 0; double p, Hx, H, split, Gain_Mean = 0.0; EntAtt *Best = MakeEntAtt(); Hy = 0.0; for(i = 1; i <= Ci_NrClasses(CI_Domain(ds)); i++) if (class_freq[i]) { p = (double) class_freq[i] / class_freq[0]; Hy -= p * MYLOG(p); } for(Att = 1; Att <= nr_att; Att++) { H = Hx = split = 0.0; switch(CiTypeAttr(ds->domain, Att)) { case integer: case ordered: case nominal: if ((H = entropia_atributo_discreto(ds, STATIS, Att, class_freq[0], &Hx)) != -1) { best_attribute(Best, Att, H, Hx, 0.0, GAIN_RATIO); Gain_Mean = (Gain_Mean * nr + H) / ++nr; } break; case continuous: CiQuickSort(ds, Att, Low, High); if ((H = entropia_atributo_continuo(ds, class_freq, Low, High, Att, &Hx, &split)) != -1) { best_attribute(Best, Att, H, Hx, split, GAIN_RATIO); Gain_Mean = (Gain_Mean * nr + H) / ++nr; } } } *Mean = Gain_Mean; return Best;}/****************************************//* Private FUNCTIONS *//****************************************/static void InitEntAtt(EntAtt *entatt, int k){ if (entatt) { entatt->attribute = k; entatt->cut_point = 0.0; entatt->gain = 0.0; entatt->info = 1.0; }}static EntAtt *MakeEntAtt(){ EntAtt *entatt = (EntAtt *) malloc(sizeof(EntAtt)); if (entatt) { entatt->attribute = -1; entatt->cut_point = 0.0; entatt->gain = 0.0; entatt->info = 1.0; } return entatt;}static void best_attribute\(EntAtt *Best, int k, double Gain, double Info, double split, int Flag_GAIN){ if (Best->attribute == -1) set_best_attribute(Best, k, Gain, Info, split); else if (Flag_GAIN) { if (Gain >= 0.0 || Info > EPSILON) if (Gain / Info > Best->gain / Best->info) set_best_attribute(Best, k, Gain, Info, split); } else { if (Gain > Best->gain) set_best_attribute(Best, k, Gain, Info, split); }}static void set_best_attribute\(EntAtt *Best, int k, double Gain, double Info, double split){ Best->attribute = k; Best->cut_point = split; Best->gain = Gain; Best->info = Info ? Info : EPSILON;}static double entropia_atributo_discreto\(CiDs *ds, double ***STATIS, int Att, double nr_exs, double *Hx){ register int i, j, nr_splits = 0; double p, H = 0.0; *Hx = 0.0; for(i = 1; i <= NValsAttr(ds->domain, Att); i++) { if(STATIS[Att][0][i]) { p = (double) STATIS[Att][0][i] / ((double) nr_exs - STATIS[Att][0][0]); *Hx -= p * MYLOG(p); ++nr_splits; } for(j = 1; j <= Ci_NrClasses(ds->domain); j++) if(STATIS[Att][j][i]) { p = (double) STATIS[Att][j][i] / (double) STATIS[Att][0][i]; H -= (STATIS[Att][0][i] / ((double) nr_exs - STATIS[Att][0][0])) * p * MYLOG(p); } } p = Hy - H; /* Correction due to unknown values */ if (STATIS[Att][0][0]) { p = (double) (nr_exs - STATIS[Att][0][0]) / (double) nr_exs; p = Hy - H + p * MYLOG(p); } if (nr_splits > 1) return p; return -1.0;}static double entropia_atributo_continuo\(CiDs *ds, double *class_freq, long int Low, long int High, int Att, double *Hx, double *split){ register int i, j, z, classe, change_class, nr_unk = 0, nr_splits = 0; double minsplit; double p, p1, H, split_value, mean = 0.0; double inf = 0.0; EntAtt entatt; double nr_exs = class_freq[0], COUNTER_CLASS[Ci_NrClasses(ds->domain) + 1]; for(z = 0; z <= Ci_NrClasses(ds->domain); z++) COUNTER_CLASS[z] = 0; InitEntAtt(&entatt, Att); if (nr_exs <= 2 * MINIMUM_EXAMPLES) return -1; i = Low; minsplit = 0.10 * class_freq[0] / (Ci_NrClasses(ds->domain) +1); if (minsplit < MINIMUM_EXAMPLES) minsplit = MINIMUM_EXAMPLES; if (minsplit > 25.0) minsplit = 25.0; /* Skip Unknowns */ while (i < High && TypeOfVal(Ci_AttVal(ds, i)[Att]) != normal) { nr_unk += Ci_Weight(Ci_Example(ds, i)); ++i; } ++i; for(; i < High; i++) { --i; inf = 0.0; change_class = FALSE; classe = Ci_Classe(Ci_Example(ds, i)); COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i)); COUNTER_CLASS[classe] += Ci_Weight(Ci_Example(ds, i)); ++i; /* Find Boundaries Cut Points */ while (i < High && COUNTER_CLASS[0] < minsplit) { COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i)); COUNTER_CLASS[Ci_Classe(Ci_Example(ds, i))] += Ci_Weight(Ci_Example(ds, i)); ++i; nr_splits = 1; } split_value = (CValAttEx(Ci_AttVal(ds, i),Att) + CValAttEx(Ci_AttVal(ds, i-1),Att)) / 2.0; while (i < High && split_value >= CValAttEx(Ci_AttVal(ds, i),Att)) { COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i)); if (classe != Ci_Classe(Ci_Example(ds, i))) change_class = TRUE; COUNTER_CLASS[Ci_Classe(Ci_Example(ds, i))] += Ci_Weight(Ci_Example(ds, i)); ++i; } ++nr_splits; split_value = (CValAttEx(Ci_AttVal(ds, i-1),Att) + CValAttEx(Ci_AttVal(ds, i),Att) ) / 2.0; classe = Ci_Classe(Ci_Example(ds, i)); if (COUNTER_CLASS[0] >= minsplit && nr_exs - COUNTER_CLASS[0] >= minsplit) { p = (double) COUNTER_CLASS[0] / (double) nr_exs; p1 = (double)(nr_exs - COUNTER_CLASS[0]) / (double) nr_exs; inf -= p * MYLOG(p) + p1 * MYLOG(p1); H = 0.0; for(j = 1; j <= Ci_NrClasses(ds->domain); j++) { if (COUNTER_CLASS[j]) { p = (double) COUNTER_CLASS[j] / (double) COUNTER_CLASS[0]; H -= (COUNTER_CLASS[0] / (double) nr_exs) * p * MYLOG(p); } if (class_freq[j] - COUNTER_CLASS[j]){ p1 = (double) (class_freq[j] - COUNTER_CLASS[j]) / (double) (nr_exs - COUNTER_CLASS[0]); H -= ((nr_exs - COUNTER_CLASS[0]) / (double) nr_exs) * p1 * MYLOG(p1); } } mean += (Hy - H); best_attribute(&entatt, Att, Hy-H, inf, split_value, FALSE); } else --nr_splits; } *split = entatt.cut_point; *Hx = entatt.info; mean = (nr_splits ? mean / nr_splits : 1E-6); if (nr_unk) { p = (double) (nr_exs - nr_unk) / (double) nr_exs; entatt.gain += p * MYLOG(p); } if (nr_splits <= 0 || (entatt.gain <= MYLOG((double) nr_splits) / (double) nr_exs - 1E-3) || mean < 1E-5) { return -1.0; } return(entatt.gain - MYLOG((double) nr_splits) / nr_exs);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -