📄 buildtree.c
字号:
/*********************************************************************//* LINEAR TREE for Supervised Learning *//* Versao 1.0 (10/12/1997) *//* Developed by: Joao Gama *//* LIACC - Uni.do Porto *//* jgama@ncc.up.pt *//*-------------------------------------------------------------------*//* File: BuildTree.c *//*********************************************************************/#include <stdio.h>#include <stdlib.h>#include <math.h>#include <unistd.h>#include <errno.h>#include "Ci_instances.h"#include "distributions.h"#include "entropia.h"#include "tree.h"#include "BuildTree.h"#include "Combine.h"#include "discrim.h"#include "utils.h"#include "externs.i"static int c = 0, vc[100];#define NO_MEMORY(a) \ { fprintf (stderr, "\nERROR: %s Unable to allocate memory\n\n",a); exit( 1 ); }static Tree *build_tree(CiDs *ds, long int Low, long int High, int nr_att, TreeInfo *treeinfo, Tree *ascendent, int depth, double *errors, int *max_depth);static Data *Make_Data(double *class_distribution, double *acum_class_distr, int nr_att, double errors, int split_att, double split_value);/*static int Remove_Coefficients(DomainInfo *domain, Tree *tree, int nr_att);*/Tree *BuildTree\(CiDs *ds, long int Low, long int High, int nr_att, TreeInfo *treeinfo){ int depth = 0; double sub_errors = 0.0; Tree *tree; tree = build_tree(ds, Low, High, nr_att, treeinfo, NULL, 0, &sub_errors, &depth); DEPTH(treeinfo) = depth; return tree;}static Tree *build_tree\(CiDs *ds, long int Low, long int High, int nr_att, TreeInfo *treeinfo, Tree *ascendent, int depth, double *errors, int *max_depth){ register int i, k, majority_class; int act_depth = 0, hidden = 0, actual_nr_att, actual_node, actual_leaves; long int posl, posU, posh, Midle; double *ascendent_class_dist = NULL, *acum_class_dist, *dist_class; double node_errors = 0.0, sub_errors = 0.0, mean; double ***stats, **coef = NULL; Data *data; EntAtt *bestent; Tree *arvore = NULL; ++NODES(treeinfo); ++depth; actual_node = NODES(treeinfo); actual_leaves = LEAVES(treeinfo); actual_nr_att = nr_att; if ((stats = My_Distribuitions(ds, Low, High, nr_att, &dist_class)) == NULL) NO_MEMORY("Generating statistics:") if (ascendent != NULL) { data = (Data *) DATA(ascendent); ascendent_class_dist = (double *) CLASS_DIST(data); } acum_class_dist = combing_evidences(DOMAIN_INFO(ds), dist_class, ascendent_class_dist); majority_class = dmajority(acum_class_dist, Ci_NrClasses(ds->domain)); node_errors = dist_class[0] - dist_class[majority_class]; if ((data = Make_Data(dist_class, acum_class_dist, nr_att, node_errors, 0, 0.0)) == NULL) NO_MEMORY("Generating Data:") /***********************************/ /* Pre Prunning */ /***********************************/ if (!dist_class[0]) { free_stats(ds->domain, stats, Ci_NrClasses(ds->domain), nr_att); *max_depth = 1; ++LEAVES(treeinfo); return Make_Tree(data, NODES(treeinfo), 0, ascendent, leaf); } if (dist_class[0] < 2 * MINIMUM_EXAMPLES) { free_stats(ds->domain, stats, Ci_NrClasses(ds->domain), nr_att); ERRORS(treeinfo) += node_errors; *errors += node_errors; ++LEAVES(treeinfo); *max_depth = 1; return Make_Tree(data, NODES(treeinfo), 0, ascendent, leaf); } if (dist_class[majority_class] + MINIMUM_EXAMPLES > dist_class[0]) { free_stats(ds->domain, stats, Ci_NrClasses(ds->domain), nr_att); ERRORS(treeinfo) += node_errors; *errors += node_errors; ++LEAVES(treeinfo); *max_depth = 1; return Make_Tree(data, NODES(treeinfo), 0, ascendent, leaf); } if (dist_class[majority_class] / dist_class[0] >= MIN_SPLIT) { free_stats(ds->domain,stats, Ci_NrClasses(ds->domain), nr_att); ERRORS(treeinfo) += node_errors; *errors += node_errors; ++LEAVES(treeinfo); *max_depth = 1; return Make_Tree(data, NODES(treeinfo), 0, ascendent, leaf); } /***********************************/ /* Build New Attributes */ /***********************************/ if (MAX_DEPTH > depth) { if ((coef = discriminant(ds, Low, High, stats, dist_class, nr_att, Ci_NrClasses(ds->domain), &hidden)) != NULL) { if (hidden) { apply_discriminant(ds, coef, Low, High, nr_att); nr_att += hidden; } } } /***********************************/ /* Select Attribute */ /***********************************/ bestent = entropia(ds, stats, dist_class, nr_att, Low, High, &mean); if (ENT_ATT(bestent) == -1) { ++COLAPSES(treeinfo); ERRORS(treeinfo) += node_errors; *errors += node_errors; ++LEAVES(treeinfo); free(bestent); free_stats(ds->domain, stats, Ci_NrClasses(ds->domain), actual_nr_att); *max_depth = 1; return Make_Tree(data, NODES(treeinfo), 0, ascendent, leaf); } SPLIT_ATT(data) = ENT_ATT(bestent); SPLIT_VALUE(data) = ENT_SPLIT(bestent); switch(CiTypeAttr(ds->domain,ENT_ATT(bestent))) { case integer: case ordered: case nominal: posl = Low; arvore = Make_Tree(data, NODES(treeinfo), NValsAttr(ds->domain,ENT_ATT(bestent)), ascendent, split_discrete); posU = CiJoinUnknowns(ds, ENT_ATT(bestent), Low, High); for(i = Low; i < posU; i++) Ci_Weight(Ci_Example(ds, i)) /= NValsAttr(ds->domain,ENT_ATT(bestent)); for(i = 1; i <= NValsAttr(ds->domain,ENT_ATT(bestent)) ; i++){ posh = CiJoinValues(ds, ENT_ATT(bestent), i, posU, High); DESCENDENT(arvore, i) = build_tree(ds, Low, posh-1, nr_att, treeinfo, arvore, depth, &sub_errors, &act_depth); if (*max_depth < act_depth) *max_depth = act_depth; act_depth = 0; posU = CiJoinUnknowns(ds, ENT_ATT(bestent), Low, posh-1); for(k = Low; k < posU;k++) Ci_ReBuildInstance(Ci_Example(ds, k), nr_att); } break; case continuous: arvore = Make_Tree(data, NODES(treeinfo), 2, ascendent, split_continuous); CiQuickSort(ds, ENT_ATT(bestent), Low, High); posU = Low; while(!NormalVal(Ci_AttVal(ds, posU)[ENT_ATT(bestent)])) Ci_Weight(Ci_Example(ds, posU++)) /= 2.0; Midle = CiSplitingPosition(ds, ENT_ATT(bestent), Low, High, ENT_SPLIT(bestent)); DESCENDENT(arvore, 1) = build_tree(ds, Low, Midle, nr_att, treeinfo, arvore, depth, &sub_errors, &act_depth); if (*max_depth < act_depth) *max_depth = act_depth; act_depth = 0; posU = CiJoinUnknowns(ds, ENT_ATT(bestent), Low, Midle); for(k = Low; k < posU;k++) Ci_ReBuildInstance(Ci_Example(ds, k), nr_att); Midle = CiMoveUnknowns(ds, ENT_ATT(bestent), Low, posU, Midle); DESCENDENT(arvore, 2) = build_tree(ds, Midle+1, High, nr_att, treeinfo, arvore, depth, &sub_errors, &act_depth); if (*max_depth < act_depth) *max_depth = act_depth; break; }/*************************************//* Pos_prunning: Error reduction *//*************************************/ if(node_errors <= sub_errors) { ERRORS(treeinfo) += node_errors; ERRORS(treeinfo) -= sub_errors; NODES(treeinfo) = actual_node; LEAVES(treeinfo) = 1 + actual_leaves; *errors = node_errors; DESCENDENTS(arvore) = NULL; NR_DESCENDENTS(arvore) = 0; arvore->tipo = leaf; *max_depth = 0; ++COLAPSES(treeinfo); } else{ COEFICIENTS(data) = coef; COLUMNS(data) = actual_nr_att + 1 - Nr_Att_Non_Cont(); ROWS(data) = hidden; } free(bestent); free_stats(ds->domain, stats, Ci_NrClasses(ds->domain), actual_nr_att); ++(*max_depth); return arvore;}static Data *Make_Data(double *class_distribution, double *acum_class_distr, int nr_att, double errors, int split_att, double split_value){ Data *data = NULL; if ((data = (Data *) malloc(sizeof(Data))) != NULL) { CLASS_DIST(data) = class_distribution; ACUM_CLASS_DIST(data) = acum_class_distr; NR_ATT(data) = nr_att; COEFICIENTS(data) = NULL; COLUMNS(data) = 0; ROWS(data) = 0; DERRORS(data) = errors; TREE_ERRORS(data) = 0.0; SPLIT_ATT(data) = split_att; SPLIT_VALUE(data) = split_value; } return data;}void ShowTree(DomainInfo *domain, Data *data, int nr, TYPE_NODE t){ register int i, Att, majority_class; Att = SPLIT_ATT(data); switch (t) { case leaf: majority_class = dmajority(ACUM_CLASS_DIST(data), Ci_NrClasses(domain)); printf("%s (%.2f/%.2f) [ ", LblValId(domain,NrAttrs(domain) + 1 ,majority_class), CLASS_DIST(data)[0], DERRORS(data)); for(i = 1; i <= Ci_NrClasses(domain); i++) printf("%.3f ",ACUM_CLASS_DIST(data)[i]); printf("]"); break; case split_discrete: printf("%s = %s",NameAttr(domain, Att), LblValId(domain, Att, nr)); break; case split_continuous: case split_linear: switch (nr) { case 1: if (Att > NrAttrs(domain)) printf("Linear_%d <= %.3f", Att, SPLIT_VALUE(data)); else printf("%s <= %.3f",NameAttr(domain,Att), SPLIT_VALUE(data)); break; case 2: if (Att > NrAttrs(domain)) printf("Linear_%d > %.3f", Att, SPLIT_VALUE(data)); else printf("%s > %.3f",NameAttr(domain, Att), SPLIT_VALUE(data)); break; } break; } printf("\n");}void Show_Coeficients(DomainInfo *domain, Tree *tree){ register int i, j, att, nr_rows, nr_columns; double **coef; Data *data; static int nr = 0; data = DATA(tree); if (!nr) nr = NR_ATT(data); switch(tree->tipo) { case leaf: break; case split_discrete: case split_continuous: case split_linear: nr_rows = ROWS(data); nr_columns = COLUMNS(data); coef = COEFICIENTS(data); if (nr_rows && nr_columns && coef) { for (i = 1; i <= nr_rows; i++) { ++nr; printf("Linear_%d\t", nr); printf("%+.3f", coef[i][1]); for(j = 2, att = 1; att <= NR_ATT(data); att++) if (CiTypeAttr(domain, att)) if(coef[i][j]) printf("%+.3f*[%d]", coef[i][j++], att); else ++j; printf("\n"); } } for(i = 1; i <= NR_DESCENDENTS(tree); i++) Show_Coeficients(domain, DESCENDENT(tree, i)); break; }}int WriteData(int fd, Data *data, int nr_cl){ register int i; if (write(fd, &SPLIT_ATT(data), sizeof(int)) == sizeof(int)) if (write(fd, &SPLIT_VALUE(data), sizeof(double)) == sizeof(double)) if (write(fd, &DERRORS(data), sizeof(double)) == sizeof(double)) if (write(fd, &TREE_ERRORS(data), sizeof(double)) == sizeof(double)) if (write(fd, 1+ACUM_CLASS_DIST(data), nr_cl * sizeof(double)) == nr_cl * sizeof(double)) if (write(fd, CLASS_DIST(data), (nr_cl + 1) * sizeof(double)) == (nr_cl+1) * sizeof(double)) if (write(fd, &NR_ATT(data), sizeof(int)) == sizeof(int)) { if (write(fd, &COLUMNS(data), sizeof(int)) == sizeof(int)) if (write(fd, &ROWS(data), sizeof(int)) == sizeof(int)) if (COLUMNS(data) * ROWS(data)) { for(i = 1; i <= ROWS(data); i++) if (write(fd, 1+(COEFICIENTS(data)[i]), COLUMNS(data) * sizeof(double)) != COLUMNS(data) * sizeof(double)) return FALSE; return TRUE; } } return FALSE;}void *ReadData(int fd, int nr_cl){ register int i; int split_att, nr_att, rows, columns; double *class_distribution, *acum_class_distr, errors, tree_errors, split_value; double **coefs = NULL; Data *data; class_distribution = dvector(0, nr_cl); acum_class_distr = dvector(1, nr_cl); if (read(fd, &split_att, sizeof(int)) == sizeof(int)) if (read(fd, &split_value, sizeof(double)) == sizeof(double)) if (read(fd, &errors, sizeof(double)) == sizeof(double)) if (read(fd, &tree_errors, sizeof(double)) == sizeof(double)) if (read(fd, 1+acum_class_distr, nr_cl * sizeof(double)) == nr_cl * sizeof(double)) if (read(fd, class_distribution, (nr_cl + 1) * sizeof(double)) == (nr_cl + 1) * sizeof(double)) if (read(fd, &nr_att, sizeof(int)) == sizeof(int)) if (read(fd, &columns, sizeof(int)) == sizeof(int)) if (read(fd, &rows, sizeof(int)) == sizeof(int)) { if (columns * rows) { coefs = dmatrix(1, rows, 1, columns); for(i = 1; i <= rows; i++) read(fd, 1+coefs[i], columns * sizeof(double)); } data = Make_Data(class_distribution, acum_class_distr, nr_att, errors,split_att,split_value); TREE_ERRORS(data) = tree_errors; COLUMNS(data) = columns; ROWS(data) = rows; COEFICIENTS(data) = coefs; return data; } return NULL;}int PruneTree(DomainInfo *domain, Data *data, double **distrib){ *distrib = CLASS_DIST(data); return dmajority(ACUM_CLASS_DIST(data), Ci_NrClasses(domain));}int RemoveCoefficients\(DomainInfo *domain, Tree *tree, int nr_att){ register int i, nr_new = 0, new_desc = 0; Data *data; int test_att; data = DATA(tree); test_att = SPLIT_ATT(data); nr_new = ROWS(data) > 0 ? ROWS(data) : 0; if (nr_new > 0) { for(i = c; i >= 0; i--) { if(vc[i] >= nr_att + nr_new) { vc[i] = 0; c--; } } } if (test_att > NrAttrs(domain)) vc[c++] = test_att; switch(tree->tipo) { case leaf: return 0; break; case split_discrete: for(i = 1; i <= NR_DESCENDENTS(tree); i++) new_desc += RemoveCoefficients(domain, DESCENDENT(tree, i),NR_ATT(data)+nr_new); break; case split_continuous: case split_linear: new_desc = RemoveCoefficients(domain, DESCENDENT(tree, 1),NR_ATT(data)+nr_new); new_desc += RemoveCoefficients(domain, DESCENDENT(tree, 2),NR_ATT(data)+nr_new); break; } if (nr_new) { for (i = 0; i <= c; i++) if (vc[i] > nr_att) return 0; ROWS(data) = 0; COLUMNS(data) = 0; COEFICIENTS(data) = NULL; } return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -