📄 weak_rule.c
字号:
/****************************************************************************** weak_rule.c - find a single rule (a weak learning method) code stolen from fit.c basic interface is x_model(data), x_classify(c,inst), x_error_rate(c,data), print_x(c), fshow_x(c)******************************************************************************/#include <stdio.h>#include "ripper.h"#include "boost.h"#include "mdb.h"/* locally used functions */static void wrefine(vec_t *,vec_t *,symbol_t *,vec_t *);static void simplify(vec_t *,vec_t *,symbol_t *,vec_t *);static BOOL stop_refining(vec_t *,int,int);static BOOL reject_refinement(int,int,int,int,int);static double rule_value(vec_t *,symbol_t *,vec_t *);static DATA *adequate_subsample(symbol_t *,DATA *,DATA **);static weak_rule_t *wrule_model1(symbol_t *,DATA *);/*****************************************************************************/weak_rule_t *wrule_model(DATA *data){ symbol_t *cl; static int classes_ordered=0; double err,best_err; int i; weak_rule_t *wr,*best_wr; if (!classes_ordered) { count_classes(data); reorder_classes(data); classes_ordered = 1; } if (Find_rule_class_spec!=ALLCL && vmax(Classes) != 2) { warning("two-class problem is assumed!"); } if (Find_rule_class_spec==MINCL) { cl = vref(atom_t,Classes,0)->nom; return wrule_model1(cl,data); } else if (Find_rule_class_spec==MAXCL) { cl = vref(atom_t,Classes,1)->nom; return wrule_model1(cl,data); } else { /* bug: I'm being sloppy here about GC-ing the unused rules */ best_err = MAXREAL; best_wr = NULL; for (i=0; i<vmax(Classes); i++) { cl = vref(atom_t,Classes,i)->nom; wr = wrule_model1(cl,data); err = wrule_error_rate(wr,data); trace(SUMM) { printf("// class %s: best rule ",cl->name); print_rule(wr->rule); printf(" (%.2f%% error)\n",err*100.0); } if (err < best_err) { best_err = err; best_wr = wr; } } return best_wr; }}/* use a greedy strategy to find rules to cover the examples of class cl */static weak_rule_t *wrule_model1(symbol_t *cl,DATA *data){ double p; vec_t grow_data,prune_data; ex_count_t rule_p,rule_n; vec_t *new_form,*new_deriv; int i; ex_count_t tmp_p,tmp_n; rule_t *r; weak_rule_t *wr; DATA data1; /* determine the starting points for growing rules */ new_form = new_vec(gsym_t); new_deriv = new_vec(deriv_step_t); make_the_universe(cl,new_form); stratified_partition(data,3,&grow_data,&prune_data); trace(LONG) printf("// finding new rule:\n"); wrefine(new_form,new_deriv,cl,&grow_data); simplify(new_form,new_deriv,cl,&prune_data); count_examples(new_form,cl,data,&rule_p,&rule_n); /* save the rule */ r = newmem(1,rule_t); r->conseq = cl; r->antec = new_vec(gsym_t); copy_vec(gsym_t,r->antec,new_form); r->deriv = new_vec(deriv_step_t); copy_vec(deriv_step_t,r->deriv,new_deriv); r->nposx = rule_p; r->nnegx = rule_n; wr = newmem(1,weak_rule_t); wr->rule = r; trace(SUMM) { printf("// * "); print_rule(r); printf("\n"); fflush(stdout); } /* count class frequencies of uncovered examples -inefficiently */ copy_data(&data1,data); remove_covered_examples(new_form,&data1); wr->dist = newmem(vmax(Classes),ex_count_t); for (i=0; i<vmax(Classes); i++) { count_class_freq(vref(atom_t,Classes,i)->nom,&data1,&tmp_p,&tmp_n); if (tmp_n+tmp_p) { wr->dist[i] = tmp_p/(tmp_p + tmp_n); } else { wr->dist[i] = 1.0/vmax(Classes); } } return wr;}/* use a greedy strategy to refine a rule */static void wrefine(ref,deriv,cl,data)vec_t *ref, *deriv; /* these are side-effected here */symbol_t *cl;vec_t *data;{ ex_count_t p,n,p1,n1; DATA data1,*working_sample; int i, best_i; BOOL last_refinement_rejected; double best_val, vali, prev_info; vec_t *csi; int priority; static DATA *space=NULL; trace(LONG) { printf("// refining: "); print_form(ref); printf("\n"); } copy_data(&data1,data); count_examples(ref,cl,&data1,&p,&n); prev_info = information(p,n); last_refinement_rejected = FALSE; while (!stop_refining(ref,p,n) && !last_refinement_rejected) { /* if appropriate use a smaller sample */ working_sample = adequate_subsample(cl,&data1,&space); /* find the best refinement */ priority = 0; do { best_val = -MAXREAL; compute_designated_refinements(ref,deriv,priority++); for (i=0; i<n_designated_refinements(); i++) { Value_function = (n>0) ? &info_gain : &pos_cvg; vali = value(refinement(i),cl,working_sample,ref,prev_info,p,n); if (vali > best_val) { best_val = vali; best_i = i; } } } while (best_val<=0 && priority<=Max_priority); /* decide if it's worth keeping */ count_examples(refinement(best_i),cl,&data1,&p1,&n1); if (reject_refinement(priority-1,p,n,p1,n1)) { last_refinement_rejected = TRUE; trace(LONG) { printf("// stopped refining (%g/%g) ",p1,n1); print_form(refinement(best_i)); printf("\n"); } } else { copy_vec(gsym_t,ref,refinement(best_i)); copy_vec(deriv_step_t,deriv,derivation(best_i)); p = p1; n = n1; prev_info = information(p,n); remove_uncovered_examples(ref,&data1); trace(LONG) { printf("// refined to (%g/%g): ",p,n); print_form(ref); printf("\n"); } } /* else !reject_refinement */ } /* while !stop_refining */}/* use greedy strategy to simplify a rule */static void simplify(form,deriv,cl,prune_data)vec_t *form, *deriv; /* side-effected here */symbol_t *cl;vec_t *prune_data;{ int i, j, best_j; ex_count_t p, n; double val, best_val; DATA *working_sample; static DATA *space=NULL; working_sample = adequate_subsample(cl,prune_data,&space); best_val = rule_value(form,cl,working_sample); trace(LONG) { count_examples(form,cl,working_sample,&p,&n); printf("// simplifying with %d examples\n",vmax(working_sample)); printf("// initial rule [%g/%g val=%.3f]: ",p,n,best_val); print_form(form); printf("\n"); } compute_designated_generalizations(cl,form,deriv); while (n_designated_generalizations() > 0) { /* find generalization better than best_val */ best_j = -1; for (j=0; j<n_designated_generalizations(); j++) { val = rule_value(generalization(j),cl,working_sample); trace(DBUG) { count_examples(generalization(j),cl,working_sample,&p,&n); printf("// gen %d [%g/%g val=%.3f]: ",j,p,n,val); print_form(generalization(j)); printf("\n"); } if (val >= best_val) { best_val = val; best_j = j; } } /*for generalization j */ if (best_j == -1) { /* nothing better found */ break; /* exit while loop */ } else { /* replace form/deriv with the current best */ copy_vec(gsym_t,form,generalization(best_j)); copy_vec(deriv_step_t,deriv,derivation(best_j)); trace(LONG) { count_examples(form,cl,working_sample,&p,&n); printf("// simplified to [%g/%g val=%.3f] ",p,n,best_val); print_form(form); printf("\n"); } compute_designated_generalizations(cl,form,deriv); } } /* while */ trace(LONG) { val = rule_value(form,cl,working_sample); count_examples(form,cl,working_sample,&p,&n); printf("// final rule [%g/%g val=%.3f]: ",p,n,val); print_form(form); printf("\n"); }}/* decide when to stop refining and when to reject a refinement */static BOOL stop_refining(form,p,n)vec_t *form;int p,n;{ int i; gsym_t *ci; if (n!=0) return FALSE; else { for (i=0; i<vmax(form); i++) { ci = vref(gsym_t,form,i); if (ci->nonterm && !Derives_emptystr[ci->nonterm->index]) { return FALSE; } } return TRUE; }}static BOOL reject_refinement(priority,pi,ni,pj,nj)int priority;int pi,ni,pj,nj;{ if (pj<Min_coverage) return TRUE; else if (priority<Max_priority) return FALSE; else if (ni==nj) return TRUE; else return FALSE;}static double rule_value(form,cl,data)vec_t *form;symbol_t *cl;vec_t *data;{ int i,P,N; ex_count_t p,n, cov,uncov,fp,fn; double ret; count_rule(form,cl,data,&cov,&uncov,&fp,&fn); if (vmax(data)==0) ret=0.0; else ret = 1.0 - (fp*FP_cost+fn*FN_cost)/vmax(data); return ret;}/* construct a subsample of data of size at most Max_sample */static DATA *adequate_subsample(symbol_t *cl,DATA *data,DATA **space){ DATA *subsample; int i; if (Max_sample>0 && vmax(data)>Max_sample) { /* work with a subsample */ sample_data(cl,Max_sample,data,space); subsample = (*space); } else { /* work with full dataset */ subsample = data; } return subsample;}/*****************************************************************************/symbol_t *wrule_classify(weak_rule_t *wr,vec_t *inst){ double r,s; int i; if (form_covers(wr->rule->antec,inst)) { return wr->rule->conseq; } else { r = ((double)(random() % HUGEINT)) / ((double)HUGEINT); s = 0; for (i=0; i<vmax(Classes); i++) { s += wr->dist[i]; if (r<=s) { return vref(atom_t,Classes,i)->nom; } } assert(TRUE); /* random choice failure */ return NULL; }}/* return EXPECTED error rate of rule on data */double wrule_error_rate(weak_rule_t *wr,DATA *data){ int i; example_t *exi; ex_count_t cov, uncov, fp, fn, errors, total; double expected_uncov_error; expected_uncov_error = 1.0; for (i=0; i<vmax(Classes); i++) { expected_uncov_error -= wr->dist[i]*wr->dist[i]; } count_rule(wr->rule->antec,wr->rule->conseq,data,&cov,&uncov,&fp,&fn); total = cov + uncov; errors = fp + uncov*expected_uncov_error; return errors/total;}void fprint_wrule(FILE *fp,weak_rule_t *wr){ int i; fprint_rule(fp,wr->rule); fprintf(fp," ["); for (i=0; i<vmax(Classes); i++) { fprintf(fp,"%.4f%c", wr->dist[i], i==vmax(Classes)-1 ? ']' : ' '); }}void fshow_wrule(FILE *fp,weak_rule_t *wr){ int i; rule_t *r; gsym_t *gsym; r = wr->rule; fprintf(fp,"%s",r->conseq->name); fprintf(fp," %g %g IF",r->nposx,r->nnegx); for (i=0; i<vmax(r->antec); i++) { gsym = vref(gsym_t,r->antec,i); if (!gsym->nonterm) { fprintf(fp," "); fshow_gsym(fp,gsym); } } fprintf(fp," ;"); for (i=0; i<vmax(Classes); i++) { fprintf(fp," %.4f",wr->dist[i]); } fprintf(fp,".\n");}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -