📄 boost_model.c

📁 Ripper 分类算法
💻 C
字号:
/****************************************************************************** boost_model.c - use fit() on each class to find a model that fits the data******************************************************************************/#include <stdio.h>#include "ripper.h"#include "boost.h"
#include "vector.h"/*****************************************************************************/boosted_concept_t *boost_model(data)vec_t *data;{    boosted_concept_t *hyp;     rule_t *rulej;    count_classes(data);    reorder_classes(data);    /* do the learning */    boost1(data);    return hyp;}/* locally used functions */static void refine(vec_t *,vec_t *,symbol_t *,vec_t *);static void simplify(vec_t *,vec_t *,symbol_t *,vec_t *,int,vec_t *);static BOOL stop_refining(vec_t *,int,int);static BOOL reject_refinement(int,int,int,int,int);static double rule_value(vec_t *,symbol_t *,vec_t *,int,vec_t *);static BOOL reject_rule(vec_t *,symbol_t *,int,int,vec_t *,vec_t *);static DATA *adequate_subsample(symbol_t *,DATA *,DATA **);/*****************************************************************************//* use a greedy strategy to find rules to cover the examples of class cl */static boosted_concept_t boost1(data)vec_t *data;vec_t *rules;  /* side-effected */{    symbol_t *cl;    ex_count_t all_p,all_n,rule_p,rule_n;    rule_t *curr_rule,tmp_rule;    vec_t data1,grow_data,prune_data;    vec_t 	*new_form,*new_deriv,	*old_form, *old_deriv,	*rev_form,*rev_deriv; /* revised */    BOOL last_rule_accepted,revising_rule;    int rule_num;    double new_val,old_val,rev_val,pr_val;    double comp,best_comp;    int i;    ex_count_t tmp_p,tmp_n;    char *operation;    int n_deleted;    DATA *working_sample;    static DATA *space=NULL;    /* work out what class is */    cl = vref(atom_t,Classes,0)->nom;    if (vmax(Classes)!=2) {	fatal("boosting only implemented now for two classes");    }    copy_data(&data1,data);    last_rule_accepted = TRUE;    /* main loop: learn a bunch of rules */    while (all_p>0 && last_rule_accepted) {	/* determine the starting points for growing rules */	new_form = new_vec(gsym_t);	new_deriv = new_vec(deriv_step_t);	make_the_universe(cl,new_form);	/* allocate space to put next rule */	curr_rule = &tmp_rule;	curr_rule->conseq = cl;	/* grow the rule */	stratified_partition(&data1,3,&grow_data,&prune_data);	trace(LONG) printf("// finding new rule %d\n",rule_num+1);	refine(new_form,new_deriv,cl,&grow_data);	simplify(new_form,new_deriv,cl,rules,rule_num,&prune_data);		count_examples(new_form,cl,&data1,&rule_p,&rule_n);	if (reject_rule(new_form,cl,rule_p,rule_n,rules,&data1)) {		last_rule_accepted = FALSE;		/* clean up new_form,deriv */		free_vec(gsym_t,new_form); 		free_vec(deriv_step_t,new_deriv);	} else {	    curr_rule->antec = new_form;	    curr_rule->deriv = new_deriv;	    curr_rule->nposx = rule_p;	    curr_rule->nnegx = rule_n;	    ext_vec(rule_t,rules,curr_rule);	    /* replace this with the boosting stuff */	    remove_covered_examples(new_form,&data1);	    trace(SUMM) {		pr_val = rule_value(new_form,cl,rules,rule_num,&prune_data):		printf("// r%d added [m=%d,v=%.3f] ",		       rule_num+1,vmax(&data1),pr_val);		print_rule(curr_rule); printf("\n");		fflush(stdout);	    }	}    } /* while allp>0 and last_rule_accepted */}/* use a greedy strategy to refine a rule */static void refine(ref,deriv,cl,data)vec_t *ref, *deriv;  /* these are side-effected here */symbol_t *cl;vec_t *data;{    ex_count_t p,n,p1,n1;    DATA data1,*working_sample;    int i, best_i;    BOOL last_refinement_rejected;    double best_val, vali, prev_info;    vec_t *csi;    int priority;    static DATA *space=NULL;    trace(LONG) {	printf("// refining:  "); 	print_form(ref); 	printf("\n");    }    copy_data(&data1,data);    count_examples(ref,cl,&data1,&p,&n);    prev_info = information(p,n);    last_refinement_rejected = FALSE;    while (!stop_refining(ref,p,n) && !last_refinement_rejected) {		/* if appropriate use a smaller sample */	working_sample = adequate_subsample(cl,&data1,&space);	/* find the best refinement */	priority = 0;	do {	    best_val = -MAXREAL; 	    compute_designated_refinements(ref,deriv,priority++);	    for (i=0; i<n_designated_refinements(); i++) {		//vali = value(refinement(i),cl,working_sample,ref,prev_info);		if (vali > best_val) {		    best_val = vali; best_i = i; 		}	    }	} while (best_val<=0 && priority<=Max_priority);	/* decide if it's worth keeping */	count_examples(refinement(best_i),cl,&data1,&p1,&n1);	if (reject_refinement(priority-1,p,n,p1,n1)) {	    last_refinement_rejected = TRUE;	    trace(LONG) {		printf("// stopped refining (%g/%g) ",p1,n1);		print_form(refinement(best_i));		printf("\n");	    }	} else {	    copy_vec(gsym_t,ref,refinement(best_i));	    copy_vec(deriv_step_t,deriv,derivation(best_i));	    p = p1; n = n1; prev_info = information(p,n);	    remove_uncovered_examples(ref,&data1);	    trace(LONG) {		printf("// refined to (%g/%g): ",p,n); 		print_form(ref); 		printf("\n");	    }	} /* else !reject_refinement */    } /* while !stop_refining */}/* use greedy strategy to simplify a rule */static void simplify(form,deriv,cl,rules,rule_num,prune_data)vec_t *form, *deriv;  /* side-effected here */symbol_t *cl;vec_t *rules;int rule_num;vec_t *prune_data;{    int i, j, best_j;    ex_count_t p, n;    double val, best_val;    DATA *working_sample;    static DATA *space=NULL;    working_sample = adequate_subsample(cl,prune_data,&space);    best_val = rule_value(form,cl,rules,rule_num,working_sample);    trace(LONG) {	count_examples(form,cl,working_sample,&p,&n);	printf("// simplifying with %d examples\n",vmax(working_sample)); 	printf("// initial rule [%g/%g val=%.3f]: ",p,n,best_val);	print_form(form); 	printf("\n");    }    compute_designated_generalizations(cl,form,deriv);    while (n_designated_generalizations() > 0) {	/* find generalization better than best_val */	best_j = -1; 	for (j=0; j<n_designated_generalizations(); j++) {	    val = rule_value(generalization(j),cl,rules,rule_num,working_sample);	    trace(DBUG) {		count_examples(generalization(j),cl,working_sample,&p,&n);		printf("// gen %d [%g/%g val=%.3f]: ",j,p,n,val);		print_form(generalization(j));		printf("\n");	    }	    if (val >= best_val) {		best_val = val;		best_j = j;	    }		  	} /*for generalization j */	if (best_j == -1) {	    /* nothing better found */	    break; /* exit while loop */	} else {	    /* replace form/deriv with the current best */	    copy_vec(gsym_t,form,generalization(best_j));	    copy_vec(deriv_step_t,deriv,derivation(best_j));	    trace(LONG) {		count_examples(form,cl,working_sample,&p,&n);		printf("// simplified to [%g/%g val=%.3f] ",p,n,best_val); 		print_form(form); 		printf("\n");	    }	    compute_designated_generalizations(cl,form,deriv);	}     } /* while */    trace(LONG) {	val = rule_value(form,cl,rules,rule_num,working_sample);	count_examples(form,cl,working_sample,&p,&n);	printf("// final rule [%g/%g val=%.3f]: ",p,n,val); 	print_form(form); 	printf("\n");    }}/* decide when to stop refining and when to reject a refinement */static BOOL stop_refining(form,p,n)vec_t *form;int p,n;{    int i;    gsym_t *ci;    if (n!=0) return FALSE;    else {	for (i=0; i<vmax(form); i++)  {	    ci = vref(gsym_t,form,i);	    if (ci->nonterm && !Derives_emptystr[ci->nonterm->index]) {		return FALSE;	    }	}	return TRUE;    }}static BOOL reject_refinement(priority,pi,ni,pj,nj)int priority;int pi,ni,pj,nj;{    if (pj<Min_coverage) return TRUE;    else if (priority<Max_priority) return FALSE;    else if (ni==nj) return TRUE;    else return FALSE;}static BOOL reject_rule(form,cl,p,n,rules,data)vec_t *form;symbol_t *cl;int p,n;vec_t *rules;vec_t *data;{    char reason[100];    double decomp;     BOOL ret;    int allp,alln;    if (p==0) {	ret = TRUE;	sprintf(reason,"too few positive examples");    } else if (n*FP_cost/(p*FN_cost + n*FP_cost) >= 0.5) {	ret = TRUE;	sprintf(reason,"error rate too large");    } else {	ret = FALSE;	sprintf(reason,"");    }    trace (SUMM) {	if (ret) {	    printf("// rejected [%s]",reason);	    print_form(form);	    printf("\n");	}	fflush(stdout);    }    return ret;}static double rule_value(form,cl,rules,rule_num,data)vec_t *form;symbol_t *cl;vec_t *rules;int rule_num;vec_t *data;{    int i,P,N;    ex_count_t p,n, cov,uncov,fp,fn;    double num,denom;    double ret,errs;    if (rule_num >= vmax(rules)) {     	/* if adding a rule, use modified Furncrantz metric */	count_examples(form,cl,data,&p,&n);	num = (p+1)*FN_cost - (n+1)*FP_cost;	denom = p*FN_cost + (n+1)*FP_cost + 1;	ret =  num/denom;    } else { 	/* if revising, use accuracy of theory resulting	 * from replacing rules[rule_num] with form 	 * given that examples covered by later rules	 * are gone, we can just use errors committed by 	 * this single rule	 */	count_rule(form,cl,data,&cov,&uncov,&fp,&fn);	if (vmax(data)==0) ret=0.0; 	else ret = 1.0 - (fp*FP_cost+fn*FN_cost)/vmax(data);    }    return ret;}/* construct a subsample of data of size at most Max_sample */static DATA *adequate_subsample(symbol_t *cl,DATA *data,DATA **space){    DATA *subsample;    int i;    if (Max_sample>0 && vmax(data)>Max_sample) {	/* work with a subsample */	sample_data(cl,Max_sample,data,space);	subsample = (*space);    } else {	/* work with full dataset */	subsample = data;    }    return subsample;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -