📄 fit.c

📁 Ripper 分类算法
💻 C
字号:
/****************************************************************************** fit.c - find a model that fits the data uses variant of incremental reduced error pruning (Furnkranz,ML94) main twiddles: a postpass to reduce MDL, and repetition of the learning******************************************************************************/#include <stdio.h>#include "ripper.h"BOOL Simplify=TRUE;double FP_cost=1.0;double FN_cost=1.0;double Max_decompression=64.0;int Max_sample=0;int Min_coverage=1;int Optimizations=2;/* locally used functions */static void fit1(vec_t *,symbol_t *,vec_t *);static void simplify(vec_t *,vec_t *,symbol_t *,vec_t *,int,vec_t *);static BOOL stop_refining(vec_t *,int,int);static BOOL reject_refinement(int,int,int,int,int);static double rule_value(vec_t *,symbol_t *,vec_t *,int,vec_t *);static BOOL reject_rule(vec_t *,symbol_t *,int,int,vec_t *,vec_t *,double *,double *);static DATA *adequate_subsample(symbol_t *,DATA *,DATA **);/*****************************************************************************//* iterate fit1 to get good hypothesis */vec_t *fit(data,cl)symbol_t *cl;vec_t *data;{    int i,trial_num;    vec_t *hyp;    /* hyp starts as empty ruleset */     hyp = new_vec(rule_t);    /* my algorithm */    if (!Simplify) {	fit1(data,cl,hyp);    } else {	fit1(data,cl,hyp);	trace(SUMM) {	    printf("// current ruleset for %s\n",cl->name);	    for (i=0; i<vmax(hyp); i++) {		printf("//    ");		print_rule(vref(rule_t,hyp,i));		printf("\n");	    }	    fflush(stdout);	}	/* iterate fit1  */	for (trial_num=0; trial_num<Optimizations; trial_num++) {	    trace (SUMM) {		printf("// optimizing rules for class %s---pass %d of %d\n",		       cl->name,trial_num+1,Optimizations);		fflush(stdout);	    }	    fit1(data,cl,hyp);	    reduce_dlen(hyp,cl,data);	    trace(SUMM) {		printf("// current ruleset for %s\n",cl->name);		for (i=0; i<vmax(hyp); i++) {		    printf("//    ");		    print_rule(vref(rule_t,hyp,i));		    printf("\n");		}		fflush(stdout);	    }	}    }    return hyp;}/* use a greedy strategy to find rules to cover the examples of class cl */static void fit1(data,cl,rules)vec_t *data;symbol_t *cl;vec_t *rules;  /* side-effected */{    ex_count_t all_p,all_n,rule_p,rule_n;    rule_t *curr_rule,tmp_rule;    vec_t data1,grow_data,prune_data;    vec_t 	*new_form,*new_deriv,	*old_form, *old_deriv,	*rev_form,*rev_deriv; /* revised */    BOOL last_rule_accepted,revising_rule;    int rule_num;    double new_val,old_val,rev_val,pr_val;    double comp,best_comp;    int i;    ex_count_t tmp_p,tmp_n;    char *operation;    int n_deleted;    DATA *working_sample;    static DATA *space=NULL;    comp = best_comp = 0.0;    copy_data(&data1,data);    count_class_freq(cl,&data1,&all_p,&all_n);    rule_num = 0;    n_deleted = 0;    last_rule_accepted = TRUE;    /* main loop: learn a set of rules */    while (all_p>0 && last_rule_accepted) {	/* determine the starting points for growing rules */	new_form = new_vec(gsym_t);	new_deriv = new_vec(deriv_step_t);	make_the_universe(cl,new_form);	if (rule_num >= vmax(rules)) {	    revising_rule = FALSE;	    curr_rule = &tmp_rule;	    curr_rule->conseq = cl;	} else {	    revising_rule = TRUE;	    curr_rule = vref(rule_t,rules,rule_num);	    rev_form = new_vec(gsym_t);	    rev_deriv = new_vec(deriv_step_t);	    copy_vec(gsym_t,rev_form,curr_rule->antec);	    copy_vec(deriv_step_t,rev_deriv,curr_rule->deriv);	    old_form = curr_rule->antec;    	    old_deriv = curr_rule->deriv;	}	/* don't bother revising a rule that's subsumed by an earlier one */	if (revising_rule) {	    count_examples(old_form,cl,&data1,&rule_p,&rule_n);	    curr_rule->nposx = rule_p;	    curr_rule->nnegx = rule_n;	    trace(LONG) {		printf("// old rule %d: ",rule_num+1);		print_rule(curr_rule); printf("\n");	    }	    if (rule_p==0 && rule_n==0) {		trace (SUMM) {		    printf("// r%d skipped [null coverage] ",rule_num+1);		    print_rule(curr_rule); printf("\n");		    fflush(stdout);		}		free_vec(gsym_t,new_form); free_vec(deriv_step_t,new_deriv);		free_vec(gsym_t,rev_form); free_vec(deriv_step_t,rev_deriv);		n_deleted++;		rule_num++;		continue;	    }	}	/* grow a new rule, and possibly a revised version of old rule */	if (!Simplify) {	    refine(new_form,new_deriv,cl,&data1);		} else {	    stratified_partition(&data1,3,&grow_data,&prune_data);	    trace(LONG) printf("// finding new rule %d\n",rule_num+1);	    refine(new_form,new_deriv,cl,&grow_data);	    /* remove examples covered by later rules from pruning set 	       since the current rule can't change their classification */	    for (i=rule_num+1; i<vmax(rules); i++) {		remove_covered_examples( 		   vref(rule_t,rules,i)->antec,&prune_data);	    }	    simplify(new_form,new_deriv,cl,rules,rule_num,&prune_data);	    if (revising_rule) {		trace(LONG) printf("// revising rule %d\n",rule_num+1);		refine(rev_form,rev_deriv,cl,&grow_data);		simplify(rev_form,rev_deriv,cl,rules,rule_num,&prune_data);	    }	}		/* decide which rule is best */	if (!revising_rule) {	    count_examples(new_form,cl,&data1,&rule_p,&rule_n);	    if (reject_rule(new_form,cl,rule_p,rule_n,rules,&data1,&comp,&best_comp)) {		last_rule_accepted = FALSE;		/* clean up new_form,deriv */		free_vec(gsym_t,new_form); free_vec(deriv_step_t,new_deriv);	    } else {		curr_rule->antec = new_form;		curr_rule->deriv = new_deriv;		curr_rule->nposx = rule_p;		curr_rule->nnegx = rule_n;		ext_vec(rule_t,rules,curr_rule);		remove_covered_examples(new_form,&data1);		trace(SUMM) {		    pr_val = 		      Simplify?			rule_value(new_form,cl,rules,rule_num,&prune_data):			rule_value(new_form,cl,rules,rule_num,&data1);		    printf("// r%d added [m=%d,v=%.3f] ",			   rule_num+1,vmax(&data1),pr_val);		    print_rule(curr_rule); printf("\n");		    fflush(stdout);		}		rule_num++;	    }	} else /* revising rule */ {	    working_sample = adequate_subsample(cl,&data1,&space);	    new_val = relative_compression(new_form,cl,rules,rule_num,working_sample);	    rev_val = relative_compression(rev_form,cl,rules,rule_num,working_sample);	    old_val = relative_compression(old_form,cl,rules,rule_num,working_sample);	    trace(LONG) {		printf("// value: old=%.3f rev=%.3f new=%.3f\n", 		       old_val,rev_val,new_val);	    }	    if (old_val >= new_val && old_val >= rev_val) {		count_examples(old_form,cl,&data1,&rule_p,&rule_n);		curr_rule->antec = old_form;		curr_rule->deriv = old_deriv;		curr_rule->nposx = rule_p;		curr_rule->nnegx = rule_n;		/* clean up new_form,deriv, rev_form,deriv */		free_vec(gsym_t,new_form); free_vec(deriv_step_t,new_deriv);		free_vec(gsym_t,rev_form); free_vec(deriv_step_t,rev_deriv);		trace(SUMM) { operation="retained"; pr_val = old_val; }	    } else if (rev_val>=new_val) {		count_examples(rev_form,cl,&data1,&rule_p,&rule_n);		curr_rule->antec = rev_form;		curr_rule->deriv = rev_deriv;		curr_rule->nposx = rule_p;		curr_rule->nnegx = rule_n;		/* clean up old_form,deriv, new_form,deriv */		free_vec(gsym_t,old_form); free_vec(deriv_step_t,old_deriv);		free_vec(gsym_t,new_form); free_vec(deriv_step_t,new_deriv);		trace(SUMM) { operation="revised"; pr_val = rev_val; }	    } else /* new_val is best */ {		count_examples(new_form,cl,&data1,&rule_p,&rule_n);		curr_rule->antec = new_form;		curr_rule->deriv = new_deriv;		curr_rule->nposx = rule_p;		curr_rule->nnegx = rule_n;		/* clean up old_form,deriv, rev_form,deriv */		free_vec(gsym_t,old_form); free_vec(deriv_step_t,old_deriv);		free_vec(gsym_t,rev_form); free_vec(deriv_step_t,rev_deriv);		trace(SUMM) { operation="replaced"; pr_val = new_val; }	    }	    remove_covered_examples(curr_rule->antec,&data1);	    count_class_freq(cl,&data1,&all_p,&all_n);	    trace(SUMM) {		printf("// r%d %s [m=%d,c=%.3f] ",		       rule_num+1,operation,vmax(&data1),pr_val);		print_rule(curr_rule); printf("\n");		fflush(stdout);	    }	    rule_num++;	}    } /* while allp>0 and last_rule_accepted */}/* use a greedy strategy to refine a rule */void refine(ref,deriv,cl,data)vec_t *ref, *deriv;  /* these are side-effected here */symbol_t *cl;vec_t *data;{    ex_count_t p,n,p1,n1;    DATA data1,*working_sample;    int i, best_i;    BOOL last_refinement_rejected;    double best_val, vali, prev_info;    vec_t *csi;    int priority;    static DATA *space=NULL;    trace(LONG) {	printf("// refining:  "); 	print_form(ref); 	printf("\n");    }    copy_data(&data1,data);    count_examples(ref,cl,&data1,&p,&n);    if (form_size(ref)>0) {	remove_uncovered_examples(ref,&data1);    }    prev_info = information(p,n);    last_refinement_rejected = FALSE;    while (!stop_refining(ref,p,n) && !last_refinement_rejected) {		/* if appropriate use a smaller sample */	working_sample = adequate_subsample(cl,&data1,&space);	/* find the best refinement */	priority = 0;	do {	    best_val = -MAXREAL; 	    compute_designated_refinements(ref,deriv,priority++);	    for (i=0; i<n_designated_refinements(); i++) {		vali = value(refinement(i),cl,working_sample,ref,prev_info,p,n);		if (vali > best_val) {		    best_val = vali; best_i = i; 		}	    }	} while (best_val<=0 && priority<=Max_priority);	/* decide if it's worth keeping */	count_examples(refinement(best_i),cl,&data1,&p1,&n1);	if (reject_refinement(priority-1,p,n,p1,n1)) {	    last_refinement_rejected = TRUE;	    trace(LONG) {		printf("// stopped refining (%g/%g) ",p1,n1);		print_form(refinement(best_i));		printf("\n");	    }	} else {	    copy_vec(gsym_t,ref,refinement(best_i));	    copy_vec(deriv_step_t,deriv,derivation(best_i));	    p = p1; n = n1; prev_info = information(p,n);	    remove_uncovered_examples(ref,&data1);	    trace(LONG) {		printf("// refined to (%g/%g): ",p,n); 		print_form(ref); 		printf("\n");	    }	} /* else !reject_refinement */    } /* while !stop_refining */}/* use greedy strategy to simplify a rule */static void simplify(form,deriv,cl,rules,rule_num,prune_data)vec_t *form, *deriv;  /* side-effected here */symbol_t *cl;vec_t *rules;int rule_num;vec_t *prune_data;{    int i, j, best_j;    ex_count_t p, n;    double val, best_val;    DATA *working_sample;    static DATA *space=NULL;    working_sample = adequate_subsample(cl,prune_data,&space);    best_val = rule_value(form,cl,rules,rule_num,working_sample);    trace(LONG) {	count_examples(form,cl,working_sample,&p,&n);	printf("// simplifying with %d examples\n",vmax(working_sample)); 	printf("// initial rule [%g/%g val=%.3f]: ",p,n,best_val);	print_form(form); 	printf("\n");    }    compute_designated_generalizations(cl,form,deriv);    while (n_designated_generalizations() > 0) {	/* find generalization better than best_val */	best_j = -1; 	for (j=0; j<n_designated_generalizations(); j++) {	    val = rule_value(generalization(j),cl,rules,rule_num,working_sample);	    trace(DBUG) {		count_examples(generalization(j),cl,working_sample,&p,&n);		printf("// gen %d [%g/%g val=%.3f]: ",j,p,n,val);		print_form(generalization(j));		printf("\n");	    }	    if (val >= best_val) {		best_val = val;		best_j = j;	    }		  	} /*for generalization j */	if (best_j == -1) {	    /* nothing better found */	    break; /* exit while loop */	} else {	    /* replace form/deriv with the current best */	    copy_vec(gsym_t,form,generalization(best_j));	    copy_vec(deriv_step_t,deriv,derivation(best_j));	    trace(LONG) {		count_examples(form,cl,working_sample,&p,&n);		printf("// simplified to [%g/%g val=%.3f] ",p,n,best_val); 		print_form(form); 		printf("\n");	    }	    compute_designated_generalizations(cl,form,deriv);	}     } /* while */    trace(LONG) {	val = rule_value(form,cl,rules,rule_num,working_sample);	count_examples(form,cl,working_sample,&p,&n);	printf("// final rule [%g/%g val=%.3f]: ",p,n,val); 	print_form(form); 	printf("\n");    }}/* decide when to stop refining and when to reject a refinement */static BOOL stop_refining(form,p,n)vec_t *form;int p,n;{    int i;    gsym_t *ci;    if (n!=0) return FALSE;    else {	for (i=0; i<vmax(form); i++)  {	    ci = vref(gsym_t,form,i);	    if (ci->nonterm && !Derives_emptystr[ci->nonterm->index]) {		return FALSE;	    }	}	return TRUE;    }}static BOOL reject_refinement(priority,pi,ni,pj,nj)int priority;int pi,ni,pj,nj;{    if (pj<Min_coverage) return TRUE;    else if (priority<Max_priority) return FALSE;    else if (ni==nj) return TRUE;    else return FALSE;}static BOOL reject_rule(form,cl,p,n,rules,data,pcomp,pbest_comp)vec_t *form;symbol_t *cl;int p,n;vec_t *rules;vec_t *data;double *pcomp,*pbest_comp;{    char reason[100];    double decomp;     BOOL ret;    int allp,alln;    /* enforce a description-length based cut-off */    (*pcomp) += relative_compression(form,cl,rules,vmax(rules),data);    if ((*pcomp) > (*pbest_comp)) {	(*pbest_comp) = (*pcomp);    }     if ((*pcomp) < (*pbest_comp) - Max_decompression) {	ret = TRUE;	sprintf(reason,"description length too large");    } else if (p==0) {	ret = TRUE;	sprintf(reason,"too few positive examples");    } else if (n*FP_cost/(p*FN_cost + n*FP_cost) >= 0.5) {	ret = TRUE;	sprintf(reason,"error rate too large");    } else {	ret = FALSE;	sprintf(reason,"");    }    trace (LONG) {	printf("// final rule compression is %g (best %g)\n",(*pcomp),(*pbest_comp));    }    trace (SUMM) {	if (ret) {	    printf("// rejected [%s]",reason);	    print_form(form);	    printf("\n");	}	fflush(stdout);    }    return ret;}static double rule_value(form,cl,rules,rule_num,data)vec_t *form;symbol_t *cl;vec_t *rules;int rule_num;vec_t *data;{    int i,P,N;    ex_count_t p,n, cov,uncov,fp,fn;    double num,denom;    double ret,errs;    if (rule_num >= vmax(rules)) {     	/* if adding a rule, use modified Furncrantz metric */	count_examples(form,cl,data,&p,&n);	num = (p+1)*FN_cost - (n+1)*FP_cost;	denom = p*FN_cost + (n+1)*FP_cost + 1;	ret =  num/denom;    } else { 	/* if revising, use accuracy of theory resulting	 * from replacing rules[rule_num] with form 	 * given that examples covered by later rules	 * are gone, we can just use errors committed by 	 * this single rule	 */	count_rule(form,cl,data,&cov,&uncov,&fp,&fn);	if (vmax(data)==0) ret=0.0; 	else ret = 1.0 - (fp*FP_cost+fn*FN_cost)/vmax(data);    }    return ret;}/* construct a subsample of data of size at most Max_sample */static DATA *adequate_subsample(symbol_t *cl,DATA *data,DATA **space){    DATA *subsample;    int i;    if (Max_sample>0 && vmax(data)>Max_sample) {	/* work with a subsample */	sample_data(cl,Max_sample,data,space);	subsample = (*space);    } else {	/* work with full dataset */	subsample = data;    }    return subsample;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -