📄 boost.c

📁 Ripper 分类算法
💻 C
字号:
/****************************************************************************** boost.c - boost ripper's hypotheses ******************************************************************************/#include <stdio.h>#include <math.h>#include "ripper.h"#include "boost.h"#include "mdb.h"/*****************************************************************************/static concept_kind_t weak_learner_hypkind(char *);static symbol_t *ith_weak_learner_classify(boosted_concept_t*,vec_t *,int);static symbol_t *last_weak_learner_classify(boosted_concept_t*,vec_t *);static BOOL add_weak_hyp(DATA *,boosted_concept_t *,double *);/* basic adaboost procedure    -- test data is used only for traces */boosted_concept_t *boost_model(DATA *train_data,DATA *test_data){    ex_count_t tot_weight,tot_test_weight,new_tot,z;    example_t *exi;    int i,j,t;    double epsilon, beta;    double train_err,test_err;    boosted_concept_t *hyp;    hyp = newmem(1,boosted_concept_t);    hyp->hyp_kind = weak_learner_hypkind(Weak_learner);    hyp->weak_hyp = NULL;    hyp->wt = new_vec(double);    /* let tot_weight be total weight of examples */    tot_weight = 0;    for (i=0; i<vmax(train_data); i++) {	exi = vref(example_t,train_data,i); 	tot_weight += exi->wt;    }    /* now boost up to N_boost times */    for (t=0; t<N_boost; t++) {	/* call weak learner, add a new hypothesis to hyp 	 *   return also train error epsilon and test error 	 *   and abort if epsilon>0.5	 */	if (!add_weak_hyp(train_data,hyp,&epsilon)) {	    trace(SUMM) {		printf("// aborting boost with epsilon = %.3f\n",epsilon);	    }	    break;	}	/* compute update factor for new hypothesis */	beta = epsilon / (1.0 - epsilon);	/* print some info */	trace(SUMM) {	    printf("// weak hyp %d: train err = %.2f%%, beta=%.3f\n",		   t+1,epsilon,beta);	}	/* update the 'distribution' -- ie example weights */	new_tot = 0;	for (i=0; i<vmax(train_data); i++) {	    exi = vref(example_t,train_data,i); 	    if (last_weak_learner_classify(hyp,exi->inst) == exi->lab.nom) {		exi->wt *= beta;	    }	    new_tot += exi->wt; 	}	z = tot_weight/new_tot;	for (i=0; i<vmax(train_data); i++) {	    exi = vref(example_t,train_data,i); 	    exi->wt *= z;	}	trace(SUMM) {	    train_err = boost_error_rate(hyp,train_data);	    if (test_data && vmax(test_data) > 0) {		test_err = boost_error_rate(hyp,test_data);	    } else {		test_err = 0.0;	    }	    /* print a message */	    printf("Boost %d: train+test errors = %.2f%% + %.2f%%\n\n",		   t+1,train_err*100,test_err*100); 	}    }    trace(SUMM) {	if (t<N_boost && test_data && vmax(test_data) > 0) {	    test_err = boost_error_rate(hyp,test_data);	    printf("\nFinal boosted test error: %.3f%%\n",test_err*100);	}    }    return hyp;}/* classify with boosted hypothesis */symbol_t *boost_classify(boosted_concept_t *hyp,vec_t *inst){    int k,clx,bestk;    double w,bestw;    static double *pred;    if (pred==NULL) pred = newmem(vmax(Classes),double);	    /* compute prediction for instance */    for (k=0; k<vmax(Classes); k++) pred[k]=0.0;    for (k=0; k<vmax(hyp->weak_hyp); k++) {	clx =  ith_weak_learner_classify(hyp,inst,k)->index;	w = *vref(double,hyp->wt,k);	pred[clx] += w;    }    bestk = -1; bestw = -1.0;    for (k=0; k<vmax(Classes); k++) {	if (pred[k] > bestw) {	    bestw = pred[k];	    bestk = k;	}    }    return vref(atom_t,Classes,bestk)->nom;}/* error rate of boosted hypothesis */double boost_error_rate(boosted_concept_t *c,DATA *data){    example_t *exp,*exmax;    double err;    double tot;    err = tot = 0;    exmax = vbase(example_t,data)+vmax(data);    for (exp=vbase(example_t,data); exp<exmax; exp++) {	tot += exp->wt;	if (boost_classify(c,exp->inst)!=exp->lab.nom) err += exp->wt;    }    if (tot==0) return 0.0;    else return err/tot;}/*************************************************************************** interface to weak learners and underlying boosted representation ***************************************************************************//* map string naming weak learner to its hypothesis */static concept_kind_t weak_learner_hypkind(char *weak){    if (streq(Weak_learner,"ripper")) {	return RULESET;    } else if (streq(Weak_learner,"findrule")) {	return RULE;    } else {	fatal("unimplemented weak learner '%s'\n",Weak_learner);		return 0;    }}/* add a new weak hypothesis to a set of boosted hypotheses */static BOOL add_weak_hyp(DATA *data,boosted_concept_t *hyp,double *epsilon){    double w;    int sz;    if (streq(Weak_learner,"ripper")) {	weak_ruleset_t *c;	/* reduce level of tracing for the duration */	Trace_level--;	c = model(data);	Trace_level++;	*epsilon = error_rate(c,data);	if (*epsilon > 0.5) {	    return FALSE;	} else {	    if (hyp->weak_hyp==NULL) {		hyp->weak_hyp = new_vec(weak_ruleset_t);	    }	    ext_vec(weak_ruleset_t,hyp->weak_hyp,c);	    /* subtle point: after this first round, I assume that	       the class ordering is fixed; so prevent the	       learner from re-ordering the classes ever again	       */	    Class_ordering = GIVEN;	}    } else if (streq(Weak_learner,"findrule")) {	weak_rule_t *wr;	wr = wrule_model(data);	*epsilon = wrule_error_rate(wr,data);	if (*epsilon > 0.5) {	    return FALSE;	} else {	    if (hyp->weak_hyp==NULL) {		hyp->weak_hyp = new_vec(weak_rule_t);	    }	    ext_vec(weak_rule_t,hyp->weak_hyp,wr);	}    } else {	fatal("unimplemented weak learner '%s'\n",Weak_learner);    }    /* if we get here a new hyp has been added so add the assoc. weight */    w = Log2( (1.0 - *epsilon) / *epsilon );    ext_vec(double,hyp->wt,&w);    return TRUE;}/* classify an example with the LAST learned weak hypothesis */static symbol_t *last_weak_learner_classify(boosted_concept_t *hyp,vec_t *inst){    int k;        assert(hyp && hyp->weak_hyp && vmax(hyp->weak_hyp)>0);    k = vmax(hyp->weak_hyp)-1;    return ith_weak_learner_classify(hyp,inst,k);}/* classify an example with the ith weak hypothesis in a set */static symbol_t *ith_weak_learner_classify(boosted_concept_t *hyp,vec_t *inst,int k){    weak_ruleset_t *c;    weak_rule_t *wr;    assert(hyp && hyp->weak_hyp && vmax(hyp->weak_hyp)>k);        switch (hyp->hyp_kind) {      case RULESET:	c = vref(weak_ruleset_t,hyp->weak_hyp,k);	return classify(c,inst);	break;      case RULE:	wr = vref(weak_rule_t,hyp->weak_hyp,k);	return wrule_classify(wr,inst);	break;      default:	fatal("ith_...: hypothesis kind %d unsupported",hyp->hyp_kind);	return NULL;    }}/* print a set of boosted concepts */void fprint_boost_concept(FILE *fp,boosted_concept_t *hyp){    int i;    double w;    weak_ruleset_t *c;    weak_rule_t *wr;    switch (hyp->hyp_kind) {      case RULESET:	fprintf(fp,"== Boosted rulesets:\n");	for (i=0; i<vmax(hyp->weak_hyp); i++) {	    c = vref(weak_ruleset_t,hyp->weak_hyp,i);	    w = *vref(double,hyp->wt,i);	    fprintf(fp,"\nWeight %.3f:\n",w);	    fprint_concept(fp,c);	}	fprintf(fp,"== End boosted rulesets:\n");	break;      case RULE:	fprintf(fp,"== Boosted rules:\n");	for (i=0; i<vmax(hyp->weak_hyp); i++) {	    wr = vref(weak_rule_t,hyp->weak_hyp,i);	    w = *vref(double,hyp->wt,i);	    fprintf(fp,"Weight %.3f: ",w);	    fprint_wrule(fp,wr);	    fprintf(fp,"\n");	}	fprintf(fp,"== End boosted rules:\n");	break;      default:	fatal("print: hypothesis kind %d unsupported",hyp->hyp_kind);    }}void fshow_boost_concept(FILE *fp,boosted_concept_t *hyp){    int i;    double w;    weak_ruleset_t *c;    weak_rule_t *wr;    switch (hyp->hyp_kind) {      case RULESET:	for (i=0; i<vmax(hyp->weak_hyp); i++) {	    c = vref(weak_ruleset_t,hyp->weak_hyp,i);	    w = *vref(double,hyp->wt,i);	    fprintf(fp,"%.g\n",w);	    fshow_concept(fp,c);	}	break;      case RULE:	for (i=0; i<vmax(hyp->weak_hyp); i++) {	    wr = vref(weak_rule_t,hyp->weak_hyp,i);	    w = *vref(double,hyp->wt,i);	    fprintf(fp,"%.g\n",w);	    fshow_wrule(fp,wr);	}	break;      default:	fatal("fshow: hypothesis kind %d unsupported",hyp->hyp_kind);    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -