/****************************************************************************** boost-main.c - main driver program for experiments with boosting largely pilfered from ripper-main.c******************************************************************************/#include <stdio.h>#include <math.h>#include "ripper.h"#include "boost.h"/*****************************************************************************/static double binomial_std_err(double,int);/* interface to cross-validation*//* for boosting routine */int N_boost=10;char *Weak_learner="ripper";static boosted_concept_t *Hyp=NULL;static void train_boost(vec_t *data){ Hyp = boost_model(data,NULL); printf("Hypothesis:\n"); print_boost_concept(Hyp); }static double test_boost(vec_t *data){ return boost_error_rate(Hyp,data);}/******************************************************************************/char *Program="boost";char *Help_str[] = { "syntax: boost [options] filestem", " learn a ruleset from examples", "", "options are:", " -v# set trace level to #, which must be 0, 1, 2, or 3", " -kN estimate error rate by N-fold cross-validation", " -l estimate error rate via leave-one-out method", " (ie N-fold cross-validation where N is training set size)", " -g str use grammar file str.gram (rather than stem.gram)", " -s read data file from stdin", " -G print completed grammar and exit", " -N print names file and exit", " -R randomize operation", " -I# discretize continuous attributes into # equal-frequency segments", " -!ns allow/disallow negative tests in nominal/set valued attributes", " ('negative' tests are != for nominals, !~ for sets)", " -B # boost # times", " -W str use weak learner named 'str'", " current options: ripper, findrule", "", "findrule options:", " -f+ find rules for first (usually minority) class only", " -f- find rules for second (usually majority) class only", " -fa find rules for all classes (default)", " -Vg use information gain to choose rule specialization", " -Ve use one-sided entropy 
reduction to choose rule specialization", "", "ripper options:", " -n expect noisy data (default)", " -c expect clean data", " -aORD arrange classes in order ORD where ORD must be either", " +freq---order by increasing frequency (the default)", " -freq---order by decreasing frequency", " mdl---order by description length of ruleset", " given---order classes as listed in .names file", " -M # use subsamples of size # in choosing tests", " -O # perform # optimization passes", " -D # set max decompression", " -S # simplify hypothesis more (>1) or less (<1)", " -L # set loss ratio to # (ratio=false-pos-cost/false-neg-cost),", " where a 'positive' example is an instance of the minority class", " -A add redundant features to rules ", " -F # force rules to cover at least # examples", "", "default options: ", " boost -v0 -B10 -Wripper -n -a+freq -O2 -D64 -!n -S1.0 -L1.0 -F2 -M+infty <filestem>", "", NULL};/*****************************************************************************/class_spec_t Find_rule_class_spec = ALLCL;int main(argc,argv)int argc;char *argv[];{ int o; char *charp; vec_t *train_data,*test_data; char *stem="foo", *gstem=NULL; boosted_concept_t *hyp; BOOL print_grammar_flag=FALSE; BOOL print_names_flag=FALSE; BOOL crossval=FALSE; BOOL leave_one_out=FALSE; int folds; double loss_ratio; FILE *out_fp; BOOL randomize_seed=FALSE; BOOL use_stdin=FALSE; double tm,err; set_trace_level(NONE); folds=5; while ((o=getopt(argc,argv,"hv:nck:la:g:tsO:GNRL:D:I:!:S:M:AF:B:W:f:")) !=EOF) { switch (o) { case 'f': switch(optarg[0]) { case '+': Find_rule_class_spec = MINCL; break; case '-': Find_rule_class_spec = MAXCL; break; default: Find_rule_class_spec = ALLCL; break; } printf("option: find rules for class '%s'\n",optarg); break; case 'V': switch(optarg[0]) { case 'g': Value_function = &info_gain; break; case 'e': Value_function = &entropy_reduction1; break; default: warning("bad value for -V option\n"); } printf("option: value function is '%s'\n",optarg); break; case 
'v': set_trace_level(atoi(optarg)); printf("option: trace level set to %d\n",trace_level()); break; case 'n': Simplify = TRUE; printf("option: data is noisy\n"); break; case 'c': Simplify = FALSE; printf("option: data is clean\n"); break; case 'k': crossval = TRUE; folds = atoi(optarg); printf("option: %d-fold cross-validation\n",folds); break; case 'l': crossval = TRUE; folds = 0; printf("option: leave-one-out cross-validation\n"); break; case 'a': switch (optarg[0]) { case '+': Class_ordering = INCFREQ; printf("option: find rules for least frequent classes first\n"); break; case '-': Class_ordering = DECFREQ; printf("option: find rules for most frequent classes first\n"); break; case 'g': case 'G': Class_ordering = GIVEN; printf("option: use class ordering given in names file\n"); break; case 'm': case 'M': Class_ordering = MDL; printf("option: choose ordering of classes using MDL\n"); break; case 'u': case 'U': Class_ordering = UNORDERED; printf("option: unordered classes\n"); break; default: warning("bad -o argument: use '+freq', '-freq', 'given' or 'mdl'"); break; } break; case 'g': gstem = newmem(strlen(optarg)+1,char); strcpy(gstem,optarg); printf("option: use grammar file '%s.gram'\n",gstem); break; case 's': use_stdin = TRUE; printf("option: read data from stdin\n"); break; case 'O': Optimizations = atoi(optarg); printf("option: optimize %d time(s)\n",Optimizations); break; case 'S': Simplify=TRUE; MDL_theory_factor = atof(optarg); printf("option: multiply coding cost of theory by %g\n",MDL_theory_factor); break; case 'D': Max_decompression = atof(optarg); printf("option: max decompression is %g\n",Max_decompression); break; case '!': for (charp=optarg; *charp; charp++) { if (*charp=='n' || *charp=='N') Eq_negations = TRUE; if (*charp=='s' || *charp=='S') Set_negations = TRUE; } printf("option: %s allow inequality tests for nominal attributes\n", Eq_negations? 
"will" : "will not"); printf("option: %s allow inequality tests for set-valued attributes\n", Set_negations? "will" : "will not"); break; case 'R': randomize_seed=TRUE; printf("option: will set random seed from clock\n"); break; case 'L': loss_ratio = atof(optarg); FP_cost = 2.0*loss_ratio/(loss_ratio+1.0); FN_cost = 2.0/(loss_ratio+1.0); printf("option: ratio of cost of FP to cost of FN is %g:%g\n", FP_cost,FN_cost); break; case 'I': N_intervals = atoi(optarg); if (N_intervals < 0) fatal("argument to -i must be a positive integer or 0"); if (N_intervals==0) printf("option: no discretization\n",N_intervals); else printf("option: discretize into %d intervals\n",N_intervals); break; case 'G': /* printf("option: will echo grammar\n"); */ print_grammar_flag = TRUE; break; case 'N': /* printf("option: will echo names file\n"); */ print_names_flag = TRUE; break; case 'M': Max_sample = atoi(optarg); printf("option: max subsample is %d\n",Max_sample); break; case 'A': Add_redundancy = TRUE; printf("option: add redundant conditions to rules\n"); break; case 'F': Min_coverage = max(atoi(optarg),1); printf("option: rules must cover %d example(s)\n",Min_coverage); break; case 'B': N_boost = atoi(optarg); printf("option: boost %d times\n",N_boost); break; case 'W': Weak_learner = optarg; printf("option: use weak learner %s\n",Weak_learner); break; case 'h': case '?': default: give_help(); if (o!='h') fatal("option not implemented"); else exit(0); } } /* print out parameter settings */ trace(SUMM) { printf("// parameter settings:\n"); printf("// boost %d times\n",N_boost); printf("// weak learner is %s\n",Weak_learner); if (N_intervals!=0) printf("// discretize into %d intervals\n",N_intervals); else printf("// no discretization\n"); if (Max_sample!=0) printf("// use subsamples of %d examples\n",Max_sample); printf("// %s != tests in rules\n", Eq_negations?"allow":"disallow"); printf("// %s !~ tests in rules\n", Set_negations?"allow":"disallow"); printf("// ordering classes by 
"); if (Class_ordering==UNORDERED) { printf("// not ordering classes\n"); } else { printf("// ordering classes by "); switch (Class_ordering) { case GIVEN: printf("ordering given in .names file\n"); break; case MDL: printf("MDL heuristic\n"); break; case INCFREQ: printf("increasing frequency\n"); break; case DECFREQ: printf("decreasing frequency\n"); break; default: printf("???\n"); break; } } printf("// expect %s data\n",Simplify?"noisy":"clean"); if (Simplify) { printf("// optimize %d time(s)\n",Optimizations); printf("// max decompression is %g bits\n", Max_decompression); } printf("// rules cover >= %d examples\n",Min_coverage); fflush(stdout); } if (optind<argc) { stem = argv[optind++]; if (gstem==NULL) gstem=stem; } else { give_help(); fatal("no file stem specified"); } if (optind<argc) { warning("not all arguments were used: %s ...",argv[optind]); } start_clock(); trace(SUMM) { printf("// timing loading...\n"); fflush(stdout); } ld_names(add_ext(stem,".names")); if (use_stdin) { train_data = ld_data((char *)NULL); } else { train_data = ld_data(add_ext(stem,".data")); } if (!train_data) { fatal("no training data!"); } if (!crossval) test_data = ld_data(add_ext(stem,".test")); else test_data = NULL; if (!ld_grammar(add_ext(gstem,".gram"),gstem==stem,train_data)) { fatal("error in loading grammar"); } tm = elapsed_time(); trace(SUMM) { if (!test_data) { printf("// loaded %d examples %d features %d values in %.2f sec\n", vmax(train_data),n_fields(),n_symbolic_values(),tm); } else { printf("// loaded %d+%d examples %d features %d values in %.2f sec\n", vmax(train_data),vmax(test_data),n_fields(),n_symbolic_values(),tm); } fflush(stdout); }; if (print_grammar_flag) { print_grammar(); return 0; } if (print_names_flag) { print_names(); return 0; } if (randomize_seed) randomize(); if (crossval) { cross_validate(folds,train_data,&train_boost,&test_boost); } else { if ((out_fp=fopen(add_ext(stem,".bhyp"),"w"))==NULL) { fatal("can't open output file"); } trace(SUMM) { 
printf("// timing model command...\n"); fflush(stdout); } start_clock(); hyp = boost_model(train_data,test_data); tm = elapsed_time(); trace(SUMM) { printf("// model command took %.2f sec\n",tm); fflush(stdout); } printf("Final hypothesis is:\n"); print_boost_concept(hyp); fshow_boost_concept(out_fp,hyp); printf("==============================="); printf(" summary "); printf("===============================\n"); err = boost_error_rate(hyp,train_data); printf("Train error rate: %.2f%% +/- %.2f%% (%d datapoints) <<\n", (err*100), 100*binomial_std_err(err,vmax(train_data)), vmax(train_data)); if (test_data) { err = boost_error_rate(hyp,test_data); printf("Test error rate: %.2f%% +/- %.2f%% (%d datapoints) <<\n", 100*err, 100*binomial_std_err(err,vmax(test_data)), vmax(test_data)); } printf("Learning time: %.2f sec\n",tm); } return 0;}static double binomial_std_err(err_rate,n)double err_rate;int n;{ return sqrt( (err_rate)*(1-err_rate)/((double)n-1) );}