/****************************************************************************** boost-main.c - main driver program for experiments with boosting largely pilfered from ripper-main.c******************************************************************************/#include <stdio.h>#include <math.h>#include "ripper.h"#include "boost.h"/*****************************************************************************/static double binomial_std_err(double,int);/* interface to cross-validation*//* for boosting routine */int N_boost=10;char *Weak_learner="ripper";static boosted_concept_t *Hyp=NULL;static void train_boost(vec_t *data){ Hyp = boost_model(data,NULL); printf("Hypothesis:\n"); print_boost_concept(Hyp); }static double test_boost(vec_t *data){ return boost_error_rate(Hyp,data);}/******************************************************************************/char *Program="boost";char *Help_str[] = { "syntax: boost [options] filestem", " learn a ruleset from examples", "", "options are:", " -v# set trace level to #, which must be 0, 1, 2, or 3", " -kN estimate error rate by N-fold cross-validation", " -l estimate error rate via leave-one-out method", " (ie N-fold cross-validation where N is training set size)", " -g str use grammar file str.gram (rather than stem.gram)", " -s read data file from stdin", " -G print completed grammar and exit", " -N print names file and exit", " -R randomize operation", " -I# discretize continuous attributes into # equal-frequency segments", " -!ns allow/disallow negative tests in nominal/set valued attributes", " ('negative' tests are != for nominals, !~ for sets)", " -B # boost # times", " -W str use weak learner named 'str'", " current options: ripper, findrule", "", "findrule options:", " -f+ find rules for first (usually minority) class only", " -f- find rules for second (usually majority) class only", " -fa find rules for all classes (default)", " -Vg use information gain to choose rule specialization", " -Ve use one-sided entropy 
reduction to choose rule specialization", "", "ripper options:", " -n expect noisy data (default)", " -c expect clean data", " -aORD arrange classes in order ORD where ORD must be either", " +freq---order by increasing frequency (the default)", " -freq---order by decreasing frequency", " mdl---order by description length of ruleset", " given---order classes as listed in .names file", " -M # use subsamples of size # in choosing tests", " -O # perform # optimization passes", " -D # set max decompression", " -S # simplify hypothesis more (>1) or less (<1)", " -L # set loss ratio to # (ratio=false-pos-cost/false-neg-cost),", " where a 'positive' example is an instance of the minority class", " -A add redundant features to rules ", " -F # force rules to cover at least # examples", "", "default options: ", " boost -v0 -B10 -Wripper -n -a+freq -O2 -D64 -!n -S1.0 -L1.0 -F2 -M+infty <filestem>", "", NULL};/*****************************************************************************/class_spec_t Find_rule_class_spec = ALLCL;int main(argc,argv)int argc;char *argv[];{ int o; char *charp; vec_t *train_data,*test_data; char *stem="foo", *gstem=NULL; boosted_concept_t *hyp; BOOL print_grammar_flag=FALSE; BOOL print_names_flag=FALSE; BOOL crossval=FALSE; BOOL leave_one_out=FALSE; int folds; double loss_ratio; FILE *out_fp; BOOL randomize_seed=FALSE; BOOL use_stdin=FALSE; double tm,err; set_trace_level(NONE); folds=5; while ((o=getopt(argc,argv,"hv:nck:la:g:tsO:GNRL:D:I:!:S:M:AF:B:W:f:")) !=EOF) { switch (o) { case 'f': switch(optarg[0]) { case '+': Find_rule_class_spec = MINCL; break; case '-': Find_rule_class_spec = MAXCL; break; default: Find_rule_class_spec = ALLCL; break; } printf("option: find rules for class '%s'\n",optarg); break; case 'V': switch(optarg[0]) { case 'g': Value_function = &info_gain; break; case 'e': Value_function = &entropy_reduction1; break; default: warning("bad value for -V option\n"); } printf("option: value function is '%s'\n",optarg); break; case 
'v': set_trace_level(atoi(optarg)); printf("option: trace level set to %d\n",trace_level()); break; case 'n': Simplify = TRUE; printf("option: data is noisy\n"); break; case 'c': Simplify = FALSE; printf("option: data is clean\n"); break; case 'k': crossval = TRUE; folds = atoi(optarg); printf("option: %d-fold cross-validation\n",folds); break; case 'l': crossval = TRUE; folds = 0; printf("option: leave-one-out cross-validation\n"); break; case 'a': switch (optarg[0]) { case '+': Class_ordering = INCFREQ; printf("option: find rules for least frequent classes first\n"); break; case '-': Class_ordering = DECFREQ; printf("option: find rules for most frequent classes first\n"); break; case 'g': case 'G': Class_ordering = GIVEN; printf("option: use class ordering given in names file\n"); break; case 'm': case 'M': Class_ordering = MDL; printf("option: choose ordering of classes using MDL\n"); break; case 'u': case 'U': Class_ordering = UNORDERED; printf("option: unordered classes\n"); break; default: warning("bad -o argument: use '+freq', '-freq', 'given' or 'mdl'"); break; } break; case 'g': gstem = newmem(strlen(optarg)+1,char); strcpy(gstem,optarg); printf("option: use grammar file '%s.gram'\n",gstem); break; case 's': use_stdin = TRUE; printf("option: read data from stdin\n"); break; case 'O': Optimizations = atoi(optarg); printf("option: optimize %d time(s)\n",Optimizations); break; case 'S': Simplify=TRUE; MDL_theory_factor = atof(optarg); printf("option: multiply coding cost of theory by %g\n",MDL_theory_factor); break; case 'D': Max_decompression = atof(optarg); printf("option: max decompression is %g\n",Max_decompression); break; case '!': for (charp=optarg; *charp; charp++) { if (*charp=='n' || *charp=='N') Eq_negations = TRUE; if (*charp=='s' || *charp=='S') Set_negations = TRUE; } printf("option: %s allow inequality tests for nominal attributes\n", Eq_negations? 
"will" : "will not"); printf("option: %s allow inequality tests for set-valued attributes\n", Set_negations? "will" : "will not"); break; case 'R': randomize_seed=TRUE; printf("option: will set random seed from clock\n"); break; case 'L': loss_ratio = atof(optarg); FP_cost = 2.0*loss_ratio/(loss_ratio+1.0); FN_cost = 2.0/(loss_ratio+1.0); printf("option: ratio of cost of FP to cost of FN is %g:%g\n", FP_cost,FN_cost); break; case 'I': N_intervals = atoi(optarg); if (N_intervals < 0) fatal("argument to -i must be a positive integer or 0"); if (N_intervals==0) printf("option: no discretization\n",N_intervals); else printf("option: discretize into %d intervals\n",N_intervals); break; case 'G': /* printf("option: will echo grammar\n"); */ print_grammar_flag = TRUE; break; case 'N': /* printf("option: will echo names file\n"); */ print_names_flag = TRUE; break; case 'M': Max_sample = atoi(optarg); printf("option: max subsample is %d\n",Max_sample); break; case 'A': Add_redundancy = TRUE; printf("option: add redundant conditions to rules\n"); break; case 'F': Min_coverage = max(atoi(optarg),1); printf("option: rules must cover %d example(s)\n",Min_coverage); break; case 'B': N_boost = atoi(optarg); printf("option: boost %d times\n",N_boost); break; case 'W': Weak_learner = optarg; printf("option: use weak learner %s\n",Weak_learner); break; case 'h': case '?': default: give_help(); if (o!='h') fatal("option not implemented"); else exit(0); } } /* print out parameter settings */ trace(SUMM) { printf("// parameter settings:\n"); printf("// boost %d times\n",N_boost); printf("// weak learner is %s\n",Weak_learner); if (N_intervals!=0) printf("// discretize into %d intervals\n",N_intervals); else printf("// no discretization\n"); if (Max_sample!=0) printf("// use subsamples of %d examples\n",Max_sample); printf("// %s != tests in rules\n", Eq_negations?"allow":"disallow"); printf("// %s !~ tests in rules\n", Set_negations?"allow":"disallow"); printf("// ordering classes by 
"); if (Class_ordering==UNORDERED) { printf("// not ordering classes\n"); } else { printf("// ordering classes by "); switch (Class_ordering) { case GIVEN: printf("ordering given in .names file\n"); break; case MDL: printf("MDL heuristic\n"); break; case INCFREQ: printf("increasing frequency\n"); break; case DECFREQ: printf("decreasing frequency\n"); break; default: printf("???\n"); break; } } printf("// expect %s data\n",Simplify?"noisy":"clean"); if (Simplify) { printf("// optimize %d time(s)\n",Optimizations); printf("// max decompression is %g bits\n", Max_decompression); } printf("// rules cover >= %d examples\n",Min_coverage); fflush(stdout); } if (optind<argc) { stem = argv[optind++]; if (gstem==NULL) gstem=stem; } else { give_help(); fatal("no file stem specified"); } if (optind<argc) { warning("not all arguments were used: %s ...",argv[optind]); } start_clock(); trace(SUMM) { printf("// timing loading...\n"); fflush(stdout); } ld_names(add_ext(stem,".names")); if (use_stdin) { train_data = ld_data((char *)NULL); } else { train_data = ld_data(add_ext(stem,".data")); } if (!train_data) { fatal("no training data!"); } if (!crossval) test_data = ld_data(add_ext(stem,".test")); else test_data = NULL; if (!ld_grammar(add_ext(gstem,".gram"),gstem==stem,train_data)) { fatal("error in loading grammar"); } tm = elapsed_time(); trace(SUMM) { if (!test_data) { printf("// loaded %d examples %d features %d values in %.2f sec\n", vmax(train_data),n_fields(),n_symbolic_values(),tm); } else { printf("// loaded %d+%d examples %d features %d values in %.2f sec\n", vmax(train_data),vmax(test_data),n_fields(),n_symbolic_values(),tm); } fflush(stdout); }; if (print_grammar_flag) { print_grammar(); return 0; } if (print_names_flag) { print_names(); return 0; } if (randomize_seed) randomize(); if (crossval) { cross_validate(folds,train_data,&train_boost,&test_boost); } else { if ((out_fp=fopen(add_ext(stem,".bhyp"),"w"))==NULL) { fatal("can't open output file"); } trace(SUMM) { 
printf("// timing model command...\n"); fflush(stdout); } start_clock(); hyp = boost_model(train_data,test_data); tm = elapsed_time(); trace(SUMM) { printf("// model command took %.2f sec\n",tm); fflush(stdout); } printf("Final hypothesis is:\n"); print_boost_concept(hyp); fshow_boost_concept(out_fp,hyp); printf("==============================="); printf(" summary "); printf("===============================\n"); err = boost_error_rate(hyp,train_data); printf("Train error rate: %.2f%% +/- %.2f%% (%d datapoints) <<\n", (err*100), 100*binomial_std_err(err,vmax(train_data)), vmax(train_data)); if (test_data) { err = boost_error_rate(hyp,test_data); printf("Test error rate: %.2f%% +/- %.2f%% (%d datapoints) <<\n", 100*err, 100*binomial_std_err(err,vmax(test_data)), vmax(test_data)); } printf("Learning time: %.2f sec\n",tm); } return 0;}static double binomial_std_err(err_rate,n)double err_rate;int n;{ return sqrt( (err_rate)*(1-err_rate)/((double)n-1) );}