📄 ripper-main.c
字号:
/****************************************************************************** ripper-main.c - main driver program for learning program******************************************************************************/#include <stdio.h>#include <math.h>#include "ripper.h"/*****************************************************************************/static double binomial_std_err(double,int);/* interface to cross-validation*/static concept_t *Hyp=NULL;static void train_ripper(vec_t *data){ if (Hyp!=NULL) free_concept(Hyp); Hyp = model(data); printf("Hypothesis:\n"); print_concept(Hyp); }static double test_ripper(vec_t *data){ return error_rate(Hyp,data);}/******************************************************************************/char *Program="ripper";char *Help_str[] = { "syntax: ripper [options] filestem", " learn a ruleset from examples", "", "options are:", " -v# set trace level to #, which must be 0, 1, 2, or 3", " -n expect noisy data (default)", " -c expect clean data", " -kN estimate error rate by N-fold cross-validation", " -l estimate error rate via leave-one-out method", " (ie N-fold cross-validation where N is training set size)", " -aORD arrange classes in order ORD where ORD must be either", " +freq---order by increasing frequency (the default)", " -freq---order by decreasing frequency", " mdl---order by description length of ruleset", " given---order classes as listed in .names file", " unordered--don't order classes", " -g str use grammar file str.gram (rather than stem.gram)", " -f str use names file str.gram (rather than stem.gram)", " -s read data file from stdin", "", "other options:", " -M # use subsamples of size # in choosing tests", " -O # perform # optimization passes", " -G print completed grammar and exit", " -N print names file and exit", " -R randomize operation", " -D # set max decompression", " -I# discretize continuous attributes into # equal-frequency segments", " -!ns allow/disallow negative tests in nominal/set valued attributes", " ('negative' tests are != for nominals, !~ for sets)", " -S # simplify hypothesis more (>1) or less (<1)", " -L # set loss ratio to # (ratio=false-pos-cost/false-neg-cost),", " where a 'positive' example is an instance of the minority class", " -A add redundant features to rules ", " -F # force rules to cover at least # examples", " -E extend each rule to be very specific and write", " the revised rules into a file .ehyp (for experiments)", "", "default: ", " ripper -v0 -n -a+freq -O2 -D64 -!n -S0.5 -L1.0 -F1 -M+infty <filestem>", "", NULL};/*****************************************************************************/int main(argc,argv)int argc;char *argv[];{ int o; char *charp; vec_t *train_data,*test_data; char *stem="foo", *gstem=NULL, *nstem=NULL; concept_t *hyp; BOOL print_grammar_flag=FALSE; BOOL print_names_flag=FALSE; BOOL crossval=FALSE; BOOL leave_one_out=FALSE; int folds; double loss_ratio; FILE *out_fp; BOOL randomize_seed=FALSE; BOOL use_stdin=FALSE; double tm,err; BOOL do_extend_rules=FALSE; set_trace_level(NONE); folds=5; while ((o=getopt(argc,argv,"hv:nck:la:g:f:tsO:GNRL:D:I:!:S:M:AF:E")) !=EOF) { switch (o) { case 'v': set_trace_level(atoi(optarg)); printf("option: trace level set to %d\n",trace_level()); break; case 'n': Simplify = TRUE; printf("option: data is noisy\n"); break; case 'c': Simplify = FALSE; printf("option: data is clean\n"); break; case 'k': crossval = TRUE; folds = atoi(optarg); printf("option: %d-fold cross-validation\n",folds); break; case 'l': crossval = TRUE; folds = 0; printf("option: leave-one-out cross-validation\n"); break; case 'a': switch (optarg[0]) { case '+': Class_ordering = INCFREQ; printf("option: find rules for least frequent classes first\n"); break; case '-': Class_ordering = DECFREQ; printf("option: find rules for most frequent classes first\n"); break; case 'g': case 'G': Class_ordering = GIVEN; printf("option: use class ordering given in names file\n"); break; case 'm': case 'M': Class_ordering = MDL; printf("option: choose ordering of classes using MDL\n"); break; case 'u': case 'U': Class_ordering = UNORDERED; printf("option: unordered classes\n"); break; default: warning("bad argument to -o option"); break; } break; case 'g': gstem = newmem(strlen(optarg)+1,char); strcpy(gstem,optarg); printf("option: use grammar file '%s.gram'\n",gstem); break; case 'f': nstem = newmem(strlen(optarg)+1,char); strcpy(nstem,optarg); printf("option: use names file '%s.gram'\n",nstem); break; case 's': use_stdin = TRUE; printf("option: read data from stdin\n"); break; case 'O': Optimizations = atoi(optarg); printf("option: optimize %d time(s)\n",Optimizations); break; case 'S': Simplify=TRUE; MDL_theory_factor = atof(optarg); printf("option: multiply coding cost of theory by %g\n",MDL_theory_factor); break; case 'D': Max_decompression = atof(optarg); printf("option: max decompression is %g\n",Max_decompression); break; case '!': for (charp=optarg; *charp; charp++) { if (*charp=='n' || *charp=='N') Eq_negations = TRUE; if (*charp=='s' || *charp=='S') Set_negations = TRUE; } printf("option: %s allow inequality tests for nominal attributes\n", Eq_negations? "will" : "will not"); printf("option: %s allow inequality tests for set-valued attributes\n", Set_negations? "will" : "will not"); break; case 'R': randomize_seed=TRUE; printf("option: will set random seed from clock\n"); break; case 'L': loss_ratio = atof(optarg); FP_cost = 2.0*loss_ratio/(loss_ratio+1.0); FN_cost = 2.0/(loss_ratio+1.0); printf("option: ratio of cost of FP to cost of FN is %g:%g\n", FP_cost,FN_cost); break; case 'I': N_intervals = atoi(optarg); if (N_intervals < 0) fatal("argument to -i must be a positive integer or 0"); if (N_intervals==0) printf("option: no discretization\n",N_intervals); else printf("option: discretize into %d intervals\n",N_intervals); break; case 'G': /* printf("option: will echo grammar\n"); */ print_grammar_flag = TRUE; break; case 'N': /* printf("option: will echo names file\n"); */ print_names_flag = TRUE; break; case 'M': Max_sample = atoi(optarg); printf("option: max subsample is %d\n",Max_sample); break; case 'A': Add_redundancy = TRUE; printf("option: add redundant conditions to rules\n"); break; case 'F': Min_coverage = max(atoi(optarg),1); printf("option: rules must cover %d example(s)\n",Min_coverage); break; case 'E': do_extend_rules = 1; printf("option: will save extended rules\n"); break; case 'h': case '?': default: give_help(); if (o!='h') fatal("option not implemented"); else exit(0); } } /* print out parameter settings */ trace(SUMM) { printf("// parameter settings:\n"); if (N_intervals!=0) printf("// discretize into %d intervals\n",N_intervals); else printf("// no discretization\n"); if (Max_sample!=0) printf("// use subsamples of %d examples\n",Max_sample); printf("// %s != tests in rules\n", Eq_negations?"allow":"disallow"); printf("// %s !~ tests in rules\n", Set_negations?"allow":"disallow"); if (Class_ordering==UNORDERED) { printf("// not ordering classes\n"); } else { printf("// ordering classes by "); switch (Class_ordering) { case GIVEN: printf("ordering given in .names file\n"); break; case MDL: printf("MDL heuristic\n"); break; case INCFREQ: printf("increasing frequency\n"); break; case DECFREQ: printf("decreasing frequency\n"); break; default: printf("???\n"); break; } } printf("// expect %s data\n",Simplify?"noisy":"clean"); if (Simplify) { printf("// optimize %d time(s)\n",Optimizations); printf("// max decompression is %g bits\n", Max_decompression); } printf("// rules cover %d examples\n",Min_coverage); fflush(stdout); } if (optind<argc) { stem = argv[optind++]; if (gstem==NULL) gstem=stem; } else { give_help(); fatal("no file stem specified"); } if (optind<argc) { warning("not all arguments were used: %s ...",argv[optind]); } trace(SUMM) printf("// loading training data...\n"); if (nstem==NULL) nstem = stem; ld_names(add_ext(nstem,".names")); if (use_stdin) { train_data = ld_data((char *)NULL); } else { train_data = ld_data(add_ext(stem,".data")); } if (!train_data) { fatal("no training data!"); } trace(SUMM) printf("// loading test data...\n"); if (!crossval) test_data = ld_data(add_ext(stem,".test")); else test_data = NULL; if (!ld_grammar(add_ext(gstem,".gram"),gstem==stem,train_data)) { fatal("error in loading grammar"); } if (print_grammar_flag) { print_grammar(); return 0; } if (print_names_flag) { print_names(); return 0; } if (randomize_seed) randomize(); if (crossval) { cross_validate(folds,train_data,&train_ripper,&test_ripper); } else { if ((out_fp=fopen(add_ext(stem,".hyp"),"w"))==NULL) { fatal("can't open output file"); } trace(SUMM) { printf("// timing model command...\n"); fflush(stdout); } start_clock(); hyp = model(train_data); tm = elapsed_time(); trace(SUMM) { printf("// model command took %.2f sec\n",tm); fflush(stdout); } printf("Final hypothesis is:\n"); print_concept(hyp); fshow_concept(out_fp,hyp); fclose(out_fp); printf("==============================="); printf(" summary "); printf("===============================\n"); err = error_rate(hyp,train_data); printf("Train error rate: %.2f%% +/- %.2f%% (%d datapoints) <<\n", (err*100), 100*binomial_std_err(err,vmax(train_data)), vmax(train_data)); if (test_data) { err = error_rate(hyp,test_data); printf("Test error rate: %.2f%% +/- %.2f%% (%d datapoints) <<\n", 100*err, 100*binomial_std_err(err,vmax(test_data)), vmax(test_data)); } printf("Hypothesis size: %d rules, %d conditions\n", vmax(hyp->rules),concept_size(hyp)); printf("Learning time: %.2f sec\n",tm); if (do_extend_rules) { if ((out_fp=fopen(add_ext(stem,".ehyp"),"w"))==NULL) { fatal("can't open secondard output file"); } extend_rules(out_fp,hyp,train_data); } } return 0;}static double binomial_std_err(err_rate,n)double err_rate;int n;{ return sqrt( (err_rate)*(1-err_rate)/((double)n-1) );}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -