📄 filter-text-main.c
字号:
#include <stdio.h>#include "ripper.h"#include "protos.h"#include "mdb.h"/******************************************************************************/char *Program="filtertext";char *Help_str[] = { "syntax: filtertext [options] [stem]", " remove words from a set-valued fields in a dataset", "", "options are:", " -s read from stdin", " -v# set verbosity", " -f <opt> filter by opt", " entropy, pos_entropy, neg_entropy, frequency, gain", " -r # retain best # features (#<1.0 is a fraction)", " -d # discard the worse # features (#<1.0 is a fraction)", " -k # keep all features with value at least #", " -0 <str> use <str> to indicate deleted words (default is ___)", " -4 output in c4.5 format (feature vector)", " -ag use class order given in names file", NULL};/* filter functions */static double symbol_frequency(symbol_t *,int,int,double);static double symbol_gain(symbol_t *,int,int,double);static double symbol_entropy(symbol_t *,int,int,double);static double symbol_pos_entropy(symbol_t *,int,int,double);static double symbol_neg_entropy(symbol_t *,int,int,double);double (*Filter_f)(symbol_t *,int,int,double);/* marker for deleted symbol */static symbol_t *Deleted;static void init_filtering(vec_t *,int,double,BOOL,double,int,BOOL);static void filter_data(vec_t *, char *,BOOL);static void create_c4_names(char *);static BOOL Given_class_ordering;main(argc,argv)int argc;char *argv[];{ vec_t *train_data,*test_data; char *stem; int use_stdin; double tmp; int nfeats; double pfeats; BOOL compute_threshold; double threshold=0.0; BOOL best; int o; char gainp; BOOL c4_output; /* defaults */ set_trace_level(SUMM); use_stdin = FALSE; best = TRUE; pfeats = 0.50; nfeats = 0; Deleted = intern("___"); c4_output = FALSE; Filter_f = &symbol_frequency; Given_class_ordering = FALSE; while ((o=getopt(argc,argv,"a:stv:f:r:d:0:k:h4"))!=EOF) { switch (o) { case 'a': Given_class_ordering = TRUE; break; case '4': c4_output = TRUE; break; case 's': use_stdin = TRUE; break; case 'v': set_trace_level(atoi(optarg)); break; case 'f': switch (optarg[0]) { case 'e': printf("option: filter by entropy\n"); Filter_f = &symbol_entropy; break; case 'p': printf("option: filter by pos entropy\n"); Filter_f = &symbol_pos_entropy; break; case 'n': printf("option: filter by neg entropy\n"); Filter_f = &symbol_neg_entropy; break; case 'f': printf("option: filter by frequency\n"); Filter_f = &symbol_frequency; break; case 'g': printf("option: filter by FOIL gain\n"); Filter_f = &symbol_gain; break; default: give_help(); fatal("unimplemented filtering option"); } break; case 'k': best = TRUE; nfeats = 0; pfeats = 0.0; if (optarg[0]=='g') { /* gain(p,n) */ gainp = atoi(optarg+1); compute_threshold = TRUE; printf("option: retain if gain>=gain(p=%d,n=0)\n",gainp); } else { compute_threshold = FALSE; threshold = atof(optarg); printf("option: retain if metric>=%g\n",threshold); } break; case 'r': case 'd': best = (o=='r'); tmp = atof(optarg); if (tmp<1.0) { pfeats = tmp; nfeats = 0; } else { nfeats=tmp; } break; case '0': Deleted = intern(optarg); break; case 'h': case '?': default: give_help(); if (o=='h') exit(0); else fatal("option not implemented"); } } if (optind<argc) { stem = argv[optind++]; ld_names(add_ext(stem,".names")); if (use_stdin) train_data = ld_data(NULL); else { train_data = ld_data(add_ext(stem,".data")); test_data = ld_data(add_ext(stem,".test")); } } else { train_data = ld_data(NULL); } if (optind<argc) { warning("not all arguments were used: %s ...",argv[optind]); } if (!train_data || vmax(train_data)==0) fatal("no examples"); init_filtering(train_data, nfeats,pfeats,compute_threshold,threshold,gainp,best); if (c4_output) { if (stem==NULL) stem="foo.names"; create_c4_names(add_ext(stem,"_f.names")); } filter_data(train_data,add_ext(stem,"_f.data"),c4_output); if (test_data) { filter_data(test_data,add_ext(stem,"_f.test"),c4_output); }}ex_count_t Total_count;int Maxfeat;typedef struct symbol_pair_s { symbol_t *w; double val;} symbol_pair_t;symbol_pair_t *Symbol_pair;int N_pairs;static int symbol_pair_lt(symbol_pair_t *,symbol_pair_t *);static void init_filtering( vec_t *data, int nfeats,double pfeats,BOOL compute_threshold,double threshold, int gainp,BOOL best){ int i; ex_count_t p,n,ctmp,total_count; double old_info; int posclass; /* count class distribution */ Total_count = 0; Class_counts = newmem(vmax(Classes),ex_count_t); for (i=0; i<vmax(Classes); i++) { count_class_freq(vref(atom_t,Classes,i)->nom, data,&Class_counts[i],&ctmp); total_count += Class_counts[i]; } /* find minority class, for gain computations, etc */ posclass = 0; for (i=1; i<vmax(Classes); i++) { if (Class_counts[i]<Class_counts[posclass]) { posclass=i; } } if (Given_class_ordering) posclass = 0; Symbol_pair = NULL; N_pairs = 0; if (!set_field(0) || vmax(Names)>1) { fatal("filter-text only works on with one set-valued attribute"); } /* cache out stats wrt minority class */ compute_field_stats(vref(atom_t,Classes,posclass)->nom,0,data); /* figure out initial gain */ p = Class_counts[posclass]; n = total_count-p; old_info = information(p,n); if (compute_threshold) { threshold = gainp*(old_info - information(gainp,0)); } /* re-allocate symbol_pair array */ if (Symbol_pair != NULL) { freemem(Symbol_pair); } if (Symbol_pair==NULL || N_pairs<n_visited_symbols()) { Symbol_pair = newmem(n_visited_symbols(),symbol_pair_t); N_pairs = n_visited_symbols(); } /* populate symbol pair array */ for (i=0; i<n_visited_symbols(); i++) { Symbol_pair[i].w = visited_symbol(i); Symbol_pair[i].val = (*Filter_f)(Symbol_pair[i].w,p,n,old_info); } /* sort symbol_pair by value and give high scores low indices */ qsort((char *)Symbol_pair,n_visited_symbols(), sizeof(symbol_pair_t),&symbol_pair_lt); for (i=0; i<n_visited_symbols(); i++) Symbol_pair[i].w->index = i; /* figure out how many features to keep or drop */ if (nfeats!=0) { Maxfeat = nfeats; } else if (pfeats>0) { Maxfeat = pfeats*n_visited_symbols()+0.5; } else /* threshold */ { Maxfeat = 0; for (i=0; i<n_visited_symbols(); i++) { if (Symbol_pair[i].val >= threshold) Maxfeat=i; else break; } } if (!best) { /* delete the worst, don't keep the best */ Maxfeat = n_visited_symbols()-Maxfeat; } trace(LONG) { printf("// %d retained features:\n",Maxfeat); for (i=0; i<Maxfeat; i++) { pos_field_stat(Symbol_pair[i].w,&p,&n); printf("// %10g (%g/%g) %s\n", Symbol_pair[i].val,p,n,Symbol_pair[i].w->name); } }}static void create_c4_names(char *fname){ int i; FILE *fp; if ((fp=fopen(fname,"w")) == NULL) { fatal("can't write %s",fname); } trace(SUMM) { printf("// creating file %s...\n",fname); fflush(stdout); } for (i=0; i<vmax(Classes); i++) { if (i>0) fprintf(fp,","); fprint_symbol(fp,vref(atom_t,Classes,i)->nom); } fprintf(fp,".\n"); for (i=0; i<Maxfeat; i++) { fprint_symbol(fp,Symbol_pair[i].w); fprintf(fp,": f,t.\n"); } fclose(fp);}static void filter_data(vec_t *data, char *fname,BOOL c4_output){ int i,j; example_t *exi; vec_t *seti; int wij; FILE *fp; static BOOL *c4exi = NULL; if ((fp=fopen(fname,"w")) == NULL) { fatal("can't write %s",fname); } trace(SUMM) { printf("// creating file %s...\n",fname); fflush(stdout); } /* allocate a vector if necessary */ if (c4_output && !c4exi) { c4exi = newmem(Maxfeat,BOOL); } for (i=0; i<vmax(data); i++) { exi = vref(example_t,data,i); seti = vref(aval_t,exi->inst,0)->u.set; if (c4_output) { /* construct and print vector */ for (j=0; j<Maxfeat; j++) c4exi[j] = FALSE; for (j=0; j<vmax(seti); j++) { wij = (*vref(symbol_t *,seti,j))->index; if (wij<Maxfeat) c4exi[wij] = TRUE; } for (j=0; j<Maxfeat; j++) { fprintf(fp,"%c,","ft"[c4exi[j]]); } fprint_symbol(fp,exi->lab.nom); fprintf(fp,".\n"); } else { /* replace deleted words */ for (j=0; j<vmax(seti); j++) { wij = (*vref(symbol_t *,seti,j))->index; if (wij>=Maxfeat) vset(symbol_t *,seti,j,&Deleted); } fprint_example(fp,exi); } } /* for example i */ fclose(fp);}/****************************************************************************//* for sorting */static int symbol_pair_lt(symbol_pair_t *wp1,symbol_pair_t *wp2){ if (wp1->val < wp2->val) return 1; else if (wp1->val > wp2->val) return -1; else return 0;}/* value functions */static double symbol_frequency(symbol_t *s,int allp,int alln,double old_info){ ex_count_t p,n; pos_field_stat(s,&p,&n); return (double) p+n;}static double symbol_gain(symbol_t *s,int allp,int alln,double old_info){ return max(pos_symbol_gain(s,old_info), neg_symbol_gain(s,old_info));}static double symbol_entropy(symbol_t *s,int allp,int alln,double old_info){ ex_count_t totp,totn; double p,n; pos_field_stat(s,&totp,&totn); if (totp+totn == 0) { return 0.0; } else { p = (totp+1)/(totp+totn+2); n = (totn+1)/(totp+totn+2); return 1 - (- p*Log2(p) - n*Log2(n)); }}static double symbol_pos_entropy(symbol_t *s,int allp,int alln,double old_info){ ex_count_t totp,totn; double p,n; pos_field_stat(s,&totp,&totn); if (totp+totn == 0 || totp<totn) { return 0.0; } else { p = (totp+1)/(totp+totn+2); n = (totn+1)/(totp+totn+2); return 1 - (- p*Log2(p) - n*Log2(n)); }}static double symbol_neg_entropy(symbol_t *s,int allp,int alln,double old_info){ ex_count_t totp,totn; double p,n; pos_field_stat(s,&totp,&totn); if (totp+totn == 0 || totn<totp) { return 0.0; } else { p = (totp+1)/(totp+totn+2); n = (totn+1)/(totp+totn+2); return 1 - (- p*Log2(p) - n*Log2(n)); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -