⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filter-text-main.c

📁 Ripper 分类算法
💻 C
字号:
#include <stdio.h>#include "ripper.h"#include "protos.h"#include "mdb.h"/******************************************************************************/char *Program="filtertext";char *Help_str[] = {    "syntax: filtertext [options] [stem]",    "   remove words from a set-valued fields in a dataset",    "",    "options are:",    "  -s        read from stdin",    "  -v#       set verbosity",    "  -f <opt>  filter by opt",    "            entropy, pos_entropy, neg_entropy, frequency, gain",    "  -r #      retain best # features (#<1.0 is a fraction)",    "  -d #      discard the worse # features (#<1.0 is a fraction)",    "  -k #      keep all features with value at least #",    "  -0 <str>  use <str> to indicate deleted words (default is ___)",     "  -4        output in c4.5 format (feature vector)",      "  -ag       use class order given in names file",    NULL};/* filter functions */static double symbol_frequency(symbol_t *,int,int,double);static double symbol_gain(symbol_t *,int,int,double);static double symbol_entropy(symbol_t *,int,int,double);static double symbol_pos_entropy(symbol_t *,int,int,double);static double symbol_neg_entropy(symbol_t *,int,int,double);double (*Filter_f)(symbol_t *,int,int,double);/* marker for deleted symbol */static symbol_t *Deleted;static void init_filtering(vec_t *,int,double,BOOL,double,int,BOOL);static void filter_data(vec_t *, char *,BOOL);static void create_c4_names(char *);static BOOL Given_class_ordering;main(argc,argv)int argc;char *argv[];{    vec_t *train_data,*test_data;    char *stem;    int use_stdin;    double tmp;    int nfeats;    double pfeats;    BOOL compute_threshold;    double threshold=0.0;    BOOL best;    int o;    char gainp;    BOOL c4_output;    /* defaults */    set_trace_level(SUMM);    use_stdin = FALSE;    best = TRUE;    pfeats = 0.50;    nfeats = 0;    Deleted = intern("___");     c4_output = FALSE;    Filter_f = &symbol_frequency;    Given_class_ordering = FALSE;    while ((o=getopt(argc,argv,"a:stv:f:r:d:0:k:h4"))!=EOF) {	switch (o) {  	  case 'a':	    Given_class_ordering = TRUE;	    break;  	  case '4':	    c4_output = TRUE;  	    break;	  case 's':	    use_stdin = TRUE;	    break;	  case 'v':	    set_trace_level(atoi(optarg)); 	    break;	  case 'f':	    switch (optarg[0]) {	      case 'e':		  printf("option: filter by entropy\n");		  Filter_f = &symbol_entropy; break;	      case 'p':		  printf("option: filter by pos entropy\n");		  Filter_f = &symbol_pos_entropy; break;	      case 'n':		  printf("option: filter by neg entropy\n");		  Filter_f = &symbol_neg_entropy; break;	      case 'f': 	    		  printf("option: filter by frequency\n");		  Filter_f = &symbol_frequency; break;	      case 'g':		  printf("option: filter by FOIL gain\n");		  Filter_f = &symbol_gain; break;	      default:		  give_help();		  fatal("unimplemented filtering option");	    }	    break;	  case 'k':	    best = TRUE; nfeats = 0; pfeats = 0.0;	    if (optarg[0]=='g') {		/* gain(p,n) */		gainp = atoi(optarg+1);		compute_threshold = TRUE;		printf("option: retain if gain>=gain(p=%d,n=0)\n",gainp);	    } else {		compute_threshold = FALSE;		threshold = atof(optarg);		printf("option: retain if metric>=%g\n",threshold);	    }	    break;	  case 'r':	  case 'd':	    best = (o=='r');	    tmp = atof(optarg);	    if (tmp<1.0) {		pfeats = tmp;		nfeats = 0;	    } else {		nfeats=tmp; 	    }	    break;	  case '0':	    Deleted = intern(optarg);	    break;	  case 'h':	  case '?':	  default: 	    give_help();	    if (o=='h') exit(0);	    else fatal("option not implemented");	}    }    if (optind<argc) {	stem = argv[optind++];	ld_names(add_ext(stem,".names"));	if (use_stdin) train_data = ld_data(NULL);	else {	    train_data = ld_data(add_ext(stem,".data"));	    test_data = ld_data(add_ext(stem,".test"));	}    } else {	train_data = ld_data(NULL);    }    if (optind<argc) {	warning("not all arguments were used: %s ...",argv[optind]);    }    if (!train_data || vmax(train_data)==0) fatal("no examples");    init_filtering(train_data,		   nfeats,pfeats,compute_threshold,threshold,gainp,best);    if (c4_output) {	if (stem==NULL) stem="foo.names";	create_c4_names(add_ext(stem,"_f.names"));    }    filter_data(train_data,add_ext(stem,"_f.data"),c4_output);    if (test_data) {	filter_data(test_data,add_ext(stem,"_f.test"),c4_output);	    }}ex_count_t Total_count;int Maxfeat;typedef struct symbol_pair_s {    symbol_t *w;    double val;} symbol_pair_t;symbol_pair_t *Symbol_pair;int N_pairs;static int symbol_pair_lt(symbol_pair_t *,symbol_pair_t *);static void init_filtering(    vec_t *data,    int nfeats,double pfeats,BOOL compute_threshold,double threshold,    int gainp,BOOL best){    int i;    ex_count_t p,n,ctmp,total_count;    double old_info;    int posclass;    /* count class distribution */    Total_count = 0;    Class_counts = newmem(vmax(Classes),ex_count_t);     for (i=0; i<vmax(Classes); i++) {	count_class_freq(vref(atom_t,Classes,i)->nom,			 data,&Class_counts[i],&ctmp);	total_count += Class_counts[i];    }    /* find minority class, for gain computations, etc */    posclass = 0;    for (i=1; i<vmax(Classes); i++) {	if (Class_counts[i]<Class_counts[posclass]) {	    posclass=i; 	}    }    if (Given_class_ordering) posclass = 0;     Symbol_pair = NULL;    N_pairs = 0;    if (!set_field(0) || vmax(Names)>1) {	fatal("filter-text only works on with one set-valued attribute");    }    /* cache out stats wrt minority class */    compute_field_stats(vref(atom_t,Classes,posclass)->nom,0,data);    /* figure out initial gain */    p = Class_counts[posclass];    n = total_count-p;    old_info = information(p,n);    if (compute_threshold) {	threshold = gainp*(old_info - information(gainp,0));    }    /* re-allocate symbol_pair array */    if (Symbol_pair != NULL) {	freemem(Symbol_pair);    }    if (Symbol_pair==NULL || N_pairs<n_visited_symbols()) {	Symbol_pair = newmem(n_visited_symbols(),symbol_pair_t);	N_pairs = n_visited_symbols();    }    /* populate symbol pair array */    for (i=0; i<n_visited_symbols(); i++) {	Symbol_pair[i].w = visited_symbol(i);	Symbol_pair[i].val = (*Filter_f)(Symbol_pair[i].w,p,n,old_info);    }    /* sort symbol_pair by value and give high scores low indices */    qsort((char *)Symbol_pair,n_visited_symbols(),	  sizeof(symbol_pair_t),&symbol_pair_lt);    for (i=0; i<n_visited_symbols(); i++) Symbol_pair[i].w->index = i;    /* figure out how many features to keep or drop */    if (nfeats!=0) {	Maxfeat = nfeats;    } else if (pfeats>0) {	Maxfeat = pfeats*n_visited_symbols()+0.5;    } else /* threshold */ {	Maxfeat = 0;	for (i=0; i<n_visited_symbols(); i++) {	    if (Symbol_pair[i].val >= threshold) Maxfeat=i;	    else break;	}    }    if (!best) {	/* delete the worst, don't keep the best */	Maxfeat = n_visited_symbols()-Maxfeat;    }    trace(LONG) {	printf("// %d retained features:\n",Maxfeat);	for (i=0; i<Maxfeat; i++) {	    pos_field_stat(Symbol_pair[i].w,&p,&n);	    printf("// %10g (%g/%g) %s\n",		   Symbol_pair[i].val,p,n,Symbol_pair[i].w->name);	}    }}static void create_c4_names(char *fname){    int i;    FILE *fp;    if ((fp=fopen(fname,"w")) == NULL) {	fatal("can't write %s",fname);     }     trace(SUMM) {	printf("// creating file %s...\n",fname);	fflush(stdout);    }    for (i=0; i<vmax(Classes); i++) {	if (i>0) fprintf(fp,",");	fprint_symbol(fp,vref(atom_t,Classes,i)->nom);    }    fprintf(fp,".\n");    for (i=0; i<Maxfeat; i++) {	fprint_symbol(fp,Symbol_pair[i].w);	fprintf(fp,": f,t.\n");    }    fclose(fp);}static void filter_data(vec_t *data, char *fname,BOOL c4_output){    int i,j;    example_t *exi;    vec_t *seti;    int wij;    FILE *fp;    static BOOL *c4exi = NULL;    if ((fp=fopen(fname,"w")) == NULL) {	fatal("can't write %s",fname);     }     trace(SUMM) {	printf("// creating file %s...\n",fname);	fflush(stdout);    }    /* allocate a vector if necessary */    if (c4_output && !c4exi) {	c4exi = newmem(Maxfeat,BOOL);    }    for (i=0; i<vmax(data); i++) {	exi = vref(example_t,data,i);	seti = vref(aval_t,exi->inst,0)->u.set;	if (c4_output) {	    /* construct and print vector */	    for (j=0; j<Maxfeat; j++) c4exi[j] = FALSE;	    for (j=0; j<vmax(seti); j++) {		wij = (*vref(symbol_t *,seti,j))->index;		if (wij<Maxfeat) c4exi[wij] = TRUE;	    }	    for (j=0; j<Maxfeat; j++) {		fprintf(fp,"%c,","ft"[c4exi[j]]);	    }	    fprint_symbol(fp,exi->lab.nom);	    fprintf(fp,".\n");	} else {	    /* replace deleted words */	    for (j=0; j<vmax(seti); j++) {		wij = (*vref(symbol_t *,seti,j))->index;		if (wij>=Maxfeat) vset(symbol_t *,seti,j,&Deleted);	    }	    fprint_example(fp,exi);	}    } /* for example i */    fclose(fp);}/****************************************************************************//* for sorting */static int symbol_pair_lt(symbol_pair_t *wp1,symbol_pair_t *wp2){    if (wp1->val < wp2->val) return 1;    else if (wp1->val > wp2->val) return -1;    else return 0;}/* value functions */static double symbol_frequency(symbol_t *s,int allp,int alln,double old_info){    ex_count_t p,n;    pos_field_stat(s,&p,&n);    return (double) p+n;}static double symbol_gain(symbol_t *s,int allp,int alln,double old_info){    return max(pos_symbol_gain(s,old_info),	       neg_symbol_gain(s,old_info));}static double symbol_entropy(symbol_t *s,int allp,int alln,double old_info){    ex_count_t totp,totn;    double p,n;    pos_field_stat(s,&totp,&totn);    if (totp+totn == 0) {	return 0.0;    } else {	p = (totp+1)/(totp+totn+2);	n = (totn+1)/(totp+totn+2);	return 1 - (- p*Log2(p) - n*Log2(n));    }}static double symbol_pos_entropy(symbol_t *s,int allp,int alln,double old_info){    ex_count_t totp,totn;    double p,n;    pos_field_stat(s,&totp,&totn);    if (totp+totn == 0 || totp<totn) {	return 0.0;    } else {	p = (totp+1)/(totp+totn+2);	n = (totn+1)/(totp+totn+2);	return 1 - (- p*Log2(p) - n*Log2(n));    }}static double symbol_neg_entropy(symbol_t *s,int allp,int alln,double old_info){    ex_count_t totp,totn;    double p,n;    pos_field_stat(s,&totp,&totn);    if (totp+totn == 0 || totn<totp) {	return 0.0;    } else {	p = (totp+1)/(totp+totn+2);	n = (totn+1)/(totp+totn+2);	return 1 - (- p*Log2(p) - n*Log2(n));    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -