⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mdb_util.c

📁 Ripper 分类算法
💻 C
字号:
/****************************************************************************** mdb_util.c - miscellaneous utilities for datasets******************************************************************************/#include <stdio.h>#include "ripper.h"#include "mdb.h"#define Min_pos_fraction 0.2/****************************************************************************//* basic utilities *//****************************************************************************//* make an empty holder for data */DATA *new_data(int m){    return new_vecn(example_t,m);}/* make a "copy" of a dataset, without allocating new storage */void copy_data(DATA *copy,DATA *data) /* copy is side-effected */ {    share_vec(example_t,copy,data);}/* sample about m elements of data and put them in sample */void sample_data(symbol_t *cl,int m,DATA *data,DATA **space){    int q,r,i;    DATA data1;    example_t *exi,*exr;    static vec_t *index=NULL;    int npos;    example_t dummy_ex;    ex_count_t wfact;    /* first allocate space, if necessary */    if ((*space)==NULL) {	/* allocate space and fill it with examples */	(*space) = new_data(Max_sample);	for (i=0; i<Max_sample; i++) {	    ext_vec(example_t,(*space),&dummy_ex);	}    }    /* next find all positive examples (of class cl) */    if (index==NULL) index=new_vec(int);    else clear_vec(int,index);    for (i=0; i<vmax(data);i++) {	exi = vref(example_t,data,i);	if (exi->lab.nom==cl) {	    ext_vec(int,index,&i);	}    }    /*    trace(SUMM) printf("// sampling %d from %d/%d\n",		       m,vmax(index),vmax(data)-vmax(index));    */		           /* below: wfact is such that      *  (sum of weights of sample)*wfact = (sum of weights in data)    */       copy_data(&data1,data);    if (vmax(index)/vmax(data) >= Min_pos_fraction) {	wfact = ((double)vmax(data))/((double)m);	for (i=0; i<m; i++) {	    r = random()%vmax(&data1);	    exi = vref(example_t,(*space),i);	    exr = vref(example_t,&data1,r);	    exi->lab.nom = exr->lab.nom;	    exi->wt = exr->wt * wfact;	    	    exi->inst = exr->inst;	    swap_out_example(&data1,r);	}    } else {	npos = (int)Min_pos_fraction*vmax(data)+0.5;	if (npos >= vmax(index)) npos=vmax(index);	wfact = ((double)vmax(index))/((double)npos);	for (i=0; i<npos; i++) {	    q = random()%vmax(index);	    r = *vref(int,index,q);	    exi = vref(example_t,(*space),i);	    exr = vref(example_t,&data1,r);	    exi->lab.nom = exr->lab.nom;	    exi->wt = exr->wt * wfact;	    	    exi->inst = exr->inst;	    swap_out_example(&data1,r);	    /* "swap out" position q of index */	    copy(int,vref(int,index,q),vref(int,index,vmax(index)-1));	    shorten_vec(int,index);	}	wfact = ((double)vmax(data)-vmax(index))/((double)m-npos);	for ( ;i<m; i++) {	    exi = vref(example_t,(*space),i);	    r = random()%vmax(&data1);	    exr = vref(example_t,&data1,r);	    exi->lab.nom = exr->lab.nom;	    exi->wt = exr->wt;	    	    exi->inst = exr->inst;	    swap_out_example(&data1,r);	}    }}/* remove examples in data covered by form */void remove_covered_examples(vec_t *form,DATA *data){    int i;    example_t *exi;    i=0;    while (i<vmax(data)) {	exi = vref(example_t,data,i);	if (form_covers(form,exi->inst)) {	    swap_out_example(data,i);	} else i++;    } }/* remove examples in data NOT covered by form */void remove_uncovered_examples(vec_t *form,DATA *data){    int i;    example_t *exi;    /* update the data by removing uncovered examples */    i=0;    while (i<vmax(data)) {	exi = vref(example_t,data,i);	if (!form_covers(form,exi->inst)) {	    swap_out_example(data,i);	} else i++;    } }/****************************************************************************//* routines used for cross-validation *//****************************************************************************//* randomize order of data so as to preserve  * class frequencies in each of several partitions, * where partition i of size-n dataset is the range  *     n/splits*i...n/splits*(i+1) */void stratify_and_shuffle_data(DATA *data,int splits){    int i,j,k,r,offset;    static example_t *tmp=NULL, extmp, *exi;    static int tmpsize=0,n;    static int *class_tmp=NULL;    symbol_t *classi;    /* allocate class_tmp array if needed */    if (class_tmp==NULL) {	class_tmp = newmem(vmax(Classes),int);    }    /* allocate a larger tmp array, if needed */    if (tmpsize<vmax(data)) {	if (tmp) freemem(tmp);	tmp = newmem(vmax(data),example_t);	tmpsize = vmax(data);    }     /* set logical size of tmp */    n = vmax(data);    /* count #examples of each class */    for (i=0; i<vmax(Classes); i++) {	class_tmp[i]=0;    }    for (i=0; i<n; i++) {	exi = vref(example_t,data,i);	class_tmp[exi->lab.nom->index]++;    }    /* next, make class_tmp[i] be offset at which        to store examples of class i */       for (i=vmax(Classes)-1; i>=0; i--) {	offset = 0;	for (j=0; j<i; j++) offset += class_tmp[j];	class_tmp[i] = offset;    }    /* now copy in the data, sorting by class */    for (i=0; i<n; i++) {	exi = vref(example_t,data,i);	offset = class_tmp[exi->lab.nom->index]++;	copy(example_t,&tmp[offset],exi);    }    /* shuffle each class separately */    i=0;    while (i<n) {	classi = tmp[i].lab.nom;	for (j=i; j<n && tmp[j].lab.nom==classi; j++) 	    ; /* do nothing */	if (j<n) {	    /* shuffle range i...j */	    for (k=i; k<j; k++) {		r = k+random()%(n-k);		copy(example_t,&extmp,&tmp[k]);		copy(example_t,&tmp[k],&tmp[r]);		copy(example_t,&tmp[r],&extmp);	    }	}	i=j;	    }    /* store back into original array in set order */    clear_vec(example_t,data);    for (j=0; j<splits; j++) {	for (i=j; i<n; i+=splits) {	    ext_vec(example_t,data,&tmp[i]);	}    }}/* assuming data has been shuffled, put the i-th  * partition in data2 and the rest in data1*/void ith_stratified_partition(DATA *data,int i,int folds,DATA *data1,DATA *data2){    int lo,hi,j,m;    m = vmax(data)/folds;    clear_vec(example_t,data1);    clear_vec(example_t,data2);    lo = i*m;    hi = i==folds-1? vmax(data) : (i+1)*m;    for (j=0; j<vmax(data); j++) {	if (j>=lo && j<hi) {	    ext_vec(example_t,data2,vref(example_t,data,j));	} else {	    ext_vec(example_t,data1,vref(example_t,data,j));	}    }} /* conceptually split data as for a k-fold stratified  * cross-validation, where splits=k, and then put * the first (k-1) partitions in data1, and the * last partition in data2*/void stratified_partition(DATA *data,int splits,DATA *data1,DATA *data2){    int cut;    stratify_and_shuffle_data(data,splits);    cut = vmax(data)*(splits-1.0)/splits;    share_subvec(example_t,data1,data,0,cut);    share_subvec(example_t,data2,data,cut,vmax(data)-cut);}/****************************************************************************//* mark i-th example as removed --- without destroying any data */void swap_out_example(data,i)vec_t *data;int i;{    example_t *exi,*ex_last,tmp;    exi = vref(example_t,data,i);    ex_last = vref(example_t,data,vmax(data)-1);    copy(example_t,&tmp,exi);    copy(example_t,exi,ex_last);    copy(example_t,ex_last,&tmp);    shorten_vec(example_t,data);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -