/* svm_al.c */
/* Copyright (C) 1999 Greg Schohn - gcs@jprc.com */
/* ******************** svm_al.c *******************
 * Active learning add-ons for SVM's. */

#include <bow/svm.h>

/* Number of feature-saturation thresholds tracked (size of dim_map's table). */
#define NDIM_INSPECTED 14

/* Map a threshold index (0..NDIM_INSPECTED-1) onto the saturation cutoff it
 * represents.  Out-of-range indices are clamped to the table boundaries so a
 * caller bug cannot read past the array (previously undefined behavior). */
int dim_map(int i) {
  static int map[NDIM_INSPECTED] = {1,2,3,4,6,8,12,16,24,32,48,64,128,256};
  if (i < 0) {
    i = 0;
  } else if (i >= NDIM_INSPECTED) {
    i = NDIM_INSPECTED - 1;
  }
  return (map[i]);
}

/* Compute the precision/recall breakeven point obtainable by shifting the
 * value of b (the decision threshold).
 *
 *   test_evals - raw model output for each test document
 *   test_yvect - true label (+1/-1) for each test document
 *   n          - number of test documents
 *   total_pos  - number of positive documents among the n
 *
 * Returns the largest min(precision, recall) over all thresholds, or -1.0
 * when n == 0.  Aborts via bow_error() if scratch space cannot be
 * allocated (the original code wrote through an unchecked malloc). */
double prec_recall_breakeven(double *test_evals, int *test_yvect, int n, int total_pos) {
  struct di *ey;
  double max;
  int npos;
  int i;

  ey = (struct di *) malloc(sizeof(struct di)*n);
  if (!ey) {
    bow_error("prec_recall_breakeven: could not allocate scratch array");
  }
  for (i=0; i<n; i++) {
    /* -1 is to force the sort the way i want it (highest score first) */
    ey[i].d = -1*test_evals[i];
    ey[i].i = test_yvect[i];
  }
  qsort(ey, n, sizeof(struct di), di_cmp);

  max = -1.0;
  for (i=npos=0; i<n; i++) {
    double min;
    if (ey[i].i > 0) {
      npos ++;
    }
    /* precision = npos/(i+1); recall = npos/total_pos */
    min = MIN(((double)npos)/(i+1), ((double)npos)/total_pos);
    if (min > max) {
      max = min;
    }
  }
  free(ey);
  return max;
}

/* Per-iteration accounting collected while active learning runs.
 * Any pointer left NULL disables the corresponding statistic. */
struct al_test_data {
  int ntest;                  /* number of test documents */
  int ndim_sat;               /* number of saturation thresholds tracked */
  int *docs_added;            /* original (pre-permutation) index of each queried doc */
  int *test_yvect, *apvect, *anvect;   /* test labels; correct-positive / correct-negative counts */
  int *train_apvect, *train_anvect, *query_apvect, *query_anvect;
  int *nsv_vect, *nbsv_vect, *time_vect, *nkce_vect; /* SVs, bound SVs, CPU time, kernel-cache calls */
  int *npos_added, *nneg_added;        /* labels of docs added each round */
  double *prb, *scores_added;          /* precision/recall breakevens per round */
  int **sv_dim_sat_vect, **train_dim_sat_vect; /* feature-saturation histories */
  double **test_scores;       /* per-round raw scores on the test set */
  bow_wv **test_docs;         /* the test documents themselves */
};

/* the data coming in should have train data in the first (ndocs-ntrans)
 * slots & the permanently unlabel-able data in the next ntrans slots */
/* no unlabeled data will be used unless svm_al_do_trans is set - if it
 * is then ALL unlabeled data will be used. */
/* this fn.
does active learning by selecting which docs to pass into * the svm solver */int al_svm_guts(bow_wv **train_docs, int *train_yvect, double *weights, double *b, double **W, int nperm_unlabeled, int ndocs, struct al_test_data *astd, int do_random_learning) { int changed; int *cur_hyp_yvect; int dec; int *hyp_yvect; /* for transduction */ int last_subndocs; int nplabeled; /* # of potentially labeled */ int nleft; int nsv; int n_trans_correct; int num_words; int *old_svbitmap; int qsize; /* query size, size of chunks to grow training set by */ struct di *train_scores, *train_cscores; int *train_sat_vect; int *sv_sat_vect; /* shows how many */ int sub_ndocs; double tb; int *tdocs; /* translation table */ double *tvals; int i,j,k,n,nloop; sub_ndocs = MIN(ndocs,svm_init_al_tset); train_scores = (struct di *) malloc(sizeof(struct di)*ndocs); tvals = (double *) malloc(sizeof(double)*ndocs); tdocs = (int *) malloc(sizeof(int)*ndocs); for (i=0; i<ndocs; i++) { tdocs[i] = i; } /* hyp_yvect is a hack - it holds the hypotheses, but also the * correct labels for the queried docs (so a proper vect can be * passed to eval) */ cur_hyp_yvect = hyp_yvect = (int *) malloc(sizeof(int)*ndocs); nplabeled = ndocs - nperm_unlabeled; num_words = bow_num_words(); /* BEGIN LOGGING CODE */ if (astd->sv_dim_sat_vect) { train_sat_vect = (int *) malloc(sizeof(int)*num_words); } else { train_sat_vect = NULL; } if (astd->train_dim_sat_vect) { old_svbitmap = (int *) malloc((ndocs+7)/8); sv_sat_vect = (int *) malloc(sizeof(int)*num_words); } else { old_svbitmap = NULL;//(int *) malloc((ndocs+7)/8); sv_sat_vect = NULL;//(int *) malloc(sizeof(int)*num_words); } /* initialize accounting stuff */ if (sv_sat_vect) { memset(old_svbitmap, 0, (ndocs+7)/8); for (i=0; i<num_words; i++) { sv_sat_vect[i] = 0.0; } } if (train_sat_vect) { for (i=0; i<num_words; i++) { train_sat_vect[i] = 0.0; } } /* END LOGGING CODE */ /* initialize... 
*/ nsv = 0; for (i=0; i<ndocs; i++) { weights[i] = 0.0; tvals[i] = 0.0; } qsize = svm_al_qsize; /* select an initial set of things to classify */ /* the following is equivalent to asking the user to classify 1/2 the * documents as positive & the other half as negative */ for (k=-1, i=0, n=sub_ndocs/2; k<2; n=sub_ndocs,k=k+2) { for (j=0; i<n && j<nplabeled; j++) { if (train_yvect[j] != k) { continue; } { int t; bow_wv *twv; t = tdocs[j]; tdocs[j] = tdocs[i]; tdocs[i] = t; t = train_yvect[j]; train_yvect[j] = train_yvect[i]; train_yvect[i] = t; twv = train_docs[j]; train_docs[j] = train_docs[i]; train_docs[i] = twv; } i++; } } sub_ndocs = i; last_subndocs = 0; /* copy the initial yvect into hyp_yvect (for evaluate) */ for (i=0; i<sub_ndocs; i++) { hyp_yvect[i] = train_yvect[i]; } cur_hyp_yvect = &(hyp_yvect[sub_ndocs]); for (i=train_yvect[0],j=1; j<sub_ndocs; j++) { if (j != i) break; } if (j == i) { bow_error("Can't active learn when all examples are from the same class!"); } train_cscores = NULL; dec = 0; nleft = ndocs - sub_ndocs; changed = 1; /* for code where transduction is done */ n_trans_correct = 0; for (nloop=0; ;nloop++) { struct tms t1, t2; /* BEGIN LOGGING CODE */ /* this is done at the beginning of the loop so that the base case (ie. after initial set) works too... 
*/ if (astd->npos_added && astd->nneg_added) { astd->npos_added[nloop] = 0; astd->nneg_added[nloop] = 0; for (i=last_subndocs; i<sub_ndocs; i++) { if (train_yvect[i] == 1) { astd->npos_added[nloop] ++; } else { astd->nneg_added[nloop] ++; } } } /* add the document indices */ if (astd->docs_added) { for (i=last_subndocs; i<sub_ndocs; i++) { astd->docs_added[i] = tdocs[i]; } } if (train_sat_vect) { for (i=last_subndocs; i<sub_ndocs; i++) { for (j=0; j<train_docs[i]->num_entries; j++) { train_sat_vect[train_docs[i]->entry[j].wi] ++; } } } svm_nkc_calls = 0; /* END LOGGING CODE */ fprintf(stderr,"\r%dth AL iteration",nloop); times(&t1); if (nloop==2) { //exit(1); } /* the changed flag shows whether or not the theorized y's are different than * the actual ones (if they are, retraining is done, otherwise it isn't). */ if (svm_al_do_trans) { if (changed) { changed = svm_trans_or_chunk(train_docs, train_yvect, cur_hyp_yvect, weights, tvals, &tb, W, nleft+nperm_unlabeled, ndocs, &nsv); } } else { changed = svm_trans_or_chunk(train_docs, train_yvect, cur_hyp_yvect, weights, tvals, &tb, W, 0, sub_ndocs, &nsv); } times(&t2); /* BEGIN LOGGING CODE */ /* a couple of accounting things that are independent of a test/validation set */ if (astd->time_vect) astd->time_vect[nloop] = (int) (t2.tms_utime - t1.tms_utime + t2.tms_stime - t1.tms_stime); if (astd->nsv_vect) astd->nsv_vect[nloop] = nsv; if (astd->nbsv_vect) { astd->nbsv_vect[nloop] = 0; for (j=0; j<sub_ndocs; j++) { /* note - use svm_C because the label IS known */ if (weights[j] >= svm_C - svm_epsilon_a) astd->nbsv_vect[nloop] ++; } } if (astd->nkce_vect) astd->nkce_vect[nloop] = svm_nkc_calls; /* END LOGGING CODE */ /* find the next example that is closest to the hyperplane that we just found */ /* the scores need to be recalculated if any of the weights changed... 
*/ if (changed) { train_cscores = train_scores; if (svm_kernel_type == 0) { for (j=sub_ndocs,nleft=0; j<nplabeled; j++) { train_scores[nleft].d = fabs(evaluate_model_hyperplane(*W, tb, train_docs[j])); train_scores[nleft].i = j; nleft ++; } } else { for (j=sub_ndocs,nleft=0; j<nplabeled; j++) { train_scores[nleft].d = fabs(evaluate_model_cache(train_docs, weights, hyp_yvect, tb, train_docs[j], nsv)); train_scores[nleft].i = j; nleft ++; } } /* BEGIN LOGGING CODE */ if (astd->train_anvect && astd->train_apvect) { double out; astd->train_anvect[nloop] = astd->train_apvect[nloop] = 0; for (i=0; i<sub_ndocs; i++) { if (svm_kernel_type == 0) { out = evaluate_model_hyperplane(*W,tb,train_docs[i]); } else { out = evaluate_model_cache(train_docs, weights, hyp_yvect, tb, train_docs[i], nsv); } if (train_yvect[i]*out > 0) { if (train_yvect[i] > 0) { astd->train_apvect[nloop] ++; } else { astd->train_anvect[nloop] ++; } } } } /* lets figure out the change in fdim saturation... */ if (sv_sat_vect) { for (i=0; i<sub_ndocs; i++) { if ((weights[i] == 0.0) && (GETVALID(old_svbitmap,i))) { SETINVALID(old_svbitmap,i); for (j=0; j<train_docs[i]->num_entries; j++) { sv_sat_vect[train_docs[i]->entry[j].wi] --; } } else if ((weights[i] != 0.0) && (!GETVALID(old_svbitmap,i))) { SETVALID(old_svbitmap,i); for (j=0; j<train_docs[i]->num_entries; j++) { sv_sat_vect[train_docs[i]->entry[j].wi] ++; } } } /* this could be smarter - but it would involve more arrays... */ /* update the history vector... */ for (i=0; i<astd->ndim_sat; i++) { astd->sv_dim_sat_vect[i][nloop] = 0; } for (j=0; j<num_words; j++) { for (i=0; sv_sat_vect[j]>=dim_map(i) && i<astd->ndim_sat; i++) { astd->sv_dim_sat_vect[i][nloop] ++; } } } if (astd->train_dim_sat_vect) { for (i=0; i<astd->ndim_sat; i++) { astd->train_dim_sat_vect[i][nloop] = 0; } for (j=0; j<num_words; j++) { for (i=0; train_sat_vect[j]>=dim_map(i) && i<astd->ndim_sat; i++) { astd->train_dim_sat_vect[i][nloop] ++; } } } /* now lets find the accuracy... 
*/ if (astd->prb) { int npos; double *test_evals = (double *) malloc(sizeof(double)*astd->ntest); astd->anvect[nloop] = astd->apvect[nloop] = 0; if (svm_kernel_type == 0) { for (j=0; j<astd->ntest; j++) { test_evals[j] = evaluate_model_hyperplane(*W, tb, astd->test_docs[j]); } } else { for (j=0; j<astd->ntest; j++) { test_evals[j] = evaluate_model(train_docs, weights, hyp_yvect, tb, astd->test_docs[j], nsv); } } if (astd->test_scores) { for (j=0; j<astd->ntest; j++) { astd->test_scores[nloop][j] = test_evals[j]; } } for (j=npos=0; j<astd->ntest; j++) { if (astd->test_yvect[j] * test_evals[j] > 0.0) { if (astd->test_yvect[j] == -1) astd->anvect[nloop] ++; else astd->apvect[nloop] ++; } if (astd->test_yvect[j] > 0) { npos ++; } } if (svm_al_do_trans && (astd->apvect[nloop] == 0 || astd->anvect[nloop] == 0)) { fprintf(stderr,"Unlikely occurence that all test (%d) examples have the same \n" "label when classified with the current model of %d support vectors.\n" "Unless this is expected, there is probably a bug in the program.\n" "Please send the author (gcs@cmu.edu) email (note the cmd line arguments).\n" "The function has stopped the program so that it may be debugged, terminated" "or continued\n", astd->ntest, nsv); for (j=0; j<astd->ntest; j++) { if (test_evals[j] * evaluate_model(train_docs, weights, hyp_yvect, tb, astd->test_docs[j], nsv) < 0.0) { fprintf(stderr, "bad hyperplane (j=%d, te[j]=%f, actual_sv=%f, actual_hyp=%f)\n",j, evaluate_model(train_docs, weights, hyp_yvect, tb, astd->test_docs[j], nsv), test_evals[j],evaluate_model_hyperplane(*W, tb, astd->test_docs[j])); fflush(stderr); kill(getpid(),SIGSTOP); } }#ifdef GCSJPRC system("echo \"rainbow did a boo-boo - stopping!\" | /usr/sbin/sendmail gcs@jules.res.cmu.edu");#endif /* if it didn't get stopped before */ if (j == astd->ntest) { fflush(stderr); kill(getpid(),SIGSTOP); } } /* precision recall breakevens too */ astd->prb[nloop] = prec_recall_breakeven(test_evals, astd->test_yvect, astd->ntest, npos); 
free(test_evals); } /* END LOGGING CODE */ } else { /* BEGIN LOGGING CODE */ /* we can use the scores that we got last time (they'll still be the same) - just * remove the previous ones from the scores array... */ /* since nothing changed, we don't need to recalculate the test accuracy */ if (astd->prb) { astd->apvect[nloop] = astd->apvect[nloop-1]; astd->anvect[nloop] = astd->anvect[nloop-1]; astd->prb[nloop] = astd->prb[nloop-1]; } /* quite nasty (because of the dependency on nneg_added & npos_added) */ if (astd->train_anvect && astd->train_apvect && astd->nneg_added && astd->npos_added) { astd->train_anvect[nloop] = astd->train_anvect[nloop-1] + astd->nneg_added[nloop]; astd->train_apvect[nloop] = astd->train_apvect[nloop-1] + astd->npos_added[nloop]; }
/* NOTE(review): the extract ends here mid-function — the remainder of
 * al_svm_guts() (and anything after it) is missing from this copy and must
 * be recovered from the original svm_al.c before this file will compile. */