svm_base.c
      grand_total ++;
    }
  }

  /* Calculate the total entropy */
  total_entropy = entropy (grand_totals[0], grand_totals[1]);

  sum = 0.0;

  /* the fc[...] are like the with_word totals */
  for (i=0; i<ndocs; i++) {
    if (yvect[i]) {
      int y = (yvect[i]+1)/2;
      for (j=0; j<docs[i]->num_entries; j++) {
        fc[y][docs[i]->entry[j].wi] ++;
      }
    }
  }

  for (i=0; i<num_words; i++) {
    with_word_total = fc[0][i] + fc[1][i];
    without_word_total = grand_total - with_word_total;
    with_word_entropy = entropy((float)fc[0][i], (float)fc[1][i]);
    without_word_entropy = entropy((float)(grand_totals[0] - fc[0][i]),
                                   (float)(grand_totals[1] - fc[1][i]));
    ret[i] = (float) (total_entropy
                      - (((double)with_word_total/(double)grand_total)*with_word_entropy)
                      - (((double)without_word_total/(double)grand_total)*without_word_entropy));
    assert (ret[i] >= -1e-7);
    sum += ret[i];
  }

  free(fc[0]);
  free(fc[1]);

  /* "normalize" in similar fashion to tfidf */
  for (i=0; i<num_words; i++) {
    /* Get the document vector for this word WI */
    ret[i] = num_words*ret[i]/sum;
  }

  bow_verbosify (bow_progress, "\n");

  return ret;
}

/* this sets the already transformed weights THEN does the normalizing... */
static void svm_set_barrel_weights(bow_wv **docs, int *yvect, int ndocs,
                                   float **weight_vect) {
  int i, j;

  /* the weights have yet to be set & since that's what we're using... */
  if (svm_kernel_type == FISHER) {
    svm_set_fisher_barrel_weights(docs, ndocs);
    return;
  } else if (weight_type == RAW) {
    for (i=0; i<ndocs; i++) {
      for (j=0; j<docs[i]->num_entries; j++) {
        docs[i]->entry[j].weight *= docs[i]->normalizer;
      }
    }
    return;
  } else if (weight_type == TFIDF) {
    *weight_vect = tfidf(docs, ndocs);
  } else if (weight_type == INFOGAIN) {
    *weight_vect = infogain(docs, yvect, ndocs);
  }

  /* Now loop through all the documents, setting their weights */
  for (i=0; i<ndocs; i++) {
    double sum = 0.0;
    for (j=0; j<docs[i]->num_entries; j++) {
      docs[i]->entry[j].weight *=
        docs[i]->normalizer * (*weight_vect)[docs[i]->entry[j].wi];
      sum += docs[i]->entry[j].weight;
    }
    if (sum > 0.0) {
      bow_wv_normalize_weights_by_summing(docs[i]);
      for (j=0; j<docs[i]->num_entries; j++) {
        docs[i]->entry[j].weight *= docs[i]->normalizer;
      }
    }
  }
}

/* similar to barrel weights above, but this only works on 1 wv at a time */
/* will set weights from an already transformed oweights vector (if it was
 * transformed), then normalize the weights */
static void svm_set_wv_weights(bow_wv *qwv, float *oweights, float *weight_vect) {
  double sum;
  int i;

  sum = 0.0;

  if (weight_type == TFIDF || weight_type == INFOGAIN) {
    if (tf_transform_type) {
      for (i=0; i<qwv->num_entries; i++) {
        qwv->entry[i].weight = weight_vect[qwv->entry[i].wi] * oweights[i];
        sum += qwv->entry[i].weight;
      }
    } else {
      for (i=0; i<qwv->num_entries; i++) {
        /* since no transform was used - just use the raw count */
        qwv->entry[i].weight = weight_vect[qwv->entry[i].wi]
          * ((float) qwv->entry[i].count);
        sum += qwv->entry[i].weight;
      }
    }
  } else {
    for (i=0; i<qwv->num_entries && sum == 0.0; i++) {
      sum += qwv->entry[i].weight;
    }
  }

  if (sum > 0.0) {
    bow_wv_normalize_weights_by_summing(qwv);
    for (i=0; i<qwv->num_entries; i++) {
      qwv->entry[i].weight *= qwv->normalizer;
    }
  }
}
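/* The infogain() weighting above is standard two-class information gain:
 * IG(w) = H(C) - P(w) H(C|w) - P(~w) H(C|~w).  What follows is a minimal,
 * self-contained sketch of that computation for a single word, using the same
 * counts infogain() accumulates.  It is illustrative only; binary_entropy()
 * and word_infogain() are NOT part of this file. */
#include <math.h>   /* for log2() */

static double binary_entropy (double a, double b)
{
  double n = a + b;
  double h = 0.0;

  if (a > 0.0) h -= (a / n) * log2 (a / n);
  if (b > 0.0) h -= (b / n) * log2 (b / n);

  return h;   /* entropy of a two-class count pair, in bits */
}

static double word_infogain (double pos_total, double neg_total,
                             double pos_with_w, double neg_with_w)
{
  double total     = pos_total + neg_total;
  double with_w    = pos_with_w + neg_with_w;
  double without_w = total - with_w;

  /* total entropy minus the weighted entropies of the documents that do and
   * do not contain the word; the same quantity infogain() stores in ret[i] */
  return binary_entropy (pos_total, neg_total)
    - (with_w    / total) * binary_entropy (pos_with_w, neg_with_w)
    - (without_w / total) * binary_entropy (pos_total - pos_with_w,
                                            neg_total - neg_with_w);
}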
/* the below comment is correct - but there are instances (& in some
 * cases a substantial proportion) where some data may create an
 * excellent starting point for the algorithms, even though so much has
 * changed --- therefore, this should be changed to be more intelligent */
/* since removing bound support vectors is hard
 * (since each bound support vector removed drastically
 * changes the constraints) I don't bother to do it
 * intuitively for each algorithm (that was tried &
 * performance did not improve (see above)) - this
 * function is nice because it's modular & independent
 * of any implementation. */
/* tvals is ignored, but the values filled in by the
 * algorithm are not changed. */
int svm_remove_bound_examples(bow_wv **docs, int *yvect, double *weights,
                              double *b, double **W, int ndocs, double *tvals,
                              float *cvect, int *nsv) {
  int nbound=0;
  int *tdocs;     /* trans table */
  float *sub_cvect;
  bow_wv **sub_docs;
  int sub_ndocs=0;
  int *sub_yvect;
  int i, j, x;

  sub_docs = (bow_wv **) alloca(sizeof(bow_wv *)*ndocs);
  sub_yvect = (int *) alloca(sizeof(int)*ndocs);
  tdocs = (int *) alloca(sizeof(int)*ndocs);
  sub_cvect = (float *) alloca(sizeof(float)*ndocs);

  if (svm_remove_misclassified == REMOVE_BOUND) {
    for (i=nbound=sub_ndocs=0; i<ndocs; i++) {
      if (weights[i] > cvect[i] - svm_epsilon_a) {
        nbound ++;
      } else {
        sub_docs[sub_ndocs] = docs[i];
        sub_yvect[sub_ndocs] = yvect[i];
        tdocs[sub_ndocs] = i;
        sub_ndocs++;
      }
    }
  } else if (svm_remove_misclassified == REMOVE_WRONG) {
    if (svm_kernel_type == 0) {
      for (i=nbound=sub_ndocs=0; i<ndocs; i++) {
        if (yvect[i]*evaluate_model_hyperplane(*W, *b, docs[i]) < 0.0) {
          nbound ++;
        } else {
          sub_docs[sub_ndocs] = docs[i];
          sub_yvect[sub_ndocs] = yvect[i];
          tdocs[sub_ndocs] = i;
          sub_ndocs++;
        }
      }
    } else {
      for (i=nbound=sub_ndocs=0; i<ndocs; i++) {
        if (yvect[i]*evaluate_model_cache(docs, weights, yvect, *b, docs[i], *nsv) < 0.0) {
          nbound ++;
        } else {
          sub_docs[sub_ndocs] = docs[i];
          sub_yvect[sub_ndocs] = yvect[i];
          tdocs[sub_ndocs] = i;
          sub_ndocs++;
        }
      }
    }
  }

  if (nbound) {
    fprintf(stderr, "Removing %d bound examples\n", nbound);
    fprintf(stdout, "Removing %d bound examples\n", nbound);
  } else {
    return 0;
  }

  /* probably not worthwhile to resize the arrays */

  /* "unbound" everything & set weights & tvals... */
  for (i=0; i<sub_ndocs; i++) {
    tvals[i] = 0.0;
    weights[i] = 0.0;
    sub_cvect[i] = MAXFLOAT;
  }

  *nsv = 0;

  if (svm_use_smo) {
    x = smo(sub_docs, sub_yvect, weights, b, W, sub_ndocs, tvals, sub_cvect, nsv);
  } else {
#ifdef HAVE_LOQO
    x = build_svm_guts(sub_docs, sub_yvect, weights, b, W, sub_ndocs, tvals,
                       sub_cvect, nsv);
#else
    bow_error("Must build rainbow with pr_loqo to use this solver!\n");
#endif
  }

  /* place the weights in the proper slots */
  for (i=ndocs-1, j=sub_ndocs-1; i>0; i--) {
    if (tdocs[j] == i) {
      weights[i] = weights[j];
      tvals[i] = tvals[j];
      j--;
    } else {
      weights[i] = 0.0;
      tvals[i] = 0.0;
    }
  }

  return x;
}
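/* Illustrative sketch only (this is not the library's evaluate_model_*
 * routine): the REMOVE_WRONG branch above drops an example (x, y) when
 * y * f(x) < 0.  For the linear kernel, f(x) is an explicit hyperplane,
 * f(x) = W.x + b, i.e. a sparse dot product over the document's
 * (word index, weight) pairs.  All names below are hypothetical. */
static double sketch_linear_decision (const double *W, double b,
                                      const int *word_idx,
                                      const float *word_weight,
                                      int num_entries)
{
  double f = b;
  int j;

  for (j = 0; j < num_entries; j++)
    f += W[word_idx[j]] * word_weight[j];   /* W . x over the nonzero entries */

  return f;   /* sign(f) is the prediction; y*f < 0 marks a misclassification */
}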
/* returns whether or not x has changed */
inline int solve_svm(bow_wv **docs, int *yvect, double *weights, double *ab,
                     double **W, int ndocs, double *tvals, float *cvect,
                     int *nsv) {
  int x;

  if (svm_use_smo) {
    x = smo(docs, yvect, weights, ab, W, ndocs, tvals, cvect, nsv);
  } else {
#ifdef HAVE_LOQO
    x = build_svm_guts(docs, yvect, weights, ab, W, ndocs, tvals, cvect, nsv);
#else
    bow_error("Must build rainbow with pr_loqo to use this solver!\n");
#endif
  }

  if (svm_remove_misclassified) {
    x |= svm_remove_bound_examples(docs, yvect, weights, ab, W, ndocs, tvals,
                                   cvect, nsv);
  }

  return x;
}

/* returns if the weights have changed */
int svm_trans_or_chunk(bow_wv **docs, int *yvect, int *trans_yvect,
                       double *weights, double *tvals, double *ab, double **W,
                       int ntrans, int ndocs, int *nsv) {
  if (ntrans) {
    return (transduce_svm(docs, yvect, trans_yvect, weights, tvals, ab, W,
                          ndocs, ntrans, nsv));
  } else {
    int i;
    float *cvect = (float *) alloca(sizeof(float)*ndocs);
    for (i=0; i<ndocs; i++) {
      cvect[i] = svm_C;
    }
    return (solve_svm(docs, yvect, weights, ab, W, ndocs, tvals, cvect, nsv));
  }
}
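/* Sketch for orientation; the names below are illustrative and not part of
 * this file.  cvect holds the per-example box constraint of the SVM dual, so
 * a solved weight (alpha) vector should satisfy 0 <= alpha_i <= C_i.  The
 * inductive path above uses C_i = svm_C for every example, while the
 * transductive path is expected to assign a different C* to the unlabeled
 * half; svm_remove_bound_examples() relies on the same bound when it tests
 * weights[i] > cvect[i] - svm_epsilon_a. */
static int sketch_check_box_constraints (const double *alpha, const float *cvect,
                                         int ndocs, double eps)
{
  int i;

  for (i = 0; i < ndocs; i++)
    if (alpha[i] < -eps || alpha[i] > (double) cvect[i] + eps)
      return 0;                 /* a dual variable escaped its box */

  return 1;                     /* all weights lie within [0, C_i] */
}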
/* cover for all the functions */
/* this function does a small amount of pre & post-processing for the
 * algorithm independent stuff (like randomly permuting everything &
 * outputting a hyperplane if possible) */
int tlf_svm(bow_wv **docs, int *yvect, double *weights, double *ab,
            bow_wv **W_wv, int ntrans, int ndocs) {
  int nlabeled;
  int misclass;
  int nsv;
  int *permute_table;
  double *tvals;
  double *W=NULL;
  int i, j;
  struct tms t1, t2;

  if (svm_random_seed) {
    srandom(svm_random_seed);
  } else {
    svm_random_seed = (int) time(NULL);
    srandom(svm_random_seed);
    printf("random seed to chop test/train split: %d\n", svm_random_seed);
    fprintf(stderr, "random seed to chop test/train split: %d\n", svm_random_seed);
  }

  permute_table = (int *) malloc(sizeof(int)*ndocs);

  nlabeled = ndocs - ntrans;

  /* permute each part, but don't munge them together, because the
   * solvers are going to expect all unlabeled data (the data with a
   * different C*) to be in the latter half */
  svm_permute_data(permute_table, docs, yvect, nlabeled);
  svm_permute_data(&(permute_table[nlabeled]), &(docs[nlabeled]),
                   &(yvect[nlabeled]), ntrans);

  /* let's try to reduce determinism... */
  srandom((int) time(NULL));

  times(&t1);
  if (do_active_learning) {
    if (test_in_train) {
      nsv = al_svm_test_wrapper(docs, yvect, weights, ab, &W, ntrans, ndocs,
                                (suppress_score_mat ? 0 : 1), al_pick_random,
                                permute_table);
    } else {
      nsv = al_svm(docs, yvect, weights, ab, &W, ntrans, ndocs, al_pick_random);
    }
  } else {
    /* initialize... */
    tvals = (double *) alloca(sizeof(double)*ndocs);
    nsv = 0;
    for (i=0; i<ndocs; i++) {
      weights[i] = 0.0;
      tvals[i] = 0.0;
    }

    svm_trans_or_chunk(docs, yvect, NULL, weights, tvals, ab, &W, ntrans,
                       ndocs, &nsv);
  }
  times(&t2);

  fprintf(stderr, "user: %d, system:%d, kernel_calls:%d\n",
          (int)(t2.tms_utime-t1.tms_utime), (int)(t2.tms_stime-t1.tms_stime),
          svm_nkc_calls);
  printf("user: %d, system:%d, kernel_calls:%d\n",
         (int)(t2.tms_utime-t1.tms_utime), (int)(t2.tms_stime-t1.tms_stime),
         svm_nkc_calls);

  /* unpermute data */
  svm_unpermute_data(permute_table, docs, yvect, nlabeled);
  svm_unpermute_data(&(permute_table[nlabeled]), &(docs[nlabeled]),
                     &(yvect[nlabeled]), ntrans);

  free(permute_table);

  if (svm_kernel_type == 0) {
    *W_wv = svm_darray_to_wv(W);
    free(W);
  }

  printf("support vectors: ");
  for (i=j=0; j<nsv; i++) {
    if (weights[i] > svm_epsilon_a) {
      printf("%d(%f) ", i, weights[i]);
      j++;
    }
  }

  misclass = 0;
  if (!svm_remove_misclassified) {
    for (i=misclass=0; i<nlabeled; i++) {
      if (weights[i] > svm_C-svm_epsilon_a) {
        misclass++;
      }
    }
    for (i=0; i<ntrans; i++) {
      if (weights[nlabeled+i] > svm_trans_cstar-svm_epsilon_a) {
        misclass++;
      }
    }
  }

  printf("\n%d support vectors (%d bounded)\n", nsv, misclass);

  return nsv;
}

bow_wv *svm_darray_to_wv(double *W) {
  bow_wv *W_wv;
  int num_words, i, j;

  num_words = bow_num_words();

  for (i=j=0; i<num_words; i++) {
    if (W[i] != 0.0)
      j++;
  }

  W_wv = bow_wv_new(j);

  for (i=j=0; j<W_wv->num_entries; i++) {
    if (W[i] != 0.0) {
      W_wv->entry[j].wi = i;
      W_wv->entry[j].count = 1; /* just so that an assertion doesn't throw up later */
      W_wv->entry[j].weight = W[i];
      j++;
    }
  }

  return (W_wv);
}

/* note - these 2 fn's are not MEANT to be inverses of each
 * other - they don't need to be & shouldn't be! */
/* given a 'focus' value, this transforms x into some int
 * this must be a BINARY function, outputting ONLY 1 & -1
 * because that's what the SVMs use for y. */
int map_class_to_y(int focus, int x) {
  if (focus == x) {
    return 1;
  } else {
    return (-1);
  }
}

/* each pass over these things takes up 2 labels... */
/* 1->1, -1->0 */
int map_y_to_class(int focus, int x) {
  return ((focus*2)+((x+1)/2));
}

/* helper to do whatever transform on a wv & then normalize it... */
static void tf_transform(bow_wv *doc) {
  int j;
  for (j=0; j<doc->num_entries; j++) {
    if (tf_transform_type == LOG) {
      doc->entry[j].weight = log2f((float) (doc->entry[j].count + 1));
    } else {
      doc->entry[j].weight = (float) doc->entry[j].count;
    }
  }
}

/* sets counts & the normalizer too */
/* pulls from the barrel those docs that satisfy dec_fn & turns them into a doc array */
int make_doc_array(bow_barrel *barrel, bow_wv **docs, int *tdocs,
                   int (*dec_fn)(bow_cdoc *)) {
  bow_dv_heap *heap;
  int ndocs;
  bow_wv *wv_tmp1;
  bow_wv *wv_tmp;
  int j;

  /* Create the Heap of vectors of all documents */
  heap = bow_make_dv_heap_from_wi2dvf(barrel->wi2dvf);

  for (ndocs=0; ; ndocs++) {
    int t = bow_heap_next_wv(heap, barrel, &wv_tmp1, dec_fn);
    if (t == -1) {
      break;
    } else {
      tdocs[ndocs] = t;
    }
    wv_tmp = bow_wv_new(wv_tmp1->num_entries);
    for (j=0; j<wv_tmp->num_entries; j++) {