📄 svm_fisher.c
/* Copyright (C) 1999 Greg Schohn - gcs@jprc.com */

/* ******************** svm_fisher.c *******************
 * An implementation of the naive Bayes Fisher kernel.
 * This is still very much a work in progress, ridden with
 * numerical precision problems. */

#include <bow/svm.h>
#include <assert.h>
#include <math.h>

static bow_barrel *rainbow_nb_barrel;
static double fisher_norm0;
static int total_num_words_occurences;
static double *dIi;  /* approximate diagonal inverse information matrix for classes */
static double *dIij; /* approximate diagonal inverse information matrix for class-words */

typedef struct _NPair {
  int N;
  int index;
} NPair;

/* Reset every document's weights to the raw term counts and tally the
 * total number of word occurrences across the corpus. */
void svm_set_fisher_barrel_weights(bow_wv **docs, int ndocs) {
  int i, j;
  total_num_words_occurences = 0;
  for (i = 0; i < ndocs; i++) {
    docs[i]->normalizer = 1.0;
    for (j = 0; j < docs[i]->num_entries; j++) {
      docs[i]->entry[j].weight = (float) docs[i]->entry[j].count;
      total_num_words_occurences += docs[i]->entry[j].count;
    }
  }
}

double svm_kernel_fisher(bow_wv *wv1, bow_wv *wv2) {
  bow_cdoc *cd;
  int max_entries;   /* max number of elements that can be in both */
  int nclasses;
  int nwords;        /* total word occurrences, for the P(w) estimate */
  int vocab_size;    /* vocabulary size; row stride of dIij */
  NPair *Nvector;
  double rval;
  double tmp;
  bow_we *v1, *v2;
  double t2, pci;
  int i, j, k;

  nwords = total_num_words_occurences;
  nclasses = bow_barrel_num_classes(rainbow_nb_barrel);
  vocab_size = bow_num_words();
  max_entries = MIN(wv1->num_entries, wv2->num_entries);

  Nvector = (NPair *) alloca(max_entries * sizeof(NPair));

  v1 = wv1->entry;
  v2 = wv2->entry;

  /* compute the N(wi,X1)*N(wi,X2) vector - a sorted-merge intersection
   * of the two sparse word vectors */
  for (i = j = k = 0; (i < wv1->num_entries) && (j < wv2->num_entries); ) {
    if (v1[i].wi > v2[j].wi) {
      j++;
    } else if (v1[i].wi < v2[j].wi) {
      i++;
    } else {
      Nvector[k].index = v1[i].wi;
      Nvector[k].N = (v1[i].count) * (v2[j].count);
      k++; i++; j++;
    }
  }
  max_entries = k;

  rval = 0.0;
  /* now we have all of the P(X*|C*) terms - in ascending order with
   * regard to class index */
  for (i = 0; i < nclasses; i++) {
    for (j = 0, tmp = 0; j < max_entries; j++) {
      t2 = bow_naivebayes_pr_wi_ci(rainbow_nb_barrel, Nvector[j].index, i,
                                   -1, 0, 0, NULL, NULL);
      t2 = t2 * t2;
      tmp += Nvector[j].N / (dIij[i*vocab_size + Nvector[j].index] * t2);
      assert(finite(tmp));
    }

    /* compute P(x[12]|ci)/P(x[12]) */
    {
      double p_w;
      bow_we *v;
      bow_wv *w;
      int k, h, n;

      t2 = fisher_norm0;
      for (w = wv1, n = 0; n < 2; n++, w = wv2) {
        v = w->entry;
        for (h = 0; h < w->num_entries; h++) {
          double sum, t;
          bow_dv *dv = bow_wi2dvf_dv(rainbow_nb_barrel->wi2dvf, v[h].wi);
          assert(dv);
          /* sum up the number of times this word appeared in all of the classes */
          for (k = 0, sum = 0.0; k < dv->length; k++) {
            sum += dv->entry[k].weight;
          }
          p_w = log(sum/nwords) * v[h].weight;
          t = (double) bow_naivebayes_pr_wi_ci(rainbow_nb_barrel, v[h].wi, i,
                                               -1, 0.0, 0.0, NULL, NULL);
          t = log(t) * v[h].weight;
          t2 += t - p_w;
          assert(finite(t2));
          /*printf("P(w%d|c%d)^%f, p_w^%f\n", v[h].wi, i, t, p_w);*/
        }
      }
    }

    cd = GET_CDOC_ARRAY_EL(rainbow_nb_barrel, i);
    pci = cd->prior;
    rval += exp(t2 + log(dIi[i] + (tmp*pci*pci)));
    assert(finite(rval));
  }
  /*rval = exp(rval);*/
  printf("kernel=%f\n", rval);
  return rval;
}
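/* For reference - this is my reading of the quantity the code above
 * approximates (following Jaakkola & Haussler's Fisher kernel; the
 * original author left no derivation here):
 *
 *   U_X = grad_theta log P(X|theta)
 *   K(X1,X2) = U_X1' I^-1 U_X2,  with  I ~ diag( sum_X U_X U_X' )
 *
 * For naive Bayes parameters theta_ij = P(w_j|c_i),
 *
 *   d log P(X) / d theta_ij = N(w_j,X) P(c_i|X) / theta_ij
 *
 * so dIij in svm_setup_fisher below accumulates the squares of these
 * per-document gradients (the diagonal information-matrix estimate),
 * and dIi does the same for the class mixing weights. */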
void svm_setup_fisher(bow_barrel *old_barrel, bow_wv **docs, int nclasses, int ndocs) {
  double *PXk, PX;
  int i, j, k;
  int num_words;
  rainbow_method *tmp = old_barrel->method;

  old_barrel->method = &bow_method_naivebayes;
  rainbow_nb_barrel = bow_barrel_new_vpc_merge_then_weight(old_barrel);
  old_barrel->method = tmp;

  /* set some global variables that naivebayes.c uses */
  naivebayes_score_returns_doc_pr = 1;
  naivebayes_score_unsorted = 1;

  fprintf(stderr, "Finding maximum kernel value for normalizing\n");

  num_words = bow_num_words();
  i = num_words * nclasses;
  dIi = (double *) malloc(sizeof(double) * nclasses);
  PXk = (double *) malloc(sizeof(double) * nclasses);
  dIij = (double *) malloc(sizeof(double) * i);

  for (j = 0; j < i; j++) {
    dIij[j] = 0.0;
  }
  for (j = 0; j < nclasses; j++) {
    dIi[j] = 0.0;
  }

  for (i = 0; i < ndocs; i++) {
    double max_lpr;
    bow_score *scores = malloc(sizeof(bow_score) * nclasses);

    /* compute the P(X|class) * P(class) terms (since they're used so often) */
    PX = 0.0;
    /* NOTE: with the ...returns_doc_pr variable set, the scores are not
     * probabilities, but instead log probabilities */
    bow_naivebayes_score(rainbow_nb_barrel, docs[i], scores, nclasses, -1);

    max_lpr = scores[0].weight;
    for (k = 1; k < nclasses; k++) {
      if (scores[k].weight > max_lpr)
        max_lpr = scores[k].weight;
    }

    for (k = 0; k < nclasses; k++) {
      bow_cdoc *cd = GET_CDOC_ARRAY_EL(rainbow_nb_barrel, k);
      /* subtracting the max lpr inside the exp is the same as multiplying
       * both the denominator & the numerator by some large constant */
      PXk[k] = cd->prior * exp(scores[k].weight - max_lpr);
      /* hacky-hacky-hacky-smoothing */
#define THRESH 1e-1
      if (PXk[k] < THRESH) {
        PXk[k] = THRESH;
        printf("underflow on P(X%d|C%d) - setting to small val\n", i, k);
        fflush(stdout);
      }
      PX += PXk[k];
      assert(finite(PXk[k]) && PXk[k] != 0.0);
    }
    free(scores);

    /* compute term for Iij - d/d-theta_ij * log(P(X|theta)) */
    for (j = 0; j < docs[i]->num_entries; j++) {
      for (k = 0; k < nclasses; k++) {
        double tmp;
        tmp = bow_naivebayes_pr_wi_ci(rainbow_nb_barrel, docs[i]->entry[j].wi,
                                      k, -1, 0.0, 0.0, NULL, NULL);
        dIij[k*num_words + docs[i]->entry[j].wi] +=
          ((((docs[i]->entry[j].count * PXk[k]) / tmp) / PX) *
           (((docs[i]->entry[j].count * PXk[k]) / tmp) / PX));
      }
    }

    /* compute term for Ii - d/d-theta_i * log(P(X|theta)) */
    for (k = 0; k < nclasses; k++) {
      /* M is in both of these terms, so we don't need to worry about it... */
      dIi[k] += (PXk[k] / PX) * (PXk[k] / PX);
      assert(finite(dIi[k]));
    }

    if (!(i % 100)) {
      fprintf(stderr, "\r%f%%", (float) (((double) i) / ((double) ndocs) * 100.0));
    }
  }
  free(PXk);

  /* now invert the per-class values */
  for (i = 0; i < nclasses; i++) {
    dIi[i] = 1 / dIi[i];
  }
  fprintf(stderr, "%f%%\n", (float) 100.0);

  /* set "fisher normalizer" */
  {
    double max = -1;
    int from = 0;
    fisher_norm0 = 0; /* keep in mind, this is log(scalar) */
    for (i = 0; i < ndocs; i++) {
      double tmp = svm_kernel_fisher(docs[i], docs[i]);
      if (max < tmp) {
        max = tmp;
        from = i;
      }
    }
    fisher_norm0 = log(1/max);
  }
}
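/* A minimal driver sketch - hypothetical, not part of libbow - showing
 * the calling order these entry points expect. Weights must be reset to
 * raw counts *before* setup, because svm_setup_fisher ends by calling
 * svm_kernel_fisher on every training document to find the normalizer.
 * `barrel` and `docs` are assumed to have been built elsewhere with the
 * usual libbow machinery. */
double fisher_gram_entry(bow_barrel *barrel, bow_wv **docs,
                         int nclasses, int ndocs, int a, int b) {
  svm_set_fisher_barrel_weights(docs, ndocs);      /* weight := count; tally totals */
  svm_setup_fisher(barrel, docs, nclasses, ndocs); /* NB barrel, dIi/dIij, normalizer */
  return svm_kernel_fisher(docs[a], docs[b]);      /* one Gram-matrix entry */
}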