/* svm_base.c */
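/* For reference, a sketch of the sparse word-vector types this file works
 * with.  Hedged: the real declarations live in libbow's headers (bow_wv,
 * bow_we) and may carry extra members or a different field order; this is
 * only a reading aid for the merge walks below.
 *
 *   typedef struct _bow_we {
 *     int wi;           -- word index into the vocabulary
 *     int count;        -- raw occurrence count
 *     float weight;     -- (re)weighted value, e.g. after TFIDF scaling
 *   } bow_we;
 *
 *   typedef struct _bow_wv {
 *     int num_entries;  -- number of bow_we entries that follow
 *     bow_we entry[];   -- entries, kept sorted by wi
 *   } bow_wv;
 */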
if (j == i) {
      i++;
      continue;
    }
    d = docs[j];
    docs[j] = docs[i];
    docs[i] = d;
    y = yvect[j];
    yvect[j] = yvect[i];
    yvect[i] = y;
    y = permute_table[j];
    permute_table[j] = permute_table[i];
    permute_table[i] = y;
  }
}

/* Right now, the vectors it looks at are the raw freq vectors */
double dprod(bow_wv *wv1, bow_wv *wv2) {
  double  sum;
  bow_we *v1, *v2;
  int     i1, i2;

  i1 = i2 = 0;
  sum = 0.0;
  v1 = wv1->entry;
  v2 = wv2->entry;

  /* merge-walk the two wi-sorted entry lists */
  while ((i1 < wv1->num_entries) && (i2 < wv2->num_entries)) {
    if (v1[i1].wi > v2[i2].wi) {
      i2++;
    } else if (v1[i1].wi < v2[i2].wi) {
      i1++;
    } else {
      sum += (v1[i1].weight) * (v2[i2].weight);
      i1++;
      i2++;
    }
  }
  return (sum);
}

/* Dot product between a sparse and a dense vector. */
double dprod_sd(bow_wv *wv, double *W) {
  double  sum;
  bow_we *v;
  int     i;

  i = 0;
  sum = 0.0;
  v = wv->entry;
  while (i < wv->num_entries) {
    sum += v[i].weight * W[v[i].wi];
    i++;
  }
  return (sum);
}

/* Squared Euclidean distance ||wv1 - wv2||^2 between two sparse vectors.
 * This is a whole different function just because the kernel is the
 * biggest bottleneck. */
double ddprod(bow_wv *wv1, bow_wv *wv2) {
  double  tmp;
  double  sum;
  bow_we *v1, *v2;
  int     i1, i2;

  i1 = i2 = 0;
  sum = 0.0;
  v1 = wv1->entry;
  v2 = wv2->entry;

  while ((i1 < wv1->num_entries) && (i2 < wv2->num_entries)) {
    if (v1[i1].wi > v2[i2].wi) {
      /* word only in wv2: (0 - w)^2 contributes w^2 */
      sum += v2[i2].weight * v2[i2].weight;
      i2++;
    } else if (v1[i1].wi < v2[i2].wi) {
      /* word only in wv1 */
      sum += v1[i1].weight * v1[i1].weight;
      i1++;
    } else {
      tmp = (v1[i1].weight) - (v2[i2].weight);
      sum += tmp * tmp;
      i1++;
      i2++;
    }
  }
  /* whichever vector has entries left over also contributes */
  for (; i1 < wv1->num_entries; i1++)
    sum += v1[i1].weight * v1[i1].weight;
  for (; i2 < wv2->num_entries; i2++)
    sum += v2[i2].weight * v2[i2].weight;
  return (sum);
}

/* End of command-line options specific to SVMs */

double kernel_poly(bow_wv *wv1, bow_wv *wv2) {
  return (pow(kparm.poly.lin_co * dprod(wv1, wv2) + kparm.poly.const_co,
              kparm.poly.degree));
}

double kernel_rbf(bow_wv *wv1, bow_wv *wv2) {
  return (exp(-1 * kparm.rbf.gamma * (ddprod(wv1, wv2))));
}

double kernel_sig(bow_wv *wv1, bow_wv *wv2) {
  return (tanh(kparm.sig.lin_co * dprod(wv1, wv2) + kparm.sig.const_co));
}

static int rlength;

typedef struct _kc_el {
  bow_wv      *i, *j;
  double       val;
  unsigned int age;
} kc_el;

static kc_el       *harray;
static unsigned int max_age;

void kcache_init(int nwide) {
  int i;

  max_age = 1;
  svm_nkc_calls = 0;
  rlength = nwide;
  if ((harray = (kc_el *) malloc(sizeof(kc_el) * cache_size)) == NULL) {
    cache_size = cache_size / 2;
    fprintf(stderr, "Could not allocate space for the kernel cache.\n"
            "Shrinking size to %d and trying again.\n", cache_size);
    kcache_init(nwide);
    return;
  }
  for (i = 0; i < cache_size; i++) {
    harray[i].i = (bow_wv *) ~0;   /* sentinel: slot unused */
    harray[i].age = 0;
  }
}

void kcache_clear() {
  free(harray);
}

void kcache_age() {
  max_age++;
}

#define NHASHES 3

static int sub_nkcc = 0;   /* this makes nkc_calls = actual calls / 100 */

double svm_kernel_cache(bow_wv *wv1, bow_wv *wv2) {
  int          h_index;
  int          k;
  unsigned int min_age, min_from;
  double       d;

  if (!((sub_nkcc++) % 100)) {
    svm_nkc_calls++;
  }

  min_age = ~((unsigned int) 0);
  min_from = 0;

  /* all of the kernels are symmetric */
  if (wv1 > wv2) {
    bow_wv *tmp;
    tmp = wv2;
    wv2 = wv1;
    wv1 = tmp;
  }

  /* probe NHASHES slots; hash the two pointers through unsigned long so
   * the arithmetic also survives 64-bit pointers */
  for (k = h_index = 0; k < NHASHES; k++) {
    h_index = ((((unsigned long) wv1) * rlength + ((unsigned long) wv2))
               + h_index + 19) % cache_size;
    if ((harray[h_index].i == wv1) && (harray[h_index].j == wv2)) {
      harray[h_index].age = max_age;
      return (harray[h_index].val);
    } else {
      if (harray[h_index].age > 0) {
        /* occupied: remember the oldest candidate for eviction */
        if (min_age > harray[h_index].age) {
          min_age = harray[h_index].age;
          min_from = h_index;
        }
        continue;
      } else {
        /* empty slot */
        min_from = h_index;
        break;
      }
    }
  }

  d = kernel(wv1, wv2);

  harray[min_from].i = wv1;
  harray[min_from].j = wv2;
  harray[min_from].val = d;
  harray[min_from].age = max_age;
  return (d);
}
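/* A minimal sketch of how the cache above is meant to be driven (hedged:
 * illustrative only, not part of the original file; the real callers, the
 * global `cache_size`, and the `kernel` function pointer are set up
 * elsewhere in the SVM code).  Kept behind an #ifdef so it never compiles
 * into the library. */
#ifdef KCACHE_USAGE_SKETCH
static double kcache_usage_sketch(bow_wv **docs, int ndocs) {
  double acc = 0.0;
  int    a, b;

  kcache_init(ndocs);           /* hash width follows the problem size */
  for (a = 0; a < ndocs; a++) {
    for (b = a; b < ndocs; b++) {
      /* computes K(a,b) once, then serves repeats from the cache */
      acc += svm_kernel_cache(docs[a], docs[b]);
    }
    kcache_age();               /* older entries become eviction victims */
  }
  kcache_clear();
  return acc;
}
#endif /* KCACHE_USAGE_SKETCH */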
/* don't add the evaluation (useful if the items are getting deleted
 * from a set) */
double svm_kernel_cache_lookup(bow_wv *wv1, bow_wv *wv2) {
  int h_index;
  int k;

  /* all of the kernels are symmetric */
  if (wv1 > wv2) {
    bow_wv *tmp;
    tmp = wv2;
    wv2 = wv1;
    wv1 = tmp;
  }
  for (k = h_index = 0; k < NHASHES; k++) {
    h_index = ((((unsigned long) wv1) * rlength + ((unsigned long) wv2))
               + h_index + 19) % cache_size;
    if ((harray[h_index].i == wv1) && (harray[h_index].j == wv2)) {
      return (harray[h_index].val);
    }
  }
  return (kernel(wv1, wv2));
}

/* random qsort helpers */
int di_cmp(const void *v1, const void *v2) {
  double d1, d2;

  d1 = ((struct di *) v1)->d;
  d2 = ((struct di *) v2)->d;
  if (d1 < d2) {
    return (-1);
  } else if (d1 > d2) {
    return (1);
  } else {
    return 0;
  }
}

int i_cmp(const void *v1, const void *v2) {
  int d1, d2;

  d1 = *((int *) v1);
  d2 = *((int *) v2);
  if (d1 < d2) {
    return (-1);
  } else if (d1 > d2) {
    return (1);
  } else {
    return 0;
  }
}

int d_cmp(const void *v1, const void *v2) {
  double d1, d2;

  d1 = *((double *) v1);
  d2 = *((double *) v2);
  if (d1 < d2) {
    return (-1);
  } else if (d1 > d2) {
    return (1);
  } else {
    return 0;
  }
}

/* sort bow_scores by descending weight, breaking ties by ascending di */
int s_cmp(const void *v1, const void *v2) {
  bow_score *s1, *s2;

  s1 = ((bow_score *) v1);
  s2 = ((bow_score *) v2);
  if (s1->weight < s2->weight) {
    return (1);
  } else if (s1->weight > s2->weight) {
    return (-1);
  } else {
    if (s1->di < s2->di) {
      return (-1);
    } else if (s1->di > s2->di) {
      return (1);
    } else {
      return 0;
    }
  }
}

/* useful alternative to qsort or radix sort */
/* stick the top n values (here: the n smallest .d's) in the first n
 * slots of arr, by partial selection sort */
void get_top_n(struct di *arr, int len, int n) {
  double mind, tmpd;
  int    minfrom, tmpi;
  int    i, j;

  if (len < n) {
    return;
  }
  for (i = 0; i < n && i < len; i++) {
    mind = arr[i].d;
    minfrom = i;
    for (j = i + 1; j < len; j++) {
      if (arr[j].d < mind) {
        mind = arr[j].d;
        minfrom = j;
      }
    }
    tmpi = arr[minfrom].i;
    tmpd = arr[minfrom].d;
    arr[minfrom].d = arr[i].d;
    arr[minfrom].i = arr[i].i;
    arr[i].d = tmpd;
    arr[i].i = tmpi;
  }
  return;
}
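/* A small worked example of get_top_n() (hedged: illustrative only, not
 * part of the original file; the values are made up).  After the call the
 * three smallest .d values sit in arr[0..2] in ascending order, and the
 * remaining slots hold the leftovers in no particular order. */
#ifdef GET_TOP_N_SKETCH
static void get_top_n_sketch(void) {
  struct di arr[5];
  double    vals[5] = { 0.9, 0.1, 0.5, 0.3, 0.7 };
  int       i;

  for (i = 0; i < 5; i++) {
    arr[i].d = vals[i];
    arr[i].i = i;    /* remember each value's original index */
  }
  get_top_n(arr, 5, 3);
  /* now: arr[0].d == 0.1, arr[1].d == 0.3, arr[2].d == 0.5,
   *      with arr[k].i giving where each came from (1, 3, 2) */
}
#endif /* GET_TOP_N_SKETCH */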
/* takes in docs, creates an idf vector & then weights the documents */
/* sets doc weights by using counts & normalizer */
static float *tfidf(bow_wv **docs, int ntrain) {
  float  idf_sum;       /* sum of all the idf values */
  int    max_wi;        /* the highest "word index" */
  float *new_idf_vect;
  int    i, j;

  bow_verbosify (bow_progress, "Setting weights over words: ");

  max_wi = bow_num_words();
  new_idf_vect = (float *) malloc(sizeof(float) * max_wi);
  for (i = 0; i < max_wi; i++) {
    new_idf_vect[i] = 0.0;
  }
  idf_sum = 0.0;

  /* First calculate document frequencies. */
  for (i = 0; i < ntrain; i++) {
    for (j = 0; j < docs[i]->num_entries; j++) {
      if (df_counts == bow_tfidf_occurrences) {
        /* Make DV be the number of documents in which word WI occurs
           at least once.  (We can't just set it to DV->LENGTH because
           we have to check to make sure each document is part of the
           model.) */
        new_idf_vect[docs[i]->entry[j].wi]++;
      } else if (df_counts == bow_tfidf_words) {
        /* Make DV be the total number of times word WI appears in any
           document. */
        new_idf_vect[docs[i]->entry[j].wi] += docs[i]->entry[j].count;
      } else {
        bow_error ("Bad TFIDF parameter df_counts.");
      }
    }
  }

  for (i = 0; i < max_wi; i++) {
    /* Set IDF from DF. */
    /* following what Thorsten allegedly does */
    if (new_idf_vect[i] >= 3.0) {
      if (df_transform == LOG)
        new_idf_vect[i] = log2f (ntrain / new_idf_vect[i]);
      else if (df_transform == SQRT)
        new_idf_vect[i] = sqrtf (ntrain / new_idf_vect[i]);
      else if (df_transform == RAW)
        new_idf_vect[i] = ntrain / new_idf_vect[i];
      else {
        new_idf_vect[i] = 0;   /* to avoid gcc warning */
        bow_error ("Bad TFIDF parameter df_transform.");
      }
      idf_sum += new_idf_vect[i];
    } else {
      /* words with document frequency below 3 get zero idf */
      new_idf_vect[i] = 0.0;
    }
  }

  /* "normalize" the idf values so they average to 1 */
  for (i = 0; i < max_wi; i++) {
    new_idf_vect[i] = max_wi * new_idf_vect[i] / idf_sum;
  }

  bow_verbosify (bow_progress, "\n");
  return new_idf_vect;
}

/* next 2 fn's stolen from info-gain.c */

/* Return the entropy given counts for each type of element. */
static double entropy(float e1, float e2) {
  double total = 0;     /* How many elements we have in total */
  double entropy = 0.0;
  double fraction;

  total = e1 + e2;

  /* If we have no elements, then the entropy is zero. */
  if (total == 0) {
    return 0.0;
  }

  entropy = 0.0;

  /* Now calculate the entropy */
  fraction = e1 / total;
  if (fraction != 0.0) {
    entropy = -1 * fraction * log2f (fraction);
  }
  fraction = e2 / total;
  if (fraction != 0.0) {
    entropy -= fraction * log2f (fraction);
  }
  return entropy;
}

/* Return a malloc()'ed array containing an information-gain score for
   each word index. */
float *infogain(bow_wv **docs, int *yvect, int ndocs) {
  int    grand_totals[2];       /* Totals for each class. */
  double total_entropy;         /* The entropy of the total collection. */
  double with_word_entropy;     /* The entropy of the set of docs with
                                   the word in question. */
  double without_word_entropy;  /* The entropy of the set of docs without
                                   the word in question. */
  float  grand_total = 0;
  float  with_word_total = 0;
  float  without_word_total = 0;
  int    i, j;
  float *ret;
  double sum;
  int   *fc[2];                 /* tallies for all the words in class 1 & 2 */
  int    num_words;

  bow_verbosify (bow_progress, "Calculating info gain... words :: ");

  num_words = bow_num_words();
  ret = bow_malloc (num_words * sizeof (float));
  fc[0] = (int *) malloc(num_words * sizeof(int));
  fc[1] = (int *) malloc(num_words * sizeof(int));
  memset(fc[0], 0, num_words * sizeof(int));
  memset(fc[1], 0, num_words * sizeof(int));

  /* First set all the arrays to zero */
  for (i = 0; i < 2; i++) {
    grand_totals[i] = 0;
  }

  /* Now set up the grand totals. */
  for (i = 0; i < ndocs; i++) {
    if (yvect[i]) {   /* if it is unlabeled, ignore it */
      /* yvect[i] is -1 or +1; map it to bucket 0 or 1 */
      grand_totals[(yvect[i] + 1) / 2]++;
      /* this is only done in case some type of occurrence cnt should
         ever happen */