/* svm_base.c */
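/* For reference, a sketch of the sparse word-vector types this file works
 * with.  Hedged: the real declarations live in libbow's headers (bow_wv,
 * bow_we) and may carry extra members or a different field order; this is
 * only a reading aid for the merge walks below.
 *
 *   typedef struct _bow_we {
 *     int wi;           -- word index into the vocabulary
 *     int count;        -- raw occurrence count
 *     float weight;     -- (re)weighted value, e.g. after TFIDF scaling
 *   } bow_we;
 *
 *   typedef struct _bow_wv {
 *     int num_entries;  -- number of bow_we entries that follow
 *     bow_we entry[];   -- entries, kept sorted by wi
 *   } bow_wv;
 */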
if (j == i) {
      i++;
      continue;
    }
    d = docs[j];
    docs[j] = docs[i];
    docs[i] = d;
    y = yvect[j];
    yvect[j] = yvect[i];
    yvect[i] = y;
    y = permute_table[j];
    permute_table[j] = permute_table[i];
    permute_table[i] = y;
  }
}

/* Right now, the vectors it looks at are the raw freq vectors */
double dprod(bow_wv *wv1, bow_wv *wv2) {
  double  sum;
  bow_we *v1, *v2;
  int     i1, i2;

  i1 = i2 = 0;
  sum = 0.0;
  v1 = wv1->entry;
  v2 = wv2->entry;

  /* merge-walk the two wi-sorted entry lists */
  while ((i1 < wv1->num_entries) && (i2 < wv2->num_entries)) {
    if (v1[i1].wi > v2[i2].wi) {
      i2++;
    } else if (v1[i1].wi < v2[i2].wi) {
      i1++;
    } else {
      sum += (v1[i1].weight) * (v2[i2].weight);
      i1++;
      i2++;
    }
  }
  return (sum);
}

/* Dot product between a sparse and a dense vector. */
double dprod_sd(bow_wv *wv, double *W) {
  double  sum;
  bow_we *v;
  int     i;

  i = 0;
  sum = 0.0;
  v = wv->entry;
  while (i < wv->num_entries) {
    sum += v[i].weight * W[v[i].wi];
    i++;
  }
  return (sum);
}

/* Squared Euclidean distance ||wv1 - wv2||^2 between two sparse vectors.
 * This is a whole different function just because the kernel is the
 * biggest bottleneck. */
double ddprod(bow_wv *wv1, bow_wv *wv2) {
  double  tmp;
  double  sum;
  bow_we *v1, *v2;
  int     i1, i2;

  i1 = i2 = 0;
  sum = 0.0;
  v1 = wv1->entry;
  v2 = wv2->entry;

  while ((i1 < wv1->num_entries) && (i2 < wv2->num_entries)) {
    if (v1[i1].wi > v2[i2].wi) {
      /* word only in wv2: (0 - w)^2 contributes w^2 */
      sum += v2[i2].weight * v2[i2].weight;
      i2++;
    } else if (v1[i1].wi < v2[i2].wi) {
      /* word only in wv1 */
      sum += v1[i1].weight * v1[i1].weight;
      i1++;
    } else {
      tmp = (v1[i1].weight) - (v2[i2].weight);
      sum += tmp * tmp;
      i1++;
      i2++;
    }
  }
  /* whichever vector has entries left over also contributes */
  for (; i1 < wv1->num_entries; i1++)
    sum += v1[i1].weight * v1[i1].weight;
  for (; i2 < wv2->num_entries; i2++)
    sum += v2[i2].weight * v2[i2].weight;
  return (sum);
}

/* End of command-line options specific to SVMs */

double kernel_poly(bow_wv *wv1, bow_wv *wv2) {
  return (pow(kparm.poly.lin_co * dprod(wv1, wv2) + kparm.poly.const_co,
              kparm.poly.degree));
}

double kernel_rbf(bow_wv *wv1, bow_wv *wv2) {
  return (exp(-1 * kparm.rbf.gamma * (ddprod(wv1, wv2))));
}

double kernel_sig(bow_wv *wv1, bow_wv *wv2) {
  return (tanh(kparm.sig.lin_co * dprod(wv1, wv2) + kparm.sig.const_co));
}

static int rlength;

typedef struct _kc_el {
  bow_wv      *i, *j;
  double       val;
  unsigned int age;
} kc_el;

static kc_el       *harray;
static unsigned int max_age;

void kcache_init(int nwide) {
  int i;

  max_age = 1;
  svm_nkc_calls = 0;
  rlength = nwide;
  if ((harray = (kc_el *) malloc(sizeof(kc_el) * cache_size)) == NULL) {
    cache_size = cache_size / 2;
    fprintf(stderr, "Could not allocate space for the kernel cache.\n"
            "Shrinking size to %d and trying again.\n", cache_size);
    kcache_init(nwide);
    return;
  }
  for (i = 0; i < cache_size; i++) {
    harray[i].i = (bow_wv *) ~0;   /* sentinel: slot unused */
    harray[i].age = 0;
  }
}

void kcache_clear() {
  free(harray);
}

void kcache_age() {
  max_age++;
}

#define NHASHES 3

static int sub_nkcc = 0;   /* this makes nkc_calls = actual calls / 100 */

double svm_kernel_cache(bow_wv *wv1, bow_wv *wv2) {
  int          h_index;
  int          k;
  unsigned int min_age, min_from;
  double       d;

  if (!((sub_nkcc++) % 100)) {
    svm_nkc_calls++;
  }

  min_age = ~((unsigned int) 0);
  min_from = 0;

  /* all of the kernels are symmetric */
  if (wv1 > wv2) {
    bow_wv *tmp;
    tmp = wv2;
    wv2 = wv1;
    wv1 = tmp;
  }

  /* probe NHASHES slots; hash the two pointers through unsigned long so
   * the arithmetic also survives 64-bit pointers */
  for (k = h_index = 0; k < NHASHES; k++) {
    h_index = ((((unsigned long) wv1) * rlength + ((unsigned long) wv2))
               + h_index + 19) % cache_size;
    if ((harray[h_index].i == wv1) && (harray[h_index].j == wv2)) {
      harray[h_index].age = max_age;
      return (harray[h_index].val);
    } else {
      if (harray[h_index].age > 0) {
        /* occupied: remember the oldest candidate for eviction */
        if (min_age > harray[h_index].age) {
          min_age = harray[h_index].age;
          min_from = h_index;
        }
        continue;
      } else {
        /* empty slot */
        min_from = h_index;
        break;
      }
    }
  }

  d = kernel(wv1, wv2);

  harray[min_from].i = wv1;
  harray[min_from].j = wv2;
  harray[min_from].val = d;
  harray[min_from].age = max_age;
  return (d);
}
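/* A minimal sketch of how the cache above is meant to be driven (hedged:
 * illustrative only, not part of the original file; the real callers, the
 * global `cache_size`, and the `kernel` function pointer are set up
 * elsewhere in the SVM code).  Kept behind an #ifdef so it never compiles
 * into the library. */
#ifdef KCACHE_USAGE_SKETCH
static double kcache_usage_sketch(bow_wv **docs, int ndocs) {
  double acc = 0.0;
  int    a, b;

  kcache_init(ndocs);           /* hash width follows the problem size */
  for (a = 0; a < ndocs; a++) {
    for (b = a; b < ndocs; b++) {
      /* computes K(a,b) once, then serves repeats from the cache */
      acc += svm_kernel_cache(docs[a], docs[b]);
    }
    kcache_age();               /* older entries become eviction victims */
  }
  kcache_clear();
  return acc;
}
#endif /* KCACHE_USAGE_SKETCH */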
/* don't add the evaluation (useful if the items are getting deleted
 * from a set) */
double svm_kernel_cache_lookup(bow_wv *wv1, bow_wv *wv2) {
  int h_index;
  int k;

  /* all of the kernels are symmetric */
  if (wv1 > wv2) {
    bow_wv *tmp;
    tmp = wv2;
    wv2 = wv1;
    wv1 = tmp;
  }
  for (k = h_index = 0; k < NHASHES; k++) {
    h_index = ((((unsigned long) wv1) * rlength + ((unsigned long) wv2))
               + h_index + 19) % cache_size;
    if ((harray[h_index].i == wv1) && (harray[h_index].j == wv2)) {
      return (harray[h_index].val);
    }
  }
  return (kernel(wv1, wv2));
}

/* random qsort helpers */
int di_cmp(const void *v1, const void *v2) {
  double d1, d2;

  d1 = ((struct di *) v1)->d;
  d2 = ((struct di *) v2)->d;
  if (d1 < d2) {
    return (-1);
  } else if (d1 > d2) {
    return (1);
  } else {
    return 0;
  }
}

int i_cmp(const void *v1, const void *v2) {
  int d1, d2;

  d1 = *((int *) v1);
  d2 = *((int *) v2);
  if (d1 < d2) {
    return (-1);
  } else if (d1 > d2) {
    return (1);
  } else {
    return 0;
  }
}

int d_cmp(const void *v1, const void *v2) {
  double d1, d2;

  d1 = *((double *) v1);
  d2 = *((double *) v2);
  if (d1 < d2) {
    return (-1);
  } else if (d1 > d2) {
    return (1);
  } else {
    return 0;
  }
}

/* sort bow_scores by descending weight, breaking ties by ascending di */
int s_cmp(const void *v1, const void *v2) {
  bow_score *s1, *s2;

  s1 = ((bow_score *) v1);
  s2 = ((bow_score *) v2);
  if (s1->weight < s2->weight) {
    return (1);
  } else if (s1->weight > s2->weight) {
    return (-1);
  } else {
    if (s1->di < s2->di) {
      return (-1);
    } else if (s1->di > s2->di) {
      return (1);
    } else {
      return 0;
    }
  }
}

/* useful alternative to qsort or radix sort */
/* stick the top n values (here: the n smallest .d's) in the first n
 * slots of arr, by partial selection sort */
void get_top_n(struct di *arr, int len, int n) {
  double mind, tmpd;
  int    minfrom, tmpi;
  int    i, j;

  if (len < n) {
    return;
  }
  for (i = 0; i < n && i < len; i++) {
    mind = arr[i].d;
    minfrom = i;
    for (j = i + 1; j < len; j++) {
      if (arr[j].d < mind) {
        mind = arr[j].d;
        minfrom = j;
      }
    }
    tmpi = arr[minfrom].i;
    tmpd = arr[minfrom].d;
    arr[minfrom].d = arr[i].d;
    arr[minfrom].i = arr[i].i;
    arr[i].d = tmpd;
    arr[i].i = tmpi;
  }
  return;
}
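/* A small worked example of get_top_n() (hedged: illustrative only, not
 * part of the original file; the values are made up).  After the call the
 * three smallest .d values sit in arr[0..2] in ascending order, and the
 * remaining slots hold the leftovers in no particular order. */
#ifdef GET_TOP_N_SKETCH
static void get_top_n_sketch(void) {
  struct di arr[5];
  double    vals[5] = { 0.9, 0.1, 0.5, 0.3, 0.7 };
  int       i;

  for (i = 0; i < 5; i++) {
    arr[i].d = vals[i];
    arr[i].i = i;    /* remember each value's original index */
  }
  get_top_n(arr, 5, 3);
  /* now: arr[0].d == 0.1, arr[1].d == 0.3, arr[2].d == 0.5,
   *      with arr[k].i giving where each came from (1, 3, 2) */
}
#endif /* GET_TOP_N_SKETCH */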
/* takes in docs, creates an idf vector & then weights the documents */
/* sets doc weights by using counts & normalizer */
static float *tfidf(bow_wv **docs, int ntrain) {
  float  idf_sum;       /* sum of all the idf values */
  int    max_wi;        /* the highest "word index" */
  float *new_idf_vect;
  int    i, j;

  bow_verbosify (bow_progress, "Setting weights over words: ");

  max_wi = bow_num_words();
  new_idf_vect = (float *) malloc(sizeof(float) * max_wi);
  for (i = 0; i < max_wi; i++) {
    new_idf_vect[i] = 0.0;
  }
  idf_sum = 0.0;

  /* First calculate document frequencies. */
  for (i = 0; i < ntrain; i++) {
    for (j = 0; j < docs[i]->num_entries; j++) {
      if (df_counts == bow_tfidf_occurrences) {
        /* Make DV be the number of documents in which word WI occurs
           at least once.  (We can't just set it to DV->LENGTH because
           we have to check to make sure each document is part of the
           model.) */
        new_idf_vect[docs[i]->entry[j].wi]++;
      } else if (df_counts == bow_tfidf_words) {
        /* Make DV be the total number of times word WI appears in any
           document. */
        new_idf_vect[docs[i]->entry[j].wi] += docs[i]->entry[j].count;
      } else {
        bow_error ("Bad TFIDF parameter df_counts.");
      }
    }
  }

  for (i = 0; i < max_wi; i++) {
    /* Set IDF from DF. */
    /* following what Thorsten allegedly does */
    if (new_idf_vect[i] >= 3.0) {
      if (df_transform == LOG)
        new_idf_vect[i] = log2f (ntrain / new_idf_vect[i]);
      else if (df_transform == SQRT)
        new_idf_vect[i] = sqrtf (ntrain / new_idf_vect[i]);
      else if (df_transform == RAW)
        new_idf_vect[i] = ntrain / new_idf_vect[i];
      else {
        new_idf_vect[i] = 0;   /* to avoid gcc warning */
        bow_error ("Bad TFIDF parameter df_transform.");
      }
      idf_sum += new_idf_vect[i];
    } else {
      /* words with document frequency below 3 get zero idf */
      new_idf_vect[i] = 0.0;
    }
  }

  /* "normalize" the idf values so they average to 1 */
  for (i = 0; i < max_wi; i++) {
    new_idf_vect[i] = max_wi * new_idf_vect[i] / idf_sum;
  }

  bow_verbosify (bow_progress, "\n");
  return new_idf_vect;
}

/* next 2 fn's stolen from info-gain.c */

/* Return the entropy given counts for each type of element. */
static double entropy(float e1, float e2) {
  double total = 0;     /* How many elements we have in total */
  double entropy = 0.0;
  double fraction;

  total = e1 + e2;

  /* If we have no elements, then the entropy is zero. */
  if (total == 0) {
    return 0.0;
  }

  entropy = 0.0;

  /* Now calculate the entropy */
  fraction = e1 / total;
  if (fraction != 0.0) {
    entropy = -1 * fraction * log2f (fraction);
  }
  fraction = e2 / total;
  if (fraction != 0.0) {
    entropy -= fraction * log2f (fraction);
  }
  return entropy;
}

/* Return a malloc()'ed array containing an information-gain score for
   each word index. */
float *infogain(bow_wv **docs, int *yvect, int ndocs) {
  int    grand_totals[2];       /* Totals for each class. */
  double total_entropy;         /* The entropy of the total collection. */
  double with_word_entropy;     /* The entropy of the set of docs with
                                   the word in question. */
  double without_word_entropy;  /* The entropy of the set of docs without
                                   the word in question. */
  float  grand_total = 0;
  float  with_word_total = 0;
  float  without_word_total = 0;
  int    i, j;
  float *ret;
  double sum;
  int   *fc[2];                 /* tallies for all the words in class 1 & 2 */
  int    num_words;

  bow_verbosify (bow_progress, "Calculating info gain... words :: ");

  num_words = bow_num_words();
  ret = bow_malloc (num_words * sizeof (float));
  fc[0] = (int *) malloc(num_words * sizeof(int));
  fc[1] = (int *) malloc(num_words * sizeof(int));
  memset(fc[0], 0, num_words * sizeof(int));
  memset(fc[1], 0, num_words * sizeof(int));

  /* First set all the arrays to zero */
  for (i = 0; i < 2; i++) {
    grand_totals[i] = 0;
  }

  /* Now set up the grand totals. */
  for (i = 0; i < ndocs; i++) {
    if (yvect[i]) {   /* if it is unlabeled, ignore it */
      /* yvect[i] is -1 or +1; map it to bucket 0 or 1 */
      grand_totals[(yvect[i] + 1) / 2]++;
      /* this is only done in case some type of occurrence cnt should
         ever happen */