svm_base.c
      grand_total ++;
    }
  }

  /* Calculate the total entropy */
  total_entropy = entropy (grand_totals[0], grand_totals[1]);

  sum = 0.0;

  /* the fc[...] are like the with_word totals */
  for (i=0; i<ndocs; i++) {
    if (yvect[i]) {
      int y = (yvect[i]+1)/2;
      for (j=0; j<docs[i]->num_entries; j++) {
        fc[y][docs[i]->entry[j].wi] ++;
      }
    }
  }

  for (i=0; i<num_words; i++) {
    with_word_total = fc[0][i] + fc[1][i];
    without_word_total = grand_total - with_word_total;
    with_word_entropy = entropy((float)fc[0][i], (float)fc[1][i]);
    without_word_entropy = entropy((float)(grand_totals[0] - fc[0][i]),
                                   (float)(grand_totals[1] - fc[1][i]));
    ret[i] = (float) (total_entropy
                      - (((double)with_word_total/(double)grand_total)*with_word_entropy)
                      - (((double)without_word_total/(double)grand_total)*without_word_entropy));
    assert (ret[i] >= -1e-7);
    sum += ret[i];
  }

  free(fc[0]);
  free(fc[1]);

  /* "normalize" in similar fashion to tfidf */
  for (i=0; i<num_words; i++) {
    /* Get the document vector for this word WI */
    ret[i] = num_words*ret[i]/sum;
  }

  bow_verbosify (bow_progress, "\n");

  return ret;
}

/* this sets the already transformed weights THEN does the normalizing... */
static void svm_set_barrel_weights(bow_wv **docs, int *yvect, int ndocs,
                                   float **weight_vect) {
  int i, j;

  /* the weights have yet to be set & since that's what we're using... */
  if (svm_kernel_type == FISHER) {
    svm_set_fisher_barrel_weights(docs, ndocs);
    return;
  } else if (weight_type == RAW) {
    for (i=0; i<ndocs; i++) {
      for (j=0; j<docs[i]->num_entries; j++) {
        docs[i]->entry[j].weight *= docs[i]->normalizer;
      }
    }
    return;
  } else if (weight_type == TFIDF) {
    *weight_vect = tfidf(docs, ndocs);
  } else if (weight_type == INFOGAIN) {
    *weight_vect = infogain(docs, yvect, ndocs);
  }

  /* Now loop through all the documents, setting their weights */
  for (i=0; i<ndocs; i++) {
    double sum = 0.0;
    for (j=0; j<docs[i]->num_entries; j++) {
      docs[i]->entry[j].weight *=
        docs[i]->normalizer * (*weight_vect)[docs[i]->entry[j].wi];
      sum += docs[i]->entry[j].weight;
    }
    if (sum > 0.0) {
      bow_wv_normalize_weights_by_summing(docs[i]);
      for (j=0; j<docs[i]->num_entries; j++) {
        docs[i]->entry[j].weight *= docs[i]->normalizer;
      }
    }
  }
}

/* similar to barrel weights above, but this only works on 1 wv at a time */
/* will set weights from an already transformed oweights vector (if it was
 * transformed), then normalize the weights */
static void svm_set_wv_weights(bow_wv *qwv, float *oweights, float *weight_vect) {
  double sum;
  int i;

  sum = 0.0;

  if (weight_type == TFIDF || weight_type == INFOGAIN) {
    if (tf_transform_type) {
      for (i=0; i<qwv->num_entries; i++) {
        qwv->entry[i].weight = weight_vect[qwv->entry[i].wi] * oweights[i];
        sum += qwv->entry[i].weight;
      }
    } else {
      for (i=0; i<qwv->num_entries; i++) {
        /* since no transform was used - just use the raw count */
        qwv->entry[i].weight = weight_vect[qwv->entry[i].wi]
          * ((float) qwv->entry[i].count);
        sum += qwv->entry[i].weight;
      }
    }
  } else {
    for (i=0; i<qwv->num_entries && sum == 0.0; i++) {
      sum += qwv->entry[i].weight;
    }
  }

  if (sum > 0.0) {
    bow_wv_normalize_weights_by_summing(qwv);
    for (i=0; i<qwv->num_entries; i++) {
      qwv->entry[i].weight *= qwv->normalizer;
    }
  }
}
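/* The infogain() weighting above is standard two-class information gain:
 * IG(w) = H(C) - P(w) H(C|w) - P(~w) H(C|~w).  What follows is a minimal,
 * self-contained sketch of that computation for a single word, using the same
 * counts infogain() accumulates.  It is illustrative only; binary_entropy()
 * and word_infogain() are NOT part of this file. */
#include <math.h>   /* for log2() */

static double binary_entropy (double a, double b)
{
  double n = a + b;
  double h = 0.0;

  if (a > 0.0) h -= (a / n) * log2 (a / n);
  if (b > 0.0) h -= (b / n) * log2 (b / n);

  return h;   /* entropy of a two-class count pair, in bits */
}

static double word_infogain (double pos_total, double neg_total,
                             double pos_with_w, double neg_with_w)
{
  double total     = pos_total + neg_total;
  double with_w    = pos_with_w + neg_with_w;
  double without_w = total - with_w;

  /* total entropy minus the weighted entropies of the documents that do and
   * do not contain the word; the same quantity infogain() stores in ret[i] */
  return binary_entropy (pos_total, neg_total)
    - (with_w    / total) * binary_entropy (pos_with_w, neg_with_w)
    - (without_w / total) * binary_entropy (pos_total - pos_with_w,
                                            neg_total - neg_with_w);
}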
/* the below comment is correct - but there are instances (& in some
 * cases a substantial proportion) where some data may create an
 * excellent starting point for the algorithms, even though so much has
 * changed --- therefore, this should be changed to be more intelligent */
/* since removing bound support vectors is hard
 * (since each bound support vector removed drastically
 * changes the constraints) I don't bother to do it
 * intuitively for each algorithm (that was tried &
 * performance did not improve (see above)) - this
 * function is nice because it's modular & independent
 * of any implementation. */
/* tvals is ignored, but the values filled in by the
 * algorithm are not changed. */
int svm_remove_bound_examples(bow_wv **docs, int *yvect, double *weights,
                              double *b, double **W, int ndocs, double *tvals,
                              float *cvect, int *nsv) {
  int nbound=0;
  int *tdocs;     /* trans table */
  float *sub_cvect;
  bow_wv **sub_docs;
  int sub_ndocs=0;
  int *sub_yvect;
  int i, j, x;

  sub_docs = (bow_wv **) alloca(sizeof(bow_wv *)*ndocs);
  sub_yvect = (int *) alloca(sizeof(int)*ndocs);
  tdocs = (int *) alloca(sizeof(int)*ndocs);
  sub_cvect = (float *) alloca(sizeof(float)*ndocs);

  if (svm_remove_misclassified == REMOVE_BOUND) {
    for (i=nbound=sub_ndocs=0; i<ndocs; i++) {
      if (weights[i] > cvect[i] - svm_epsilon_a) {
        nbound ++;
      } else {
        sub_docs[sub_ndocs] = docs[i];
        sub_yvect[sub_ndocs] = yvect[i];
        tdocs[sub_ndocs] = i;
        sub_ndocs++;
      }
    }
  } else if (svm_remove_misclassified == REMOVE_WRONG) {
    if (svm_kernel_type == 0) {
      for (i=nbound=sub_ndocs=0; i<ndocs; i++) {
        if (yvect[i]*evaluate_model_hyperplane(*W, *b, docs[i]) < 0.0) {
          nbound ++;
        } else {
          sub_docs[sub_ndocs] = docs[i];
          sub_yvect[sub_ndocs] = yvect[i];
          tdocs[sub_ndocs] = i;
          sub_ndocs++;
        }
      }
    } else {
      for (i=nbound=sub_ndocs=0; i<ndocs; i++) {
        if (yvect[i]*evaluate_model_cache(docs, weights, yvect, *b, docs[i], *nsv) < 0.0) {
          nbound ++;
        } else {
          sub_docs[sub_ndocs] = docs[i];
          sub_yvect[sub_ndocs] = yvect[i];
          tdocs[sub_ndocs] = i;
          sub_ndocs++;
        }
      }
    }
  }

  if (nbound) {
    fprintf(stderr, "Removing %d bound examples\n", nbound);
    fprintf(stdout, "Removing %d bound examples\n", nbound);
  } else {
    return 0;
  }

  /* probably not worthwhile to resize the arrays */

  /* "unbound" everything & set weights & tvals... */
  for (i=0; i<sub_ndocs; i++) {
    tvals[i] = 0.0;
    weights[i] = 0.0;
    sub_cvect[i] = MAXFLOAT;
  }

  *nsv = 0;

  if (svm_use_smo) {
    x = smo(sub_docs, sub_yvect, weights, b, W, sub_ndocs, tvals, sub_cvect, nsv);
  } else {
#ifdef HAVE_LOQO
    x = build_svm_guts(sub_docs, sub_yvect, weights, b, W, sub_ndocs, tvals,
                       sub_cvect, nsv);
#else
    bow_error("Must build rainbow with pr_loqo to use this solver!\n");
#endif
  }

  /* place the weights in the proper slots */
  for (i=ndocs-1, j=sub_ndocs-1; i>0; i--) {
    if (tdocs[j] == i) {
      weights[i] = weights[j];
      tvals[i] = tvals[j];
      j--;
    } else {
      weights[i] = 0.0;
      tvals[i] = 0.0;
    }
  }

  return x;
}
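/* Illustrative sketch only (this is not the library's evaluate_model_*
 * routine): the REMOVE_WRONG branch above drops an example (x, y) when
 * y * f(x) < 0.  For the linear kernel, f(x) is an explicit hyperplane,
 * f(x) = W.x + b, i.e. a sparse dot product over the document's
 * (word index, weight) pairs.  All names below are hypothetical. */
static double sketch_linear_decision (const double *W, double b,
                                      const int *word_idx,
                                      const float *word_weight,
                                      int num_entries)
{
  double f = b;
  int j;

  for (j = 0; j < num_entries; j++)
    f += W[word_idx[j]] * word_weight[j];   /* W . x over the nonzero entries */

  return f;   /* sign(f) is the prediction; y*f < 0 marks a misclassification */
}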
/* returns whether or not x has changed */
inline int solve_svm(bow_wv **docs, int *yvect, double *weights, double *ab,
                     double **W, int ndocs, double *tvals, float *cvect,
                     int *nsv) {
  int x;

  if (svm_use_smo) {
    x = smo(docs, yvect, weights, ab, W, ndocs, tvals, cvect, nsv);
  } else {
#ifdef HAVE_LOQO
    x = build_svm_guts(docs, yvect, weights, ab, W, ndocs, tvals, cvect, nsv);
#else
    bow_error("Must build rainbow with pr_loqo to use this solver!\n");
#endif
  }

  if (svm_remove_misclassified) {
    x |= svm_remove_bound_examples(docs, yvect, weights, ab, W, ndocs, tvals,
                                   cvect, nsv);
  }

  return x;
}

/* returns if the weights have changed */
int svm_trans_or_chunk(bow_wv **docs, int *yvect, int *trans_yvect,
                       double *weights, double *tvals, double *ab, double **W,
                       int ntrans, int ndocs, int *nsv) {
  if (ntrans) {
    return (transduce_svm(docs, yvect, trans_yvect, weights, tvals, ab, W,
                          ndocs, ntrans, nsv));
  } else {
    int i;
    float *cvect = (float *) alloca(sizeof(float)*ndocs);
    for (i=0; i<ndocs; i++) {
      cvect[i] = svm_C;
    }
    return (solve_svm(docs, yvect, weights, ab, W, ndocs, tvals, cvect, nsv));
  }
}
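/* Sketch for orientation; the names below are illustrative and not part of
 * this file.  cvect holds the per-example box constraint of the SVM dual, so
 * a solved weight (alpha) vector should satisfy 0 <= alpha_i <= C_i.  The
 * inductive path above uses C_i = svm_C for every example, while the
 * transductive path is expected to assign a different C* to the unlabeled
 * half; svm_remove_bound_examples() relies on the same bound when it tests
 * weights[i] > cvect[i] - svm_epsilon_a. */
static int sketch_check_box_constraints (const double *alpha, const float *cvect,
                                         int ndocs, double eps)
{
  int i;

  for (i = 0; i < ndocs; i++)
    if (alpha[i] < -eps || alpha[i] > (double) cvect[i] + eps)
      return 0;                 /* a dual variable escaped its box */

  return 1;                     /* all weights lie within [0, C_i] */
}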
/* cover for all the functions */
/* this function does a small amount of pre & post-processing for the
 * algorithm independent stuff (like randomly permuting everything &
 * outputting a hyperplane if possible) */
int tlf_svm(bow_wv **docs, int *yvect, double *weights, double *ab,
            bow_wv **W_wv, int ntrans, int ndocs) {
  int nlabeled;
  int misclass;
  int nsv;
  int *permute_table;
  double *tvals;
  double *W=NULL;
  int i, j;
  struct tms t1, t2;

  if (svm_random_seed) {
    srandom(svm_random_seed);
  } else {
    svm_random_seed = (int) time(NULL);
    srandom(svm_random_seed);
    printf("random seed to chop test/train split: %d\n", svm_random_seed);
    fprintf(stderr, "random seed to chop test/train split: %d\n", svm_random_seed);
  }

  permute_table = (int *) malloc(sizeof(int)*ndocs);

  nlabeled = ndocs - ntrans;

  /* permute each part, but don't munge them together, because the
   * solvers are going to expect all unlabeled data (the data with a
   * different C*) to be in the latter half */
  svm_permute_data(permute_table, docs, yvect, nlabeled);
  svm_permute_data(&(permute_table[nlabeled]), &(docs[nlabeled]),
                   &(yvect[nlabeled]), ntrans);

  /* let's try to reduce determinism... */
  srandom((int) time(NULL));

  times(&t1);
  if (do_active_learning) {
    if (test_in_train) {
      nsv = al_svm_test_wrapper(docs, yvect, weights, ab, &W, ntrans, ndocs,
                                (suppress_score_mat ? 0 : 1), al_pick_random,
                                permute_table);
    } else {
      nsv = al_svm(docs, yvect, weights, ab, &W, ntrans, ndocs, al_pick_random);
    }
  } else {
    /* initialize... */
    tvals = (double *) alloca(sizeof(double)*ndocs);
    nsv = 0;
    for (i=0; i<ndocs; i++) {
      weights[i] = 0.0;
      tvals[i] = 0.0;
    }

    svm_trans_or_chunk(docs, yvect, NULL, weights, tvals, ab, &W, ntrans,
                       ndocs, &nsv);
  }
  times(&t2);

  fprintf(stderr, "user: %d, system:%d, kernel_calls:%d\n",
          (int)(t2.tms_utime-t1.tms_utime), (int)(t2.tms_stime-t1.tms_stime),
          svm_nkc_calls);
  printf("user: %d, system:%d, kernel_calls:%d\n",
         (int)(t2.tms_utime-t1.tms_utime), (int)(t2.tms_stime-t1.tms_stime),
         svm_nkc_calls);

  /* unpermute data */
  svm_unpermute_data(permute_table, docs, yvect, nlabeled);
  svm_unpermute_data(&(permute_table[nlabeled]), &(docs[nlabeled]),
                     &(yvect[nlabeled]), ntrans);

  free(permute_table);

  if (svm_kernel_type == 0) {
    *W_wv = svm_darray_to_wv(W);
    free(W);
  }

  printf("support vectors: ");
  for (i=j=0; j<nsv; i++) {
    if (weights[i] > svm_epsilon_a) {
      printf("%d(%f) ", i, weights[i]);
      j++;
    }
  }

  misclass = 0;
  if (!svm_remove_misclassified) {
    for (i=misclass=0; i<nlabeled; i++) {
      if (weights[i] > svm_C-svm_epsilon_a) {
        misclass++;
      }
    }
    for (i=0; i<ntrans; i++) {
      if (weights[nlabeled+i] > svm_trans_cstar-svm_epsilon_a) {
        misclass++;
      }
    }
  }

  printf("\n%d support vectors (%d bounded)\n", nsv, misclass);

  return nsv;
}

bow_wv *svm_darray_to_wv(double *W) {
  bow_wv *W_wv;
  int num_words, i, j;

  num_words = bow_num_words();

  for (i=j=0; i<num_words; i++) {
    if (W[i] != 0.0)
      j++;
  }

  W_wv = bow_wv_new(j);

  for (i=j=0; j<W_wv->num_entries; i++) {
    if (W[i] != 0.0) {
      W_wv->entry[j].wi = i;
      W_wv->entry[j].count = 1; /* just so that an assertion doesn't throw up later */
      W_wv->entry[j].weight = W[i];
      j++;
    }
  }

  return (W_wv);
}

/* note - these 2 fn's are not MEANT to be inverses of each
 * other - they don't need to be & shouldn't be! */
/* given a 'focus' value, this transforms x into some int
 * this must be a BINARY function, outputting ONLY 1 & -1
 * because that's what the SVMs use for y. */
int map_class_to_y(int focus, int x) {
  if (focus == x) {
    return 1;
  } else {
    return (-1);
  }
}

/* each pass over these things takes up 2 labels... */
/* 1->1, -1->0 */
int map_y_to_class(int focus, int x) {
  return ((focus*2)+((x+1)/2));
}

/* helper to do whatever transform on a wv & then normalize it... */
static void tf_transform(bow_wv *doc) {
  int j;
  for (j=0; j<doc->num_entries; j++) {
    if (tf_transform_type == LOG) {
      doc->entry[j].weight = log2f((float) (doc->entry[j].count + 1));
    } else {
      doc->entry[j].weight = (float) doc->entry[j].count;
    }
  }
}

/* sets counts & the normalizer too */
/* pulls from the barrel those docs that satisfy dec_fn & turns them into a doc array */
int make_doc_array(bow_barrel *barrel, bow_wv **docs, int *tdocs,
                   int (*dec_fn)(bow_cdoc *)) {
  bow_dv_heap *heap;
  int ndocs;
  bow_wv *wv_tmp1;
  bow_wv *wv_tmp;
  int j;

  /* Create the Heap of vectors of all documents */
  heap = bow_make_dv_heap_from_wi2dvf(barrel->wi2dvf);

  for (ndocs=0; ; ndocs++) {
    int t = bow_heap_next_wv(heap, barrel, &wv_tmp1, dec_fn);
    if (t == -1) {
      break;
    } else {
      tdocs[ndocs] = t;
    }
    wv_tmp = bow_wv_new(wv_tmp1->num_entries);
    for (j=0; j<wv_tmp->num_entries; j++) {