/* svm_al.c */
/* Copyright (C) 1999 Greg Schohn - gcs@jprc.com */
/* ******************** svm_al.c *******************
 * Active learning add-ons for SVM's. */

#include <bow/svm.h>

/* Number of feature-saturation thresholds tracked (size of dim_map's table). */
#define NDIM_INSPECTED 14

/* Map a threshold index (0..NDIM_INSPECTED-1) onto the saturation cutoff it
 * represents.  Out-of-range indices are clamped to the table boundaries so a
 * caller bug cannot read past the array (previously undefined behavior). */
int dim_map(int i) {
  static int map[NDIM_INSPECTED] = {1,2,3,4,6,8,12,16,24,32,48,64,128,256};
  if (i < 0) {
    i = 0;
  } else if (i >= NDIM_INSPECTED) {
    i = NDIM_INSPECTED - 1;
  }
  return (map[i]);
}

/* Compute the precision/recall breakeven point obtainable by shifting the
 * value of b (the decision threshold).
 *
 *   test_evals - raw model output for each test document
 *   test_yvect - true label (+1/-1) for each test document
 *   n          - number of test documents
 *   total_pos  - number of positive documents among the n
 *
 * Returns the largest min(precision, recall) over all thresholds, or -1.0
 * when n == 0.  Aborts via bow_error() if scratch space cannot be
 * allocated (the original code wrote through an unchecked malloc). */
double prec_recall_breakeven(double *test_evals, int *test_yvect, int n, int total_pos) {
  struct di *ey;
  double max;
  int npos;
  int i;

  ey = (struct di *) malloc(sizeof(struct di)*n);
  if (!ey) {
    bow_error("prec_recall_breakeven: could not allocate scratch array");
  }
  for (i=0; i<n; i++) {
    /* -1 is to force the sort the way i want it (highest score first) */
    ey[i].d = -1*test_evals[i];
    ey[i].i = test_yvect[i];
  }
  qsort(ey, n, sizeof(struct di), di_cmp);

  max = -1.0;
  for (i=npos=0; i<n; i++) {
    double min;
    if (ey[i].i > 0) {
      npos ++;
    }
    /* precision = npos/(i+1); recall = npos/total_pos */
    min = MIN(((double)npos)/(i+1), ((double)npos)/total_pos);
    if (min > max) {
      max = min;
    }
  }
  free(ey);
  return max;
}

/* Per-iteration accounting collected while active learning runs.
 * Any pointer left NULL disables the corresponding statistic. */
struct al_test_data {
  int ntest;                  /* number of test documents */
  int ndim_sat;               /* number of saturation thresholds tracked */
  int *docs_added;            /* original (pre-permutation) index of each queried doc */
  int *test_yvect, *apvect, *anvect;   /* test labels; correct-positive / correct-negative counts */
  int *train_apvect, *train_anvect, *query_apvect, *query_anvect;
  int *nsv_vect, *nbsv_vect, *time_vect, *nkce_vect; /* SVs, bound SVs, CPU time, kernel-cache calls */
  int *npos_added, *nneg_added;        /* labels of docs added each round */
  double *prb, *scores_added;          /* precision/recall breakevens per round */
  int **sv_dim_sat_vect, **train_dim_sat_vect; /* feature-saturation histories */
  double **test_scores;       /* per-round raw scores on the test set */
  bow_wv **test_docs;         /* the test documents themselves */
};

/* the data coming in should have train data in the first (ndocs-ntrans)
 * slots & the permanently unlabel-able data in the next ntrans slots */
/* no unlabeled data will be used unless svm_al_do_trans is set - if it
 * is then ALL unlabeled data will be used. */
/* this fn.
does active learning by selecting which docs to pass into * the svm solver */int al_svm_guts(bow_wv **train_docs, int *train_yvect, double *weights, double *b, double **W, int nperm_unlabeled, int ndocs, struct al_test_data *astd, int do_random_learning) { int changed; int *cur_hyp_yvect; int dec; int *hyp_yvect; /* for transduction */ int last_subndocs; int nplabeled; /* # of potentially labeled */ int nleft; int nsv; int n_trans_correct; int num_words; int *old_svbitmap; int qsize; /* query size, size of chunks to grow training set by */ struct di *train_scores, *train_cscores; int *train_sat_vect; int *sv_sat_vect; /* shows how many */ int sub_ndocs; double tb; int *tdocs; /* translation table */ double *tvals; int i,j,k,n,nloop; sub_ndocs = MIN(ndocs,svm_init_al_tset); train_scores = (struct di *) malloc(sizeof(struct di)*ndocs); tvals = (double *) malloc(sizeof(double)*ndocs); tdocs = (int *) malloc(sizeof(int)*ndocs); for (i=0; i<ndocs; i++) { tdocs[i] = i; } /* hyp_yvect is a hack - it holds the hypotheses, but also the * correct labels for the queried docs (so a proper vect can be * passed to eval) */ cur_hyp_yvect = hyp_yvect = (int *) malloc(sizeof(int)*ndocs); nplabeled = ndocs - nperm_unlabeled; num_words = bow_num_words(); /* BEGIN LOGGING CODE */ if (astd->sv_dim_sat_vect) { train_sat_vect = (int *) malloc(sizeof(int)*num_words); } else { train_sat_vect = NULL; } if (astd->train_dim_sat_vect) { old_svbitmap = (int *) malloc((ndocs+7)/8); sv_sat_vect = (int *) malloc(sizeof(int)*num_words); } else { old_svbitmap = NULL;//(int *) malloc((ndocs+7)/8); sv_sat_vect = NULL;//(int *) malloc(sizeof(int)*num_words); } /* initialize accounting stuff */ if (sv_sat_vect) { memset(old_svbitmap, 0, (ndocs+7)/8); for (i=0; i<num_words; i++) { sv_sat_vect[i] = 0.0; } } if (train_sat_vect) { for (i=0; i<num_words; i++) { train_sat_vect[i] = 0.0; } } /* END LOGGING CODE */ /* initialize... 
*/ nsv = 0; for (i=0; i<ndocs; i++) { weights[i] = 0.0; tvals[i] = 0.0; } qsize = svm_al_qsize; /* select an initial set of things to classify */ /* the following is equivalent to asking the user to classify 1/2 the * documents as positive & the other half as negative */ for (k=-1, i=0, n=sub_ndocs/2; k<2; n=sub_ndocs,k=k+2) { for (j=0; i<n && j<nplabeled; j++) { if (train_yvect[j] != k) { continue; } { int t; bow_wv *twv; t = tdocs[j]; tdocs[j] = tdocs[i]; tdocs[i] = t; t = train_yvect[j]; train_yvect[j] = train_yvect[i]; train_yvect[i] = t; twv = train_docs[j]; train_docs[j] = train_docs[i]; train_docs[i] = twv; } i++; } } sub_ndocs = i; last_subndocs = 0; /* copy the initial yvect into hyp_yvect (for evaluate) */ for (i=0; i<sub_ndocs; i++) { hyp_yvect[i] = train_yvect[i]; } cur_hyp_yvect = &(hyp_yvect[sub_ndocs]); for (i=train_yvect[0],j=1; j<sub_ndocs; j++) { if (j != i) break; } if (j == i) { bow_error("Can't active learn when all examples are from the same class!"); } train_cscores = NULL; dec = 0; nleft = ndocs - sub_ndocs; changed = 1; /* for code where transduction is done */ n_trans_correct = 0; for (nloop=0; ;nloop++) { struct tms t1, t2; /* BEGIN LOGGING CODE */ /* this is done at the beginning of the loop so that the base case (ie. after initial set) works too... 
*/ if (astd->npos_added && astd->nneg_added) { astd->npos_added[nloop] = 0; astd->nneg_added[nloop] = 0; for (i=last_subndocs; i<sub_ndocs; i++) { if (train_yvect[i] == 1) { astd->npos_added[nloop] ++; } else { astd->nneg_added[nloop] ++; } } } /* add the document indices */ if (astd->docs_added) { for (i=last_subndocs; i<sub_ndocs; i++) { astd->docs_added[i] = tdocs[i]; } } if (train_sat_vect) { for (i=last_subndocs; i<sub_ndocs; i++) { for (j=0; j<train_docs[i]->num_entries; j++) { train_sat_vect[train_docs[i]->entry[j].wi] ++; } } } svm_nkc_calls = 0; /* END LOGGING CODE */ fprintf(stderr,"\r%dth AL iteration",nloop); times(&t1); if (nloop==2) { //exit(1); } /* the changed flag shows whether or not the theorized y's are different than * the actual ones (if they are, retraining is done, otherwise it isn't). */ if (svm_al_do_trans) { if (changed) { changed = svm_trans_or_chunk(train_docs, train_yvect, cur_hyp_yvect, weights, tvals, &tb, W, nleft+nperm_unlabeled, ndocs, &nsv); } } else { changed = svm_trans_or_chunk(train_docs, train_yvect, cur_hyp_yvect, weights, tvals, &tb, W, 0, sub_ndocs, &nsv); } times(&t2); /* BEGIN LOGGING CODE */ /* a couple of accounting things that are independent of a test/validation set */ if (astd->time_vect) astd->time_vect[nloop] = (int) (t2.tms_utime - t1.tms_utime + t2.tms_stime - t1.tms_stime); if (astd->nsv_vect) astd->nsv_vect[nloop] = nsv; if (astd->nbsv_vect) { astd->nbsv_vect[nloop] = 0; for (j=0; j<sub_ndocs; j++) { /* note - use svm_C because the label IS known */ if (weights[j] >= svm_C - svm_epsilon_a) astd->nbsv_vect[nloop] ++; } } if (astd->nkce_vect) astd->nkce_vect[nloop] = svm_nkc_calls; /* END LOGGING CODE */ /* find the next example that is closest to the hyperplane that we just found */ /* the scores need to be recalculated if any of the weights changed... 
*/ if (changed) { train_cscores = train_scores; if (svm_kernel_type == 0) { for (j=sub_ndocs,nleft=0; j<nplabeled; j++) { train_scores[nleft].d = fabs(evaluate_model_hyperplane(*W, tb, train_docs[j])); train_scores[nleft].i = j; nleft ++; } } else { for (j=sub_ndocs,nleft=0; j<nplabeled; j++) { train_scores[nleft].d = fabs(evaluate_model_cache(train_docs, weights, hyp_yvect, tb, train_docs[j], nsv)); train_scores[nleft].i = j; nleft ++; } } /* BEGIN LOGGING CODE */ if (astd->train_anvect && astd->train_apvect) { double out; astd->train_anvect[nloop] = astd->train_apvect[nloop] = 0; for (i=0; i<sub_ndocs; i++) { if (svm_kernel_type == 0) { out = evaluate_model_hyperplane(*W,tb,train_docs[i]); } else { out = evaluate_model_cache(train_docs, weights, hyp_yvect, tb, train_docs[i], nsv); } if (train_yvect[i]*out > 0) { if (train_yvect[i] > 0) { astd->train_apvect[nloop] ++; } else { astd->train_anvect[nloop] ++; } } } } /* lets figure out the change in fdim saturation... */ if (sv_sat_vect) { for (i=0; i<sub_ndocs; i++) { if ((weights[i] == 0.0) && (GETVALID(old_svbitmap,i))) { SETINVALID(old_svbitmap,i); for (j=0; j<train_docs[i]->num_entries; j++) { sv_sat_vect[train_docs[i]->entry[j].wi] --; } } else if ((weights[i] != 0.0) && (!GETVALID(old_svbitmap,i))) { SETVALID(old_svbitmap,i); for (j=0; j<train_docs[i]->num_entries; j++) { sv_sat_vect[train_docs[i]->entry[j].wi] ++; } } } /* this could be smarter - but it would involve more arrays... */ /* update the history vector... */ for (i=0; i<astd->ndim_sat; i++) { astd->sv_dim_sat_vect[i][nloop] = 0; } for (j=0; j<num_words; j++) { for (i=0; sv_sat_vect[j]>=dim_map(i) && i<astd->ndim_sat; i++) { astd->sv_dim_sat_vect[i][nloop] ++; } } } if (astd->train_dim_sat_vect) { for (i=0; i<astd->ndim_sat; i++) { astd->train_dim_sat_vect[i][nloop] = 0; } for (j=0; j<num_words; j++) { for (i=0; train_sat_vect[j]>=dim_map(i) && i<astd->ndim_sat; i++) { astd->train_dim_sat_vect[i][nloop] ++; } } } /* now lets find the accuracy... 
*/ if (astd->prb) { int npos; double *test_evals = (double *) malloc(sizeof(double)*astd->ntest); astd->anvect[nloop] = astd->apvect[nloop] = 0; if (svm_kernel_type == 0) { for (j=0; j<astd->ntest; j++) { test_evals[j] = evaluate_model_hyperplane(*W, tb, astd->test_docs[j]); } } else { for (j=0; j<astd->ntest; j++) { test_evals[j] = evaluate_model(train_docs, weights, hyp_yvect, tb, astd->test_docs[j], nsv); } } if (astd->test_scores) { for (j=0; j<astd->ntest; j++) { astd->test_scores[nloop][j] = test_evals[j]; } } for (j=npos=0; j<astd->ntest; j++) { if (astd->test_yvect[j] * test_evals[j] > 0.0) { if (astd->test_yvect[j] == -1) astd->anvect[nloop] ++; else astd->apvect[nloop] ++; } if (astd->test_yvect[j] > 0) { npos ++; } } if (svm_al_do_trans && (astd->apvect[nloop] == 0 || astd->anvect[nloop] == 0)) { fprintf(stderr,"Unlikely occurence that all test (%d) examples have the same \n" "label when classified with the current model of %d support vectors.\n" "Unless this is expected, there is probably a bug in the program.\n" "Please send the author (gcs@cmu.edu) email (note the cmd line arguments).\n" "The function has stopped the program so that it may be debugged, terminated" "or continued\n", astd->ntest, nsv); for (j=0; j<astd->ntest; j++) { if (test_evals[j] * evaluate_model(train_docs, weights, hyp_yvect, tb, astd->test_docs[j], nsv) < 0.0) { fprintf(stderr, "bad hyperplane (j=%d, te[j]=%f, actual_sv=%f, actual_hyp=%f)\n",j, evaluate_model(train_docs, weights, hyp_yvect, tb, astd->test_docs[j], nsv), test_evals[j],evaluate_model_hyperplane(*W, tb, astd->test_docs[j])); fflush(stderr); kill(getpid(),SIGSTOP); } }#ifdef GCSJPRC system("echo \"rainbow did a boo-boo - stopping!\" | /usr/sbin/sendmail gcs@jules.res.cmu.edu");#endif /* if it didn't get stopped before */ if (j == astd->ntest) { fflush(stderr); kill(getpid(),SIGSTOP); } } /* precision recall breakevens too */ astd->prb[nloop] = prec_recall_breakeven(test_evals, astd->test_yvect, astd->ntest, npos); 
free(test_evals); } /* END LOGGING CODE */ } else { /* BEGIN LOGGING CODE */ /* we can use the scores that we got last time (they'll still be the same) - just * remove the previous ones from the scores array... */ /* since nothing changed, we don't need to recalculate the test accuracy */ if (astd->prb) { astd->apvect[nloop] = astd->apvect[nloop-1]; astd->anvect[nloop] = astd->anvect[nloop-1]; astd->prb[nloop] = astd->prb[nloop-1]; } /* quite nasty (because of the dependency on nneg_added & npos_added) */ if (astd->train_anvect && astd->train_apvect && astd->nneg_added && astd->npos_added) { astd->train_anvect[nloop] = astd->train_anvect[nloop-1] + astd->nneg_added[nloop]; astd->train_apvect[nloop] = astd->train_apvect[nloop-1] + astd->npos_added[nloop]; }
/* NOTE(review): the extract ends here mid-function — the remainder of
 * al_svm_guts() (and anything after it) is missing from this copy and must
 * be recovered from the original svm_al.c before this file will compile. */