svm_al.c
      if (astd->train_dim_sat_vect) {
	for (i=0; i<astd->ndim_sat; i++) {
	  astd->train_dim_sat_vect[i][nloop] = astd->train_dim_sat_vect[i][nloop-1];
	}
      }
      if (astd->sv_dim_sat_vect) {
	for (i=0; i<astd->ndim_sat; i++) {
	  astd->sv_dim_sat_vect[i][nloop] = astd->sv_dim_sat_vect[i][nloop-1];
	}
      }
      if (astd->test_scores) {
	for (i=0; i<astd->ntest; i++) {
	  astd->test_scores[nloop][i] = astd->test_scores[nloop-1][i];
	}
      }
      /* END LOGGING CODE */

      /* this code doesn't get touched until after examples were added */
      nleft -= dec;
      cur_hyp_yvect = &(cur_hyp_yvect[dec]);
      train_cscores = &(train_cscores[dec]);
    }

    /* sanity check: no indices < sub_ndocs should remain in the score array */
    for (i=0; i<nplabeled-sub_ndocs; i++) {
      assert(train_cscores[i].i >= sub_ndocs);
    }

    if (sub_ndocs == nplabeled) {
      break;
    }

    /* now use the scores (& possibly other things) to choose the next
     * examples to learn */
    if (nleft < qsize) {
      dec = nleft;
    } else {
      dec = qsize;
    }

    /* do this even if nleft<qsize to find the min... */
    if (!do_random_learning) {
      get_top_n(train_cscores, nleft, dec);
    }

    /* this is where the termination criterion goes - right now it's pretty dumb...
     * (it would be a fn, but since bookkeeping & setting up need to go on in
     * here anyway, i'm just computing it here) */
    if ((train_cscores[0].d > 1) && (0)) {	/* the "&& (0)" keeps this disabled */
      break;
    }

    /* this only matters when transduction is being used (otherwise it's harmless) */
    changed = 0;

    /* BEGIN LOGGING CODE */
    if (astd->query_anvect && astd->query_apvect) {
      astd->query_anvect[nloop] = astd->query_apvect[nloop] = 0;
    }
    /* END LOGGING CODE */

    /* query the "oracle" - swap each chosen example into the labeled region */
    for (j=0; j<dec; j++) {
      int t, tj;
      bow_wv *twv;

      tj = train_cscores[j].i;
      t = tdocs[sub_ndocs+j];
      tdocs[sub_ndocs+j] = tdocs[tj];
      tdocs[tj] = t;
      twv = train_docs[sub_ndocs+j];
      train_docs[sub_ndocs+j] = train_docs[tj];
      train_docs[tj] = twv;
      t = train_yvect[sub_ndocs+j];
      train_yvect[sub_ndocs+j] = train_yvect[tj];
      train_yvect[tj] = t;

      if (svm_al_do_trans) {
	if ((train_yvect[sub_ndocs+j] != cur_hyp_yvect[j])
	    || (weights[sub_ndocs+j] >= svm_trans_cstar - svm_epsilon_a)) {
	  changed = 1;
	}
      }

      /* BEGIN LOGGING CODE */
      if (astd->query_anvect && astd->query_apvect) {
	double out = 0.0;	/* stays 0 (counted as wrong) if neither case below applies */
	/* note: the original evaluated train_docs[i] here, but i is stale from
	 * an earlier loop; the document just queried is train_docs[sub_ndocs+j] */
	if (svm_kernel_type == 0) {
	  out = evaluate_model_hyperplane(*W, tb, train_docs[sub_ndocs+j]);
	} else if (svm_al_do_trans) {
	  out = evaluate_model_cache(train_docs, weights, hyp_yvect, tb,
				     train_docs[sub_ndocs+j], nsv);
	}
	if (train_yvect[sub_ndocs+j]*out > 0) {
	  if (train_yvect[sub_ndocs+j] > 0) {
	    astd->query_apvect[nloop]++;
	  } else {
	    astd->query_anvect[nloop]++;
	  }
	}
      }
      /* END LOGGING CODE */

      /* also need to swap the scores - since they will be used if the output
       * doesn't change; scan until the entry for the swapped-in doc is found */
      for (i=0; ; i++) {
	if (train_cscores[i].i == sub_ndocs+j) {
	  train_cscores[i].i = tj;
	  break;
	}
      }
      train_cscores[j].i = sub_ndocs+j;
      if (astd->scores_added)
	astd->scores_added[sub_ndocs+j] = train_cscores[j].d;
    }

    for (j=0; j<dec; j++) {
      hyp_yvect[sub_ndocs+j] = train_yvect[sub_ndocs+j];
    }

    if (!changed) {
      n_trans_correct++;
    }

    last_subndocs = sub_ndocs;

    /* calculate the tvals that are necessary */
    if (svm_use_smo) {
      /* the original loop bound was "j<dec", which skips the newly queried
       * examples once sub_ndocs grows past dec; they occupy slots
       * sub_ndocs..sub_ndocs+dec-1 */
      for (j=sub_ndocs; j<sub_ndocs+dec; j++) {
	weights[j] = 0.0;	/* tvals[j] doesn't matter */
      }
      sub_ndocs += dec;
    } else {
      int n;
      for (n=0; n<dec; n++) {
	/* accumulate the kernel expansion for the newly added doc; j walks
	 * until all nsv nonzero-weight (support vector) entries are seen */
	for (j=k=0; k<nsv; j++) {
	  if (weights[j] != 0.0) {
	    tvals[sub_ndocs] += weights[j] * train_yvect[j]
	      * svm_kernel_cache(train_docs[sub_ndocs], train_docs[j]);
	    k++;
	  }
	}
	sub_ndocs++;
      }
    }

    /* if we no longer need W, let's ditch it (note - the loop never exits here
     * so a valid W is still in place for the calling fn.)
     */
    if (!svm_use_smo && svm_kernel_type == 0) {
      free(*W);
      *W = NULL;
    }
  }

  if (svm_al_do_trans) {
    printf("Queried for a total of %d labels.\nSkipped %d loops w/ transduction.\n",
	   sub_ndocs, n_trans_correct);
  }

  free(hyp_yvect);
  free(train_scores);
  free(tvals);
  if (sv_sat_vect) {
    free(sv_sat_vect);
    free(old_svbitmap);
  }
  if (train_sat_vect) {
    free(train_sat_vect);
  }

  /* fill everything back in - depermute everything: tdocs[i] holds the
   * original index of the document now sitting at slot i; keep swapping
   * until each slot is home */
  for (i=0; i<sub_ndocs; ) {
    int t, j;
    double td;
    bow_wv *twv;
    j = tdocs[i];
    if (j == i) {
      i++;
      continue;
    }
    twv = train_docs[j];
    train_docs[j] = train_docs[i];
    train_docs[i] = twv;
    t = train_yvect[j];
    train_yvect[j] = train_yvect[i];
    train_yvect[i] = t;
    td = weights[j];
    weights[j] = weights[i];
    weights[i] = td;
    tdocs[i] = tdocs[j];
    tdocs[j] = j;
  }
  free(tdocs);

  *b = tb;
  return nsv;
}

/* this cuts up the training set into training & validation */
/* the data coming in has already been permuted */
/* the first docs become the test docs
 * (to prevent us from having to move everything) */
int al_svm_test_wrapper(bow_wv **docs, int *yvect, double *weights, double *b,
			double **W, int ntrans, int ndocs, int do_ts,
			int do_random_learning, int *permute_table) {
  struct al_test_data altd;
  int max_iter;
  int nlabeled;
  int ntrain;
  int nsv;
  int ntest;
  int tp, tn;
  bow_wv **train_docs;
  int *train_y;
  int i, j, k;

  ntrain = altd.ntest = 0;
  nlabeled = ndocs - ntrans;
  ntrain = nlabeled/2;
  ntest = nlabeled - ntrain;
  altd.ntest = ntest;
  train_docs = &(docs[ntest]);
  train_y = &(yvect[ntest]);
  altd.test_docs = docs;
  altd.test_yvect = yvect;

  max_iter = ((ntrain+svm_al_qsize-1) / svm_al_qsize) + 1;

  altd.apvect = (int *) malloc(sizeof(int)*max_iter);
  altd.anvect = (int *) malloc(sizeof(int)*max_iter);
  altd.nsv_vect = (int *) malloc(sizeof(int)*max_iter);
  altd.nbsv_vect = (int *) malloc(sizeof(int)*ntrain);
  altd.prb = (double *) malloc(sizeof(double)*max_iter);
  altd.nkce_vect = (int *) malloc(sizeof(int)*max_iter);
  altd.time_vect = (int *) malloc(sizeof(int)*max_iter);
  altd.query_anvect = (int *) malloc(sizeof(int)*max_iter);
  altd.query_apvect = (int *) malloc(sizeof(int)*max_iter);
  altd.train_anvect = (int *) malloc(sizeof(int)*max_iter);
  altd.train_apvect = (int *) malloc(sizeof(int)*max_iter);
  if (do_ts) {
    altd.test_scores = (double **) malloc(sizeof(double *)*max_iter);
    for (i=0; i<max_iter; i++) {
      altd.test_scores[i] = (double *) malloc(sizeof(double)*altd.ntest);
    }
  } else {
    altd.test_scores = NULL;
  }
  /* note: the original wrote sizeof(int)*max_iter+1 here, which (by operator
   * precedence) adds 1 byte rather than 1 element */
  altd.npos_added = (int *) malloc(sizeof(int)*(max_iter+1));
  altd.nneg_added = (int *) malloc(sizeof(int)*(max_iter+1));
  altd.docs_added = (int *) malloc(sizeof(int)*ntrain);
  altd.scores_added = (double *) malloc(sizeof(double)*ntrain);
  for (i=0; i<ntrain; i++) {
    altd.scores_added[i] = 0.0;
  }
  memset(altd.apvect, -1, max_iter*sizeof(int));
  memset(altd.anvect, -1, max_iter*sizeof(int));
  altd.ndim_sat = NDIM_INSPECTED;
  altd.sv_dim_sat_vect = (int **) malloc(NDIM_INSPECTED*sizeof(int *));
  altd.train_dim_sat_vect = (int **) malloc(NDIM_INSPECTED*sizeof(int *));
  for (i=0; i<NDIM_INSPECTED; i++) {
    altd.sv_dim_sat_vect[i] = (int *) malloc(sizeof(int)*max_iter);
    altd.train_dim_sat_vect[i] = (int *) malloc(sizeof(int)*max_iter);
  }

  nsv = al_svm_guts(train_docs, train_y, weights, b, W, ntrans, ntrain,
		    &altd, do_random_learning);

  for (i=tp=tn=0; i<altd.ntest; i++) {
    if (altd.test_yvect[i] == 1) {
      tp++;
    } else {
      tn++;
    }
  }

  printf("%d positive test documents, %d negative test documents.\npositive accuracy vector: ",
	 tp, tn);
  /* bounds check first, so apvect[max_iter] is never read */
  for (i=0; i < max_iter && altd.apvect[i] >= 0; i++) {
    printf(" %d", altd.apvect[i]);
  }
  printf("\nnegative accuracy vector: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.anvect[j]);
  }
  printf("\nprecision/recall breakeven vector: ");
  for (j=0; j<i; j++) {
    printf(" %f", altd.prb[j]);
  }
  printf("\nquery positive accuracy vector: ");
  for (j=0; j<i-1; j++) {
    printf(" %d", altd.query_apvect[j]);
  }
  printf("\nquery negative accuracy vector: ");
  for (j=0; j<i-1; j++) {
    printf(" %d", altd.query_anvect[j]);
  }
  printf("\ntrain positive accuracy vector: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.train_apvect[j]);
  }
  printf("\ntrain negative accuracy vector: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.train_anvect[j]);
  }
  printf("\nnumber of positive documents inspected: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.npos_added[j]);
  }
  printf("\nnumber of negative documents inspected: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nneg_added[j]);
  }
  printf("\nnumber of support vectors: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nsv_vect[j]);
  }
  printf("\nnumber of bounded support vectors: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nbsv_vect[j]);
  }
  {
    int k;
    int start_index = MIN(ntrain, svm_init_al_tset);
    printf("\n\"Real\" document indices when added: ");
    printf("0(%d", permute_table[altd.docs_added[0]]);
    for (k=1; k<start_index; k++) {
      printf(",%d", permute_table[altd.docs_added[k]]);
    }
    printf(") ");
    for (j=0; j<i-1; j++) {
      printf("%d(%d", j+1, permute_table[altd.docs_added[j*svm_al_qsize+start_index]]);
      for (k=1; k<svm_al_qsize && k+j*svm_al_qsize+start_index<ntrain; k++) {
	printf(",%d", permute_table[altd.docs_added[j*svm_al_qsize+start_index+k]]);
      }
      printf(") ");
    }
    printf("\nminimum scores of documents when added: ");
    for (j=0; j<i-1; j++) {
      printf(" %f", altd.scores_added[j*svm_al_qsize+svm_init_al_tset]);
    }
    printf("\naverage scores of documents when added: ");
    for (j=0; j<i-1; j++) {
      double avg = 0.0;
      for (k=0; k<svm_al_qsize && k+j*svm_al_qsize+svm_init_al_tset<ntrain; k++) {
	avg += altd.scores_added[j*svm_al_qsize+k+svm_init_al_tset];
      }
      printf(" %f", avg/k);
    }
  }
  printf("\nrunning times: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.time_vect[j]);
  }
  printf("\nkernel_cache calls: ");
  for (j=0; j<i; j++) {
    printf(" %d", altd.nkce_vect[j]);
  }
  for (k=0; k<NDIM_INSPECTED; k++) {
    /* the following is only good if the 0'th # of dimensions == 1 */
    int num_words = altd.train_dim_sat_vect[0][i-1];
    printf("\nnumber of SV dimensions with more than %d elements (%d total dimensions): ",
	   dim_map(k), num_words);
    for (j=0; j<i; j++) {
      printf(" %d", altd.sv_dim_sat_vect[k][j]);
    }
  }
  for (k=0; k<NDIM_INSPECTED; k++) {
    int num_words = altd.train_dim_sat_vect[0][i-1];
    printf("\nnumber of train dimensions with more than %d elements (%d total dimensions): ",
	   dim_map(k), num_words);
    for (j=0; j<i; j++) {
      printf(" %d", altd.train_dim_sat_vect[k][j]);
    }
  }
  if (do_ts) {
    printf("\nbegin score matrix:");
    for (j=0; j<i; j++) {
      int k;
      printf("\n");
      for (k=0; k<altd.ntest; k++) {
	printf(" %.3f", altd.test_scores[j][k]);
      }
    }
    printf("\nend score matrix\n");
    for (i=0; i<max_iter; i++) {
      free(altd.test_scores[i]);
    }
    free(altd.test_scores);
  } else {
    printf("\n");
  }

  for (i=0; i<NDIM_INSPECTED; i++) {
    free(altd.sv_dim_sat_vect[i]);
    free(altd.train_dim_sat_vect[i]);
  }
  free(altd.docs_added);
  free(altd.scores_added);
  free(altd.apvect);
  free(altd.anvect);
  free(altd.prb);
  free(altd.nsv_vect);
  free(altd.nbsv_vect);
  free(altd.time_vect);
  free(altd.sv_dim_sat_vect);
  free(altd.train_dim_sat_vect);
  free(altd.nkce_vect);
  free(altd.npos_added);
  free(altd.nneg_added);
  free(altd.query_anvect);
  free(altd.query_apvect);
  free(altd.train_anvect);
  free(altd.train_apvect);

  return nsv;
}
int al_svm(bow_wv **docs, int *yvect, double *weights, double *b, double **W,
	   int ntrans, int ndocs, int do_rlearn) {
  struct al_test_data altd;
  bzero(&altd, sizeof(struct al_test_data));
  return (al_svm_guts(docs, yvect, weights, b, W, ntrans, ndocs, &altd, do_rlearn));
}
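
For orientation, here is a minimal sketch of how a caller might drive the al_svm entry point above. It is an illustration under stated assumptions, not code from the library: run_al_svm is a hypothetical helper, and the zero-initialized weights buffer and the 0 passed for do_rlearn are guesses consistent with the signatures in this file; only bow_wv, al_svm, and the parameter meanings come from the source.

/* Hypothetical driver (not from the original source): train an
 * active-learning SVM over ndocs word vectors, the last ntrans of which
 * are the unlabeled pool for transduction.  Assumes the bow declarations
 * (bow_wv, al_svm) above are in scope. */
#include <stdio.h>
#include <stdlib.h>

int run_al_svm(bow_wv **docs, int *yvect, int ndocs, int ntrans)
{
  double *weights = (double *) calloc(ndocs, sizeof(double)); /* alpha per doc */
  double *W = NULL;	/* hyperplane weights; filled in for the linear kernel */
  double b = 0.0;	/* bias term, set by al_svm */
  int nsv;

  /* final argument 0 = margin-based querying rather than random selection */
  nsv = al_svm(docs, yvect, weights, &b, &W, ntrans, ndocs, 0);
  printf("trained model: %d support vectors, b = %f\n", nsv, b);

  if (W)
    free(W);
  free(weights);
  return nsv;
}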