📄 svm_base.c
字号:
/* Tail of the doc-array builder above (its head lies before this chunk):
   finishes copying wv_tmp1's entries into wv_tmp, applies the configured TF
   transform, normalizes the weights by summing, stores the vector in docs[],
   and returns the number of docs collected. */
wv_tmp->entry[j].wi = wv_tmp1->entry[j].wi;
      wv_tmp->entry[j].count = wv_tmp1->entry[j].count;
    }
    tf_transform(wv_tmp);
    bow_wv_normalize_weights_by_summing(wv_tmp);
    docs[ndocs] = wv_tmp;
  }
  return ndocs;
}

/* C sucks - this is just a fn to pass to bow_heap_next_wv */
static int silly_currying_global_v1, silly_currying_global_v2;

/* Selector passed to make_doc_array: returns 1 iff cdoc is a training
   document whose class is one of the two classes of the current pairwise
   sub-model (the class pair is "curried" through the two file-scope
   globals above), else 0. */
int use_train_and_submodel(bow_cdoc *cdoc) {
  return ((cdoc->type == bow_doc_train
           && (silly_currying_global_v1 == cdoc->class
               || silly_currying_global_v2 == cdoc->class)) ? 1 : 0);
}

/* Selector: returns 1 iff the bit for cdoc's doc type is set in the
   transduce_class bitmask, i.e. the doc should be used as "unlabeled"
   data for transduction; else 0. */
int use_transduction_docs(bow_cdoc *cdoc) {
  return (((1 << cdoc->type) & transduce_class) ? 1 : 0);
}

/* helper fn for adding the data for a training example to the barrel */
/* Appends the support vectors of sub-model model_no to new_barrel as
   "meta documents": Lagrange multipliers in weights[], labels in yvect[],
   source-barrel doc indices in tdocs[], bias b, nsv support vectors in
   total.  Positive and negative SVs are packed into separate word
   vectors, flushed into the barrel whenever one fills up.  Returns the
   number of meta-documents added. */
int add_sv_barrel(bow_barrel *new_barrel,double *weights, int *yvect,
                  int *tdocs, double b, int model_no, int nsv) {
  bow_cdoc cdoc_pos, cdoc_neg;
  bow_wv *dummy_wv_neg;
  bow_wv *dummy_wv_pos;
  int n_meta_docs=0;
  int ni, pi, i, j, num_words;

  num_words = bow_num_words();
  dummy_wv_pos = bow_wv_new(num_words);
  dummy_wv_neg = bow_wv_new(num_words);
  dummy_wv_pos->num_entries = dummy_wv_neg->num_entries = 0;
  cdoc_pos.type = bow_doc_ignore;
  /* NOTE(review): this mixes the pos/neg structs - presumably it was
     meant to read cdoc_pos.normalizer = cdoc_pos.prior = 0.0; as written,
     cdoc_pos.normalizer stays uninitialized until the "= b" below.
     Confirm against upstream libbow before changing. */
  cdoc_neg.normalizer = cdoc_pos.prior = 0.0;
  cdoc_pos.filename = NULL;
  cdoc_pos.class_probs = NULL;
  cdoc_pos.class = 0;
  cdoc_neg.type = bow_doc_ignore;
  cdoc_neg.normalizer = cdoc_neg.prior = 0.0;
  cdoc_neg.filename = NULL;
  cdoc_neg.class_probs = NULL;
  cdoc_neg.class = 0;
  if (model_no == 0) {
    /* insert two empty docs into the barrel so that the
       ancillary data has a place to live */
    cdoc_neg.word_count = 0;
    bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_pos);
    bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_pos);
    n_meta_docs = 2;
  }
  /* the model's bias term rides along in the positive doc's normalizer */
  cdoc_pos.normalizer = b;
  cdoc_pos.class = map_y_to_class(model_no,(int) 1);
  cdoc_neg.class = map_y_to_class(model_no,(int) -1);
  ni = pi = 0;
  /* i walks all candidate docs; j counts SVs consumed (loop ends at nsv);
     entries with multiplier <= svm_epsilon_a are not support vectors */
  for (i=j=0; j<nsv; i++) {
    if (weights[i] > svm_epsilon_a) {
      if (yvect[i] > 0) {
        /* positive word vector is full - flush it as a meta-doc.
           NOTE(review): looks off-by-one - entry[] holds num_words slots,
           so the guard should probably be >= rather than >; confirm. */
        if (pi > num_words) {
          dummy_wv_pos->num_entries = pi;
          cdoc_pos.word_count = pi;
          bow_barrel_add_document(new_barrel, &cdoc_pos, dummy_wv_pos);
          pi = 0;
          n_meta_docs++;
        }
        dummy_wv_pos->entry[pi].weight = (float) weights[i];
        /* count stores the 1-based index of the SV's source document */
        dummy_wv_pos->entry[pi].count = tdocs[i] + 1;
        dummy_wv_pos->entry[pi].wi = pi;
        pi++;
      } else {
        /* negative word vector is full - flush it.
           NOTE(review): two likely copy/paste bugs in this flush path -
           num_entries is set from pi (should presumably be ni) and the
           flushed vector is dummy_wv_pos (should presumably be
           dummy_wv_neg); mirror the positive branch above.  Left as-is
           here; confirm against upstream libbow. */
        if (ni > num_words) {
          dummy_wv_neg->num_entries = pi;
          cdoc_neg.word_count = ni;
          bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_pos);
          ni = 0;
          n_meta_docs++;
        }
        dummy_wv_neg->entry[ni].weight = (float) weights[i];
        dummy_wv_neg->entry[ni].count = tdocs[i] + 1;
        dummy_wv_neg->entry[ni].wi = ni;
        ni++;
      }
      j++;
    }
  }
  /* flush whatever remains in the positive and negative vectors */
  cdoc_pos.word_count = pi;
  dummy_wv_pos->num_entries = pi;
  bow_barrel_add_document(new_barrel, &cdoc_pos, dummy_wv_pos);
  cdoc_neg.word_count = ni;
  dummy_wv_neg->num_entries = ni;
  bow_barrel_add_document(new_barrel, &cdoc_neg, dummy_wv_neg);
  bow_wv_free(dummy_wv_pos);
  bow_wv_free(dummy_wv_neg);
  /* +2 accounts for the two final flushes just above */
  return (n_meta_docs+2);
}

/* Builds the vector-per-class barrel from src_barrel by training one SVM
   sub-model per class (one-vs-rest) or per class pair (pairwise voting),
   then storing each model's support vectors into the new barrel via
   add_sv_barrel.  Returns the new class barrel, or NULL on error. */
bow_barrel *svm_vpc_merge(bow_barrel *src_barrel) {
  double b;
  int cto;                 /* for pairwise - works with npass */
  bow_wv **docs;           /* a doc major matrix */
  int max_nsv;             /* highest # of nsv's in a submodel */
  int mdocs;               /* the number of docs in the current submodel */
  bow_wv **model_weights;
  int n_meta_docs;         /* # of documents that will go into the class barrel
                              before the weight vectors will */
  int nclasses;
  int ndocs;               /* total # of documents to be trained & transduced */
  int ntrain;              /* # of documents to be trained upon */
  int ntrans;              /* # of "unlabeled" docs to use in transduction */
  bow_barrel *class_barrel;
  int nloops;              /* # of the current submodel being built */
  int npass;               /* tmp for making submodels from the src_barrel */
  int nsv;                 /* # of support vectors for the current model */
  int num_words;
  bow_wv **sub_docs;
  int *tdocs;              /* trans table of indices in docs to indices
                              in the original barrel */
  int total_docs;          /* total # of docs (some not for training) */
  int *utdocs;             /* trans table of the docs in our training set
                              to those actually used in the models */
  float *weight_vect;
  double *weights;         /* lagrange
multipliers */
  bow_wv **W;              /* hyperplane for lin. folding */
  int *yvect;
  int i,j;

#ifndef HAVE_LOQO
  /* SMO is the only solver compiled in; warn if another one was requested */
  if (svm_use_smo != 1) {
    fprintf(stderr,"Can only use SMO, no other solvers were built,\n"
            "rebuild libbow with pr_loqo to use another algorithm.\n");
  }
#endif
#ifdef HAVE_FPSETMASK
  /* mask all FP exceptions so kernel arithmetic cannot trap */
  fpsetmask(~(FP_X_INV | FP_X_DNML | FP_X_DZ | FP_X_OFL | FP_X_UFL | FP_X_IMP));
#endif

  total_docs = src_barrel->cdocs->length;
  nclasses = bow_barrel_num_classes(src_barrel);
  weight_vect = NULL;
  model_weights = NULL;
  W = NULL;
  yvect = NULL;

  /* note - this OVER allocates - uses ALL, instead of just those for training */
  docs = (bow_wv **) alloca(sizeof(bow_wv *)*total_docs);
  tdocs = (int *) alloca(sizeof(int)*(total_docs+1));
  mdocs = 0; /* to shut gcc up */
  nsv = 0;

  if (nclasses == 1) {
    fprintf(stderr, "Cannot build SVM with only 1 class.\n");
    fflush(stderr);
    return NULL;
  } else if (nclasses == 2) {
    /* with exactly two classes, pairwise and one-vs-rest coincide */
    if (svm_kernel_type != FISHER) {
      vote_type = PAIRWISE;
    }
  }

  if (weight_type && svm_kernel_type == FISHER) {
    /* the Fisher kernel supplies its own weighting - fall back to raw counts */
    weight_type = 0;
    tf_transform_type = RAW;
  }

  /* decide whether feature weights are computed once per sub-model, once
     for the whole barrel, or not at all */
  if ((weight_type && vote_type == PAIRWISE) || weight_type == INFOGAIN) {
    svm_weight_style = WEIGHTS_PER_MODEL;
  } else if (weight_type) {
    svm_weight_style = WEIGHTS_PER_BARREL;
  } else {
    svm_weight_style = NO_WEIGHTS;
  }

  if (svm_weight_style != WEIGHTS_PER_MODEL) {
    ntrain = make_doc_array(src_barrel, docs, tdocs, bow_cdoc_is_train);
    if (ntrain < 2) {
      if (ntrain)
        bow_wv_free(docs[0]);
      fprintf(stderr, "Cannot build svm with less than 2 documents\n");
      fflush(stderr);
      return NULL;
    }
    /* append these trans docs to the arrays that were filled in above */
    ntrans = make_doc_array(src_barrel, &(docs[ntrain]), &(tdocs[ntrain]),
                            use_transduction_docs);
    ndocs = ntrain + ntrans;
    utdocs = (int *) alloca(sizeof(int)*ndocs);
    for (i=0; i<ndocs; i++) {
      utdocs[i] = i;
    }
    sub_docs = docs;
    mdocs = ndocs;
    svm_set_barrel_weights(docs, NULL, ndocs, &weight_vect);
    kcache_init(ndocs);
  } else {
    /* the ndocs value is the number of training documents that will
       actually be used - this is done now JUST to fill up the tdocs array. */
    ntrain = make_doc_array(src_barrel, docs, tdocs, bow_cdoc_is_train);
    if (ntrain < 2) {
      if (ntrain)
        bow_wv_free(docs[0]);
      fprintf(stderr, "Cannot build svm with less than 2 documents\n");
      fflush(stderr);
      return NULL;
    }
    /* figure out the # of ntrans */
    ntrans = make_doc_array(src_barrel, &(docs[ntrain]), &(tdocs[ntrain]),
                            use_transduction_docs);
    ndocs = ntrain + ntrans;
    /* since we don't need the docs for a while, free them */
    for (i=0; i<ndocs; i++) {
      bow_wv_free(docs[i]);
    }
    model_weights = (bow_wv **) malloc(sizeof(bow_wv *)*nclasses);
    utdocs = (int *) alloca(sizeof(int)*ndocs);
    /* the sub_docs vector will be rewritten with wv's to be used each iteration */
    sub_docs = alloca(sizeof(bow_wv *)*ndocs);
  }

  /* build the naive bayes model for the kernel... */
  if (svm_kernel_type == FISHER) {
    /* this isn't too bad since the cache REALLY should be large enough
       to hold everything anyway (the cache doesn't get flushed) */
    if (vote_type == PAIRWISE) {
      fprintf(stderr, "Fisher kernel not implemented for pairwise models yet.\n");
      return NULL;
    }
    svm_setup_fisher(src_barrel,docs,nclasses,ndocs);
    weight_type = 0;
  }

  weights = (double *) alloca(sizeof(double)*ndocs);
  yvect = (int *) alloca(sizeof(int)*ndocs);

  /* put together the resultant barrel */
  class_barrel = bow_barrel_new(src_barrel->wi2dvf->size, 2, sizeof(bow_cdoc),
                                src_barrel->cdocs->free_func);
  class_barrel->method = src_barrel->method;
  class_barrel->is_vpc = 1;

  /* make a temp word array big enough to fill a whole strip of the wi2dvf table */
  num_words = bow_num_words();
  n_meta_docs = 0;

  /* this is the beginning of the for loop */
  max_nsv = -1;
  nloops = 0;
  npass = 0;
  if (svm_kernel_type == 0) {
    /* linear kernel: reserve one folded hyperplane per sub-model */
    if (vote_type == PAIRWISE) {
      W = (bow_wv **) malloc(sizeof(bow_wv *)*(nclasses-1)*nclasses/2);
    } else {
      W = (bow_wv **) malloc(sizeof(bow_wv *)*nclasses);
    }
  }

  for (npass=0, cto=1; 1; ) {
    /* initialize & pull together the classes for the npass'th model... */
    if (vote_type == PAIRWISE) {
      /* advance (npass,cto) through all unordered class pairs; terminate
         once every pair has been visited */
      if (cto == nclasses) {
        npass ++;
        if (npass == nclasses-1) {
          break;
        }
        cto = npass+1;
      }
      if (svm_weight_style == WEIGHTS_PER_MODEL) {
        silly_currying_global_v1 = npass;
        silly_currying_global_v2 = cto;
        /* this gets called here since the doctype labels are in the barrel */
        /* utdocs is filled with actual indices, not indices of the train set */
        mdocs = make_doc_array(src_barrel, sub_docs, utdocs,
                               use_train_and_submodel);
        /* put the labels in for the labeled docs. */
        for (i=0; i<mdocs; i++) {
          bow_cdoc *cdoc = (GET_CDOC_ARRAY_EL(src_barrel,utdocs[i]));
          yvect[i] = map_class_to_y(npass, cdoc->class);
        }
        /* even though this set of docs is always the same (since all of the
           unlabeled data is used for each pairwise document [this is not
           suggested to with a barrel w/ more than 2 classes] is used) we
           still grab it, since the starting position for the unlabeled data
           isn't known beforehand (its a slight hack) */
        ntrans = make_doc_array(src_barrel, &(sub_docs[mdocs]),
                                &(utdocs[mdocs]), use_transduction_docs);
        /* this says that it is unlabelled */
        for (i=0; i<ntrans; i++) {
          yvect[i+mdocs] = 0;
        }
        mdocs = mdocs + ntrans;
        /* utdocs holds the barrel indices we're interested in the sub-model
           indices - so we need to remap utdocs */
        for (i=j=0; j<mdocs; i++) {
          if (tdocs[i] < utdocs[j]) {
            continue;
          } else {
            utdocs[j] = i;
            j++;
          }
        }
      } else {
        /* reuse the docs[] loaded up front: select this pair's training
           docs, then append every transduction doc with label 0 */
        for (i=j=0; i<ntrain; i++) {
          bow_cdoc *cdoc = (GET_CDOC_ARRAY_EL(src_barrel,tdocs[i]));
          if ((cdoc->class == npass) || (cdoc->class == cto)) {
            sub_docs[j] = docs[i];
            yvect[j] = map_class_to_y(npass, cdoc->class);
            utdocs[j] = i;
            j++;
          }
        }
        for (i=0; i<ntrans; j++,i++) {
          sub_docs[j] = docs[i+ntrain];
          utdocs[j] = i+ntrain;
          yvect[j] = 0;
        }
        mdocs = j;
      }
    } else {
      /* one-vs-rest: one pass per class */
      if (npass == nclasses) {
        break;
      }
      /* all docs should be included - the yvect will do the proper mapping */
      for (i=0; i<ntrain; i++) {
        bow_cdoc *cdoc = (GET_CDOC_ARRAY_EL(src_barrel,tdocs[i]));
        /* this map will be extended to make the barrel handle more than 2 classes */
        yvect[i] = map_class_to_y(npass, cdoc->class);
      }
      for (i=0; i<ntrans; i++) {
        yvect[i+ntrain] = 0;
      }
      if (svm_weight_style == WEIGHTS_PER_MODEL) {
        for (i=0; i<mdocs; i++) {
          /* the weight values are not correct - they include the last values */
          /* make_doc_array does this for pairwise voting */
          tf_transform(docs[i]);
        }
      }
    }

    if (svm_weight_style == WEIGHTS_PER_MODEL) {
      /* compute per-model feature weights and keep only the nonzero ones
         as a sparse word vector for later classification */
      svm_set_barrel_weights(sub_docs, yvect, mdocs, &weight_vect);
      model_weights[nloops] = bow_wv_new(num_words);
      for (i=j=0; i<num_words; i++) {
        if (weight_vect[i] != 0.0) {
          model_weights[nloops]->entry[j].wi = i;
          model_weights[nloops]->entry[j].count = 1;
          model_weights[nloops]->entry[j].weight = weight_vect[i];
          j++;
        }
      }
      free(weight_vect);
      model_weights[nloops]->num_entries = j;
    }

    if (mdocs < 2) {
      bow_error("Cannot create SVM with only 1 document!\n");
    }

    fprintf(stderr,"Learning %dth model\n",nloops);

    if (svml_basename) {
      /* dump the training set in SVM-light file format instead of
         training a model ourselves */
      char *tmp;
      FILE *f = stdout;
      tmp = malloc(sizeof(char)*(20+strlen(svml_basename)));
      sprintf(tmp,"train_%d_%s",nloops,svml_basename);
      f = fopen (tmp, "w");
      for (i=0; i<mdocs; i++) {
        fprintf(f,"%d ", yvect[i]);
        for (j=0; j<sub_docs[i]->num_entries; j++) {
          fprintf (f,"%d:%f ",1+sub_docs[i]->entry[j].wi,
                   sub_docs[i]->entry[j].weight);
        }
        fprintf(f,"\n");
      }
      fclose(f);
      /* set up the test output file */
      sprintf(tmp,"test_%s",svml_basename);
      svml_test_file = fopen (tmp, "w");
      free(tmp);
      nsv = 0;
      W[nloops] = bow_wv_new(0);
    } else {
      /* only useful with test-in-train - ONLY build models after a certain point
         (like when the previously acquired data runs out) */
      if ((!test_in_train) || ((test_in_train) && (nloops >= model_starting_no))) {
        nsv = tlf_svm(sub_docs,yvect,weights,&b,&(W[nloops]),ntrans,mdocs);
      }
    }

    if (vote_type == PAIRWISE && weight_type) {
      /* per-model docs were loaded fresh this pass - release them */
      for (i=0; i<mdocs; i++) {
        bow_wv_free(sub_docs[i]);
      }
    }

    if (max_nsv < nsv) {
      max_nsv = nsv;
    }

    /* now we need to drop the significant classes into the barrel */
    if (!test_in_train) {
      n_meta_docs += add_sv_barrel(class_barrel, weights, yvect, utdocs, b,
                                   nloops, nsv);
    }

    /* advance to the next sub-model */
    if (vote_type == PAIRWISE) {
      cto++;
    } else {
      npass ++;
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -