svm_base.c
/* (the listing begins mid-function: this is the tail of the model-training
 * routine that builds and returns the class barrel) */

    nloops++;
  }

  if (test_in_train) {
    exit(0);
  }

  if (svm_kernel_type == 0) {
    bow_cdoc cdoc;
    cdoc.filename = NULL;
    cdoc.class_probs = NULL;
    cdoc.type = bow_doc_ignore;
    cdoc.class = 1;
    for (i=0; i<nloops; i++) {
      cdoc.word_count = W[i]->num_entries;
      bow_barrel_add_document(class_barrel, &cdoc, W[i]);
      bow_wv_free(W[i]);
    }
    free(W);
  }

  /* if it were per model, the cache would need to be alloc-ed &
   * de-alloc-ed locally */
  if (svm_weight_style != WEIGHTS_PER_MODEL) {
    kcache_clear();
  }

  /* place the model weights into the barrel */
  if (svm_weight_style == WEIGHTS_PER_MODEL) {
    bow_cdoc cdoc;
    cdoc.filename = NULL;
    cdoc.class_probs = NULL;
    cdoc.type = bow_doc_ignore;
    cdoc.class = 1;
    /* this is fine since all of the docs are class 0 & we
     * know how many meta docs there are */
    for (i=0; i<nloops; i++) {
      cdoc.word_count = model_weights[i]->num_entries;
      bow_barrel_add_document(class_barrel, &cdoc, model_weights[i]);
      bow_wv_free(model_weights[i]);
    }
    free(model_weights);
  }

  /* the docs were freed earlier just to save memory - now we need them
   * again, & since the optimizer is done, a lot of memory has been
   * released */
  if (svm_weight_style == WEIGHTS_PER_MODEL && vote_type == PAIRWISE) {
    make_doc_array(src_barrel, docs, tdocs, bow_cdoc_is_train);
    /* append these transduction docs to the arrays that were filled
     * in above */
    make_doc_array(src_barrel, &(docs[ntrain]), &(tdocs[ntrain]),
                   use_transduction_docs);
  }

  /* now add all of the documents from the doc barrel to the class barrel */
  for (i=0; i<ndocs; i++) {
    /* to add the i'th document to the class_barrel,
     * we first need to make a new cdoc */
    bow_cdoc cdoc;
    memcpy(&cdoc, GET_CDOC_ARRAY_EL(src_barrel, tdocs[i]), sizeof(bow_cdoc));
    cdoc.filename = strdup(cdoc.filename);
    cdoc.class = 0;
    bow_barrel_add_document(class_barrel, &cdoc, docs[i]);
  }

  /* this has to be done after all possible dv's have been created */
  if (!((vote_type == PAIRWISE && weight_type) || weight_type == INFOGAIN)
      && weight_type) {
    /* if no weights are used at all, this isn't necessary */
    bow_dv *dv;
    j = bow_num_words();
    for (i=0; i<j; i++) {
      dv = bow_wi2dvf_dv (class_barrel->wi2dvf, i);
      if (dv) {
        /* the idf slot is borrowed to persist the per-word weight;
         * setup_docs() reads it back out below */
        dv->idf = weight_vect[i];
      }
    }
    free(weight_vect);
  }

  /* the sign of max_nsv encodes the voting scheme: positive for
   * pairwise, negative for one-against-all (decoded in svm_score below) */
  if (vote_type == PAIRWISE) {
    BARREL_GET_MAX_NSV(class_barrel) = max_nsv;
  } else {
    BARREL_GET_MAX_NSV(class_barrel) = -1*max_nsv;
  }

  BARREL_GET_NCLASSES(class_barrel) = nclasses;
  BARREL_GET_NMETA_DOCS(class_barrel) = n_meta_docs;

  class_barrel->classnames = bow_int4str_new(0);
  for (i=0; i<nclasses; i++) {
    /* drop a class label in */
    bow_str2int(class_barrel->classnames,
                bow_int2str(src_barrel->classnames, i));
  }

  for (i=0; i<ndocs; i++) {
    bow_wv_free(docs[i]);
  }

  return class_barrel;
}
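/* Editor's note: a minimal sketch (not part of the original file) of the
 * idf-slot round trip used above: per-word weights are stashed in each
 * bow_dv's idf field at training time, and setup_docs() below reads them
 * back when svm_weight_style == WEIGHTS_PER_BARREL. The helper name and
 * the SVM_BASE_EXAMPLES guard are hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static void example_idf_round_trip (bow_barrel *barrel, float *weights_out)
{
  int wi;
  int nwords = bow_num_words();
  for (wi=0; wi<nwords; wi++) {
    bow_dv *dv = bow_wi2dvf_dv(barrel->wi2dvf, wi);
    /* a missing dv means the word never occurs; its weight defaults to 0 */
    weights_out[wi] = dv ? dv->idf : 0.0;
  }
}
#endif /* SVM_BASE_EXAMPLES */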
inline double evaluate_model (bow_wv **docs, double *weights, int *yvect,
                              double b, bow_wv *query_wv, int nsv)
{
  double sum, tmp;
  int i, j;
  /* walk the weight vector, but only count the nsv non-zero entries
   * (the support vectors) */
  for (i=j=0, sum=0.0; j<nsv; i++) {
    if (weights[i] != 0.0) {
      tmp = kernel(docs[i], query_wv);
      sum += yvect[i]*weights[i]*tmp;
      j++;
    }
  }
  return (sum - b);
}

/* similar to above, but only for when the kernel cache should be used */
inline double evaluate_model_cache (bow_wv **docs, double *weights,
                                    int *yvect, double b, bow_wv *query_wv,
                                    int nsv)
{
  double sum, tmp;
  int i, j;
  for (i=j=0, sum=0.0; j<nsv; i++) {
    if (weights[i] != 0.0) {
      tmp = svm_kernel_cache(docs[i], query_wv);
      sum += yvect[i]*weights[i]*tmp;
      j++;
    }
  }
  return (sum - b);
}

inline double evaluate_model_hyperplane (double *W, double b,
                                         bow_wv *query_wv)
{
  return (dprod_sd(query_wv, W) - b);
}
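/* Editor's note: an illustrative, stand-alone sketch (not in the original
 * file) of what evaluate_model() computes: the SVM decision function
 *   f(q) = sum_i y_i * alpha_i * K(x_i, q) - b.
 * Here the kernel is a plain dot product over dense vectors; the real code
 * dispatches through kernel()/svm_kernel_cache() on sparse bow_wv's.
 * All names and the SVM_BASE_EXAMPLES guard are hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static double example_decision_function (double **docs, double *alpha,
                                         int *y, double b, double *query,
                                         int ndocs, int nwords)
{
  double sum = 0.0;
  int i, j;
  for (i=0; i<ndocs; i++) {
    double k = 0.0;               /* linear kernel: K(x_i, q) = x_i . q */
    for (j=0; j<nwords; j++)
      k += docs[i][j] * query[j];
    sum += y[i] * alpha[i] * k;   /* only support vectors (alpha_i != 0)
                                   * actually contribute */
  }
  return sum - b;
}
#endif /* SVM_BASE_EXAMPLES */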
/* this & setup_docs are for "caching" the barrel into its wv form */
static void clear_model_cache ()
{
  int i;
  if (model_cache.barrel) {
    for (i=0; i<model_cache.ndocs; i++) {
      bow_wv_free(model_cache.docs[i]);
    }
    for (i=0; i<model_cache.nmodels; i++) {
      free(model_cache.indices[i]);
      free(model_cache.weights[i]);
      free(model_cache.yvect[i]);
      if (svm_weight_style == WEIGHTS_PER_MODEL) {
        free(model_cache.word_weights.sub_model[i]);
      }
      if (svm_kernel_type == 0) {
        free(model_cache.W[i]);
      }
    }
    free(model_cache.docs);
    free(model_cache.indices);
    free(model_cache.weights);
    free(model_cache.yvect);
    free(model_cache.bvect);
    free(model_cache.sizes);
    if (svm_weight_style == WEIGHTS_PER_MODEL) {
      free(model_cache.word_weights.sub_model);
    } else if (svm_weight_style == WEIGHTS_PER_BARREL) {
      free(model_cache.word_weights.barrel);
    }
    if (svm_kernel_type == 0) {
      free(model_cache.W);
    }
  }
  model_cache.barrel = NULL;
}

/* this fn fills *sub_docs with the m-th submodel (it pulls the docs
 * from the cache that setup_docs fills & then sets whatever weights
 * are necessary) */
/* the query vector should already be normalized */
void make_sub_model (int m, int weight_style, bow_wv ***sub_docs)
{
  bow_wv **docs;
  int *indices;
  float *weights;
  bow_we *v2;
  int i, j;

  docs = *sub_docs;
  for (j=0; j<model_cache.sizes[m]; j++) {
    docs[j] = model_cache.docs[model_cache.indices[m][j]];
  }

  if (weight_style) {
    indices = model_cache.indices[m];
    weights = model_cache.word_weights.sub_model[m];
    for (i=0; i<model_cache.sizes[m]; i++) {
      int n = docs[i]->num_entries;
      int di = indices[i];
      v2 = docs[i]->entry;
      for (j=0; j<n; j++) {
        /* recompute each entry from the per-model word weight and the
         * saved original (post-tf-transform) weight */
        v2[j].weight = weights[v2[j].wi] * model_cache.oweights[di][j];
      }
    }
  }
}

static void setup_docs (bow_barrel *barrel, int nclasses, int nmodels)
{
  bow_cdoc *cdoc;
  int classnum, c_old;
  bow_wv *dtmp;
  bow_dv_heap *heap;
  int ndocs;
  int nmeta_docs;
  int nwords;
  int total_words;
  int h, i, j, k, l;

  nmeta_docs = BARREL_GET_NMETA_DOCS(barrel);
  ndocs = barrel->cdocs->length - nmeta_docs;
  total_words = bow_num_words();

  clear_model_cache();

  model_cache.docs = (bow_wv **) malloc(sizeof(bow_wv *)*ndocs);
  model_cache.indices = (int **) malloc(sizeof(int *)*nmodels);
  model_cache.weights = (double **) malloc(sizeof(double *)*nmodels);
  model_cache.yvect = (int **) malloc(sizeof(int *)*nmodels);
  model_cache.bvect = (double *) malloc(sizeof(double)*nmodels);
  model_cache.sizes = (int *) malloc(sizeof(int)*nmodels);

  if (weight_type) {
    if (vote_type == PAIRWISE || weight_type == INFOGAIN) {
      svm_weight_style = WEIGHTS_PER_MODEL;
      model_cache.word_weights.sub_model =
        (float **) malloc(sizeof(float *)*nmodels);
      if (tf_transform_type)
        model_cache.oweights = (float **) malloc(sizeof(float *)*ndocs);
    } else {
      svm_weight_style = WEIGHTS_PER_BARREL;
      /* note: the original allocated sizeof(float *) per word here,
       * but the element type is float */
      model_cache.word_weights.barrel =
        (float *) malloc(sizeof(float)*total_words);
    }
  } else {
    svm_weight_style = NO_WEIGHTS;
  }

  if (svm_kernel_type == 0) {
    model_cache.W = (double **) malloc(sizeof(double *)*nmodels);
  } else {
    model_cache.W = NULL;
  }

  /* Create the heap of vectors of all documents */
  heap = bow_make_dv_heap_from_wi2dvf(barrel->wi2dvf);

  /* throw away the first 2 - they hold only ancillary info
   * (see the macros at the top of the file) */
  bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
  bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);

  /* grab the meta documents first & set up the arrays */
  for (h=0, l=2; h<nmodels; h++) {
    classnum = c_old = -1;
    for (nwords=j=0, k=-1; l<nmeta_docs; l++) {
      /* only go through for 2 different classes */
      cdoc = bow_cdocs_di2doc (barrel->cdocs, l);
      /* if this isn't what the last one was, this model is done */
      if ((cdoc->class != classnum) && (c_old != cdoc->class) && (k==1)) {
        break;
      }
      bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
      if ((cdoc->class != classnum) && (c_old != cdoc->class)) {
        if (k==-1) {
          /* do the stuff that needs doing once for each model */
          model_cache.bvect[h] = cdoc->normalizer;
          nwords = dtmp->num_entries;
          model_cache.indices[h] = (int *) malloc(sizeof(int)*nwords);
          model_cache.weights[h] = (double *) malloc(sizeof(double)*nwords);
          model_cache.yvect[h] = (int *) malloc(sizeof(int)*nwords);
        } else {
          /* in an already initialized model, but we need to grow arrays */
          nwords += dtmp->num_entries;
          model_cache.indices[h] =
            (int *) realloc(model_cache.indices[h], sizeof(int)*nwords);
          model_cache.weights[h] =
            (double *) realloc(model_cache.weights[h], sizeof(double)*nwords);
          model_cache.yvect[h] =
            (int *) realloc(model_cache.yvect[h], sizeof(int)*nwords);
        }
        k++;
        c_old = classnum;
        classnum = cdoc->class;
      } else {
        /* already seen this class - need to grow some arrays */
        nwords += dtmp->num_entries;
        model_cache.indices[h] =
          (int *) realloc(model_cache.indices[h], sizeof(int)*nwords);
        model_cache.weights[h] =
          (double *) realloc(model_cache.weights[h], sizeof(double)*nwords);
        model_cache.yvect[h] =
          (int *) realloc(model_cache.yvect[h], sizeof(int)*nwords);
      }
      for (i=0; j<nwords; j++, i++) {
        /* document indices are stored 1-based in the count field */
        model_cache.indices[h][j] = dtmp->entry[i].count - 1;
        model_cache.weights[h][j] = dtmp->entry[i].weight;
        model_cache.yvect[h][j] = ((k == 0) ? 1 : -1);
      }
    }
    model_cache.sizes[h] = nwords;
  }
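/* Editor's note: an illustrative sketch (not in the original file) of the
 * reweighting step in make_sub_model() above: each sparse entry's weight
 * is recomputed from a dense per-word weight table and the original
 * (post-tf-transform) weight saved in oweights. Names and the
 * SVM_BASE_EXAMPLES guard are hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static void example_reweight_doc (bow_wv *doc, const float *word_weights,
                                  const float *original_weights)
{
  int j;
  for (j=0; j<doc->num_entries; j++) {
    /* the dense table is indexed by word id; original_weights runs
     * parallel to the doc's sparse entries */
    doc->entry[j].weight = word_weights[doc->entry[j].wi]
                         * original_weights[j];
  }
}
#endif /* SVM_BASE_EXAMPLES */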
  /* if there are cached hyperplanes, let's grab them... */
  if (svm_kernel_type == 0) {
    for (i=0; i<nmodels; i++) {
      bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
      /* expand the sparse wv into a dense hyperplane vector */
      model_cache.W[i] = (double *) malloc(total_words*sizeof(double));
      for (h=j=0; j<dtmp->num_entries; h++) {
        if (h == dtmp->entry[j].wi) {
          model_cache.W[i][h] = dtmp->entry[j].weight;
          j++;
        } else {
          model_cache.W[i][h] = 0.0;
        }
      }
      for (; h<total_words; h++) {
        model_cache.W[i][h] = 0.0;
      }
    }
#ifdef DEBUG
    /* with 2 classes the two hyperplanes should be negations of each
     * other, so their sum must vanish (to within svm_epsilon_crit);
     * note: tmp was undeclared in the original */
    {
      double tmp;
      for (j=0; j<total_words; j++) {
        tmp = model_cache.W[0][j] + model_cache.W[1][j];
        assert(tmp >= -1*svm_epsilon_crit && tmp <= svm_epsilon_crit);
      }
    }
#endif
  }

  /* any kind of pairwise weights needs its own set of weights, since the
   * domain for each model is different... Info-gain also needs it, since
   * items relevant & useful in one model may be of no use in another
   * (there are always only 2 classes...) */
  if (svm_weight_style == WEIGHTS_PER_MODEL) {
    for (h=0; h<nmodels; h++) {
      bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
      model_cache.word_weights.sub_model[h] =
        (float *) malloc(sizeof(float)*total_words);
      for (i=j=0; i<total_words; i++) {
        if ((j < dtmp->num_entries) && (dtmp->entry[j].wi == i)) {
          model_cache.word_weights.sub_model[h][i] = dtmp->entry[j].weight;
          j++;
        } else {
          model_cache.word_weights.sub_model[h][i] = 0.0;
        }
      }
    }
  } else if (svm_weight_style == WEIGHTS_PER_BARREL) {
    bow_dv *dv;
    /* the per-word weights were stashed in the idf slots at training
     * time (see above) */
    for (h=0; h<total_words; h++) {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, h);
      if (dv) {
        model_cache.word_weights.barrel[h] = dv->idf;
      } else {
        model_cache.word_weights.barrel[h] = 0.0;
      }
    }
  }

  /* the rest of the documents are just the training documents - keep
   * grabbing them until they're gone */
  for (h=0; heap->length; h++) {
    bow_heap_next_wv(heap, barrel, &dtmp, bow_cdoc_yes);
    model_cache.docs[h] = bow_wv_new(dtmp->num_entries);
    for (j=0; j<dtmp->num_entries; j++) {
      model_cache.docs[h]->entry[j].wi = dtmp->entry[j].wi;
      model_cache.docs[h]->entry[j].count = dtmp->entry[j].count;
    }
    /* if (svm_kernel_type == FISHER) {
         for (j=0; j<model_cache.docs[h]->num_entries; j++) {
           model_cache.docs[h]->entry[j].weight =
             (float) model_cache.docs[h]->entry[j].count;
         }
         model_cache.docs[h]->normalizer = 1.0;
         continue;
       } */
    tf_transform(model_cache.docs[h]);

    /* per-model weights change with every model, so we need to keep
     * track of what the weights were initially (after the tf_transform) */
    if (svm_weight_style == WEIGHTS_PER_MODEL && tf_transform_type) {
      model_cache.oweights[h] =
        (float *) malloc(sizeof(float)*dtmp->num_entries);
      for (j=0; j<model_cache.docs[h]->num_entries; j++) {
        model_cache.oweights[h][j] = model_cache.docs[h]->entry[j].weight;
      }
    } else {
      /* otherwise, the weights can be set now... */
      if (svm_weight_style == NO_WEIGHTS) {
        bow_wv_normalize_weights_by_summing(model_cache.docs[h]);
        for (j=0; j<model_cache.docs[h]->num_entries; j++) {
          model_cache.docs[h]->entry[j].weight *=
            model_cache.docs[h]->normalizer;
        }
      } else {
        for (j=0; j<model_cache.docs[h]->num_entries; j++) {
          model_cache.docs[h]->entry[j].weight *=
            model_cache.word_weights.barrel[model_cache.docs[h]->entry[j].wi];
        }
        bow_wv_normalize_weights_by_summing(model_cache.docs[h]);
        for (j=0; j<model_cache.docs[h]->num_entries; j++) {
          model_cache.docs[h]->entry[j].weight *=
            model_cache.docs[h]->normalizer;
        }
      }
    }
    /* the weights in the svm_wv now hold the proper, tf-transformed &
     * normalized values (the oweights are the "original weights") */
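/* Editor's note: a minimal sketch (not in the original file) of the weight
 * normalization applied to each cached document above, assuming
 * bow_wv_normalize_weights_by_summing() sets normalizer = 1/sum(weights),
 * which matches how the loops above apply it: the entries end up summing
 * to 1. The SVM_BASE_EXAMPLES guard is hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static void example_l1_normalize (bow_wv *wv)
{
  double sum = 0.0;
  int j;
  for (j=0; j<wv->num_entries; j++)
    sum += wv->entry[j].weight;
  if (sum > 0.0) {
    wv->normalizer = 1.0 / sum;        /* what normalize-by-summing stores */
    for (j=0; j<wv->num_entries; j++)
      wv->entry[j].weight *= wv->normalizer;   /* the follow-up scaling */
  }
}
#endif /* SVM_BASE_EXAMPLES */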
  }

  model_cache.barrel = barrel;
  model_cache.ndocs = h;
  model_cache.nmodels = nmodels;
}

int svm_score (bow_barrel *barrel, bow_wv *query_wv, bow_score *bscores,
               int bscores_len, int loo_class)
{
  int ci;
  int max_nsv;
  double *model_vals;
  bow_score *myscores;
  float *base_qwv_weights;
  int nclasses;
  int nmodels;
  int ntied;
  int num_scores;
  int set_weights;
  bow_wv **sub_docs;
  int voting_scheme;
  int i, ii, j, k;

  /* This should be initialized in case BSCORES_LEN is larger than the
   * number of classes in the barrel */
  for (ci=0; ci<bscores_len; ci++) {
    bscores[ci].weight = 0.0;
    bscores[ci].di = 0;
    bscores[ci].name = "default";
  }

  base_qwv_weights = NULL;
  max_nsv = BARREL_GET_MAX_NSV(barrel);
  nclasses = BARREL_GET_NCLASSES(barrel);

  /* decode the voting scheme from the sign of max_nsv (set at training
   * time above) */
  if (max_nsv < 0) {
    max_nsv *= -1;
    nmodels = nclasses;
    voting_scheme = AGAINST_ALL;
  } else {
    nmodels = nclasses*(nclasses-1)/2;
    voting_scheme = PAIRWISE;
  }

  if
  /* (the listing is truncated here in the source) */
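/* Editor's note: an illustrative sketch (not in the original file) of the
 * model-count arithmetic svm_score() uses above: pairwise voting trains one
 * classifier per unordered class pair, one-against-all trains one per
 * class. E.g. nclasses = 4 gives 4*3/2 = 6 pairwise models vs. 4.
 * The SVM_BASE_EXAMPLES guard is hypothetical. */
#ifdef SVM_BASE_EXAMPLES
static int example_nmodels (int nclasses, int voting_scheme)
{
  return (voting_scheme == PAIRWISE)
    ? nclasses * (nclasses - 1) / 2    /* one model per {i,j} pair, i < j */
    : nclasses;                        /* one model per class */
}
#endif /* SVM_BASE_EXAMPLES */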