📄 active.c
字号:
{
	  /* Guard against log(0): only classes with non-negligible mass
	     contribute to the entropy sum. */
	  if (1e-100 < mean_class_dist[class])
	    {
	      scores[k].weight -= (mean_class_dist[class]
				   * log (mean_class_dist[class]));
	    }
	}
      /* adjust for the correct log factor */
      scores[k].weight *= 1 / log(2);
      /* convert to a probability */
      scores[k].weight /= log (bow_barrel_num_classes (doc_barrel)) / log(2);
      /* multiply in the epsilon factor */
      scores[k].weight *= active_stream_epsilon;
      /* Entropy must be greater than or equal to 0 */
      if (scores[k].weight < -0.1)
	bow_error("scores[k].weight < -0.1: %.20f", scores[k].weight);
    }

  /* select some documents randomly according to the weights:
     rejection sampling — pick a random scores[] entry and accept it
     with probability equal to its (epsilon-scaled) weight. */
  for (j=0, k=0; k < num_to_add; j++)
    {
      int scoresi = rand() % total_unknown;
      int doci;
      double coin_flip;

      doci = scores[scoresi].di;
      doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci);
      assert (doc);
      if (doc->type == bow_doc_unlabeled)
	{
	  coin_flip = bow_random_double (0,1);
	  if (scores[scoresi].weight > coin_flip)
	    {
	      /* Accepted: promote this document into the training set. */
	      doc->type = bow_doc_train;
	      k++;
	      bow_verbosify (bow_progress, "Labeling %s, weight %f, flip %f",
			     doc->filename, scores[scoresi].weight,
			     coin_flip);
	      for (committee=0; committee < committee_size; committee++)
		bow_verbosify(bow_progress, " [(%d, %f) (%d, %f)]",
			      scores[scoresi].scores[committee][0].di,
			      scores[scoresi].scores[committee][0].weight,
			      scores[scoresi].scores[committee][1].di,
			      scores[scoresi].scores[committee][1].weight);
	      bow_verbosify(bow_progress, "\n");
	    }
	}
      /* Bail out if sampling has gone on implausibly long without
	 finding enough unlabeled documents to accept. */
      if (j > doc_barrel->cdocs->length * 1000)
	bow_error ("Random number generator could not find enough "
		   "unlabeled documents to convert.");
    }
  return;
}

/* Stream-based selection by KL-divergence-to-the-mean: score each of
   the TOTAL_UNKNOWN documents in SCORES by the average KL divergence
   of each committee member's class distribution from the committee
   mean, scale the score into a selection probability, then randomly
   promote NUM_TO_ADD unlabeled documents to training documents. */
void
active_select_stream_kl (bow_barrel *doc_barrel, active_scores *scores,
			 int num_to_add, int total_unknown,
			 int committee_size)
{
  int j;
  bow_cdoc *doc;
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double *mean_class_dist;	/* committee-mean class distribution */
  double mean_class_sum;
  int committee;
  int class;
  int k;

  /* Stack-allocated scratch distribution, one slot per class. */
  mean_class_dist = alloca (sizeof (double) * num_classes);

  assert(num_to_add <= total_unknown);
  /* ensures our max-kl for probability mapping is correct */
  assert(bow_barrel_num_classes(doc_barrel) >= committee_size);

  /* Calculate the entropy of the class labels, H(Class|d,Committee),
     where Class and Committee are random varibles, and put this in
     SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      scores[k].weight = 0;

      /* Initialize the mean class distribution for this document. */
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] = 0;
      /* Accumulate each member's per-class weight, indexed by the
	 class index stored in .di (entries may be in score order,
	 not class order). */
      for (committee = 0; committee < committee_size; committee++)
	for (class = 0; class < num_classes; class++)
	  mean_class_dist[scores[k].scores[committee][class].di] +=
	    scores[k].scores[committee][class].weight;
      mean_class_sum = 0;
      for (class = 0; class < num_classes; class++)
	mean_class_sum += mean_class_dist[class];
      /* Each member's distribution sums to 1, so the total should be
	 very close to COMMITTEE_SIZE. */
      assert (mean_class_sum > committee_size * 0.999);
      assert (mean_class_sum < committee_size * 1.001);
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] /= mean_class_sum;

      /* Set WEIGHT to KL-divergence-to-the-mean averaged over all
	 committee members. */
      for (committee = 0; committee < committee_size; committee++)
	{
	  for (class = 0; class < bow_barrel_num_classes (doc_barrel);
	       class++)
	    {
	      /* Skip near-zero member weights to avoid log(0). */
	      if (1e-100 < scores[k].scores[committee][class].weight)
		{
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     * scores[k].scores[committee][class].weight
		     * log (mean_class_dist
			    [scores[k].scores[committee][class].di]
			    / scores[k].scores[committee][class].weight));
		  if (scores[k].weight < -0.1)
		    bow_error("scores[k].weight < -0.1: %.20f, %.20f",
			      scores[k].weight,
			      log (mean_class_dist
				   [scores[k].scores[committee][class].di]
				   / scores[k].scores[committee][class].weight));
		}
	    }
	}
      /* KL divergence must be greater than or equal to 0 */
      if (scores[k].weight < -0.1)
	bow_error("scores[k].weight < -0.1: %.20f", scores[k].weight);
      /* adjust for the correct log factor */
      scores[k].weight *= 1.0 / log(2);
      /* convert to a probability by scaling with the max kl-to-the-mean */
      scores[k].weight /= -1.0 * log (1.0 / (double) committee_size) / log(2);
      /* multiply in the epsilon factor */
      scores[k].weight *= active_stream_epsilon;
    }

  /* select some documents randomly according to the weights
     (rejection sampling, identical scheme to the entropy variant
     above). */
  for (j=0, k=0; k < num_to_add; j++)
    {
      int scoresi = rand() % total_unknown;
      int doci;
      double coin_flip;

      doci = scores[scoresi].di;
      doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci);
      assert (doc);
      if (doc->type == bow_doc_unlabeled)
	{
	  coin_flip = bow_random_double (0,1);
	  if (scores[scoresi].weight > coin_flip)
	    {
	      doc->type = bow_doc_train;
	      k++;
	      bow_verbosify (bow_progress, "Labeling %s, weight %f, flip %f",
			     doc->filename, scores[scoresi].weight,
			     coin_flip);
	      for (committee=0; committee < committee_size; committee++)
		bow_verbosify(bow_progress, " [(%d, %f) (%d, %f)]",
			      scores[scoresi].scores[committee][0].di,
			      scores[scoresi].scores[committee][0].weight,
			      scores[scoresi].scores[committee][1].di,
			      scores[scoresi].scores[committee][1].weight);
	      bow_verbosify(bow_progress, "\n");
	    }
	}
      if (j > doc_barrel->cdocs->length * 1000)
	bow_error ("Random number generator could not find enough "
		   "unlabeled documents to convert.");
    }
  return;
}

/* Functions for calculating document density. */

/* Predicate: should this document participate in the density
   calculation?  True for train, unlabeled, pool and waiting docs. */
int
active_cdoc_is_used_for_density (bow_cdoc *cdoc)
{
  return ((cdoc->type == bow_doc_train)
	  || (cdoc->type == bow_doc_unlabeled)
	  || (cdoc->type == bow_doc_pool)
	  || (cdoc->type == bow_doc_waiting));
}

/* Given a document barrel, set the CDOC->NORMALIZER to the document
   word entropy.  Return the sum of the background cross entropies of
   all the documents.  Assumes that IDF has already been set to
   Pr(w) */
double
active_doc_barrel_set_entropy (bow_barrel *barrel)
{
  bow_wv *wv;
  bow_dv_heap *heap;
  int wvi;
  double pr_w_d;	/* Pr(w|d): word's relative frequency in doc */
  double entropy;
  double entropy_sum = 0;
  double total_background_kl = 0;
  int di;
  bow_cdoc *cdoc;
  bow_dv *dv;
  double word_kl;

  heap = bow_test_new_heap (barrel);

  /* xxx Make sure to update CDOC->WORD_COUNT for a new vocabulary! */
  while ((di = bow_heap_next_wv (heap, barrel, &wv,
				 active_cdoc_is_used_for_density))
	 != -1)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, di);
      entropy = 0;
      for (wvi = 0; wvi < wv->num_entries; wvi++)
	{
	  pr_w_d = ((double)wv->entry[wvi].count) / cdoc->word_count;
	  entropy -= pr_w_d * log (pr_w_d);
	  /* Cross-entropy term against the smoothed background
	     distribution (DV->IDF holds Pr(w), set by
	     active_doc_barrel_set_pr_w). */
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, wv->entry[wvi].wi);
	  word_kl = (- pr_w_d * log ((1 - active_alpha) * dv->idf));
	  assert (word_kl >= 0);
	  total_background_kl += word_kl;
	}
      /* KL(d || background) = cross-entropy - entropy. */
      total_background_kl -= entropy;
      cdoc->normalizer = entropy;
      entropy_sum += entropy;
    }
  return total_background_kl;
}

/* Given a document barrel, set the WI2DVF->IDF to Pr(w) */
void
active_doc_barrel_set_pr_w (bow_barrel *barrel)
{
  int wi, max_wi, dvi;
  int total_num_words;
  bow_dv *dv;

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  total_num_words = 0;
  /* First pass: per-word raw counts into DV->IDF, plus the corpus
     total. */
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (!dv)
	continue;
      dv->idf = 0;
      for (dvi = 0; dvi < dv->length; dvi++)
	{
	  dv->idf += dv->entry[dvi].count;
	  total_num_words += dv->entry[dvi].count;
	}
    }
  /* Second pass: normalize counts into probabilities. */
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (!dv)
	continue;
      dv->idf /= total_num_words;
    }
}

/* Return the density of document WV, calculated using a KL divergence
   distance to all other documents. */
float
active_wv_density (bow_wv *wv, bow_barrel *barrel, float background_kl)
{
  int wvi;
  /* bow_bitvec *document_touched =
     bow_bitvec_new (1, barrel->cdocs->length); */
  double pr_w_d;
  double pr_w_wv;
  double pr_w_wv_missing;
  double total_kl;	/* sum of KL divergence to all other docs */
  bow_dv *dv;
  int dvi;
  bow_cdoc *cdoc;

  /* Set to background KL, that a document with no words would
     have. */
  pr_w_wv_missing = 1.0 / (wv->num_entries + barrel->wi2dvf->num_words);
  /*total_kl = - barrel->cdocs->length * log (pr_w_wv_missing);*/
  total_kl = background_kl;
  /* NaN check: x == x is false iff x is NaN. */
  assert (total_kl == total_kl);
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wv->entry[wvi].wi);
      if (!dv)
	continue;
      /* Pr(w|WV), smoothed with the background Pr(w) via
	 ACTIVE_ALPHA.  NOTE(review): the maximum-likelihood term
	 divides by WV->NUM_ENTRIES (distinct words), not the
	 document's total word count — confirm this is intended. */
      pr_w_wv = ((active_alpha * ((double)wv->entry[wvi].count)
		  / wv->num_entries)
		 + ((1 - active_alpha) * dv->idf));
      for (dvi = 0; dvi < dv->length; dvi++)
	{
	  cdoc = bow_array_entry_at_index (barrel->cdocs,
					   dv->entry[dvi].di);
	  pr_w_d = ((double)dv->entry[dvi].count) / cdoc->word_count;
	  /* Remove from the total what we said its contribution
	     would be above in the background calculation. */
	  /*total_kl += pr_w_d * log (pr_w_wv_missing);*/
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, wv->entry[wvi].wi);
	  total_kl += pr_w_d * log ((1 - active_alpha) * dv->idf);
	  assert (total_kl == total_kl);
	  /* Add in the true contribution */
	  total_kl -= pr_w_d * log (pr_w_wv);
	  assert (total_kl == total_kl);
	}
    }
  return total_kl;
}

/* Given a document barrel, set the CDOC->PRIOR to the document
   density, using a KL divergence distance to all other documents.
   Uses train and unlabeled documents.  Also sets the
   CDOC->NORMALIZER to the document entropy */
void
active_doc_barrel_set_density (bow_barrel *barrel)
{
  bow_dv_heap *heap;
  int di;
  bow_wv *wv;
  bow_cdoc *cdoc;
  double background_kl;

  active_doc_barrel_set_pr_w (barrel);
  background_kl = active_doc_barrel_set_entropy (barrel);
  heap = bow_test_new_heap (barrel);
  while ((di = bow_heap_next_wv (heap, barrel, &wv,
				 active_cdoc_is_used_for_density))
	 != -1)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, di);
      cdoc->prior = active_wv_density (wv, barrel, background_kl);
      /* Map the KL distance into (0,1] with an exponential decay
	 controlled by ACTIVE_BETA. */
      cdoc->prior = exp (- active_beta * cdoc->prior
			 / barrel->cdocs->length);
      /* printf ("%10g %s\n", cdoc->prior, cdoc->filename); */
    }
}

/* Create a class barrel using active learning */
bow_barrel *
active_learn (bow_barrel *doc_barrel)
{
  bow_barrel *vpc_barrel = NULL;	/* the vector-per-class barrel */
  int max_ci;
  int ci;
  int di;
  int mi;
  int round_num;
  int actual_num_hits;
  int num_unlabeled_docs = 0;
  int orig_num_unlabeled_docs;
  bow_dv_heap *test_heap;	/* we'll extract test WV's from here */
  bow_wv *query_wv;
  active_scores *scores;
  bow_cdoc *doc_cdoc;
  bow_cdoc *class_cdoc;
  rainbow_method *secondary_method;

  /* Set the CDOC->PRIOR to the "density" value. */
  if (active_selection_method == dkl)
    active_doc_barrel_set_density (doc_barrel);

  /* initialize variables */
  max_ci = bow_barrel_num_classes(doc_barrel);
  secondary_method =
    (rainbow_method*) bow_method_at_name (active_secondary_method);

  /* change all but vpc_with_weights */
  doc_barrel->method->set_weights = secondary_method->set_weights;
  doc_barrel->method->scale_weights = secondary_method->scale_weights;
  doc_barrel->method->normalize_weights =
    secondary_method->normalize_weights;
  doc_barrel->method->vpc_set_priors = secondary_method->vpc_set_priors;
  doc_barrel->method->score = secondary_method->score;
  doc_barrel->method->wv_set_weights = secondary_method->wv_set_weights;
  doc_barrel->method->wv_normalize_weights =
    secondary_method->wv_normalize_weights;
  doc_barrel->method->free_barrel = secondary_method->free_barrel;
  doc_barrel->method->params = secondary_method->params;

  /* find the binary positive class, if needed */
  if (active_binary_pos_classname != NULL)
    {
      assert(bow_barrel_num_classes(doc_barrel) == 2);
      for (ci = 0; ci < bow_barrel_num_classes(doc_barrel); ci++)
	{
	  if (!strcmp(active_binary_pos_classname,
		      filename_to_classname
		      (bow_barrel_classname_at_index (doc_barrel, ci))))
	    {
	      active_binary_pos_ci = ci;
	      break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -