active.c
      assert (num_classes < 100);

      /* Set the class_probs by picking numbers from the pr charts. */
      for (hi = 0; hi < num_classes; hi++)
	{
	  double score = scores[scorei].scores[member][hi].weight;
	  int class = scores[scorei].scores[member][hi].di;
	  int pr_index_low;
	  int pr_index_high;
	  int pr_index = 0;
	  int correct_count = 0;
	  int num_docs_in_window = 0;
	  int pri;

	  /* Find the range of pr-chart entries whose score ties SCORE. */
	  while ((pr_index < total_unknown)
		 && (pr_by_class[class][pr_index].score > score))
	    pr_index++;
	  pr_index_low = pr_index;
	  while ((pr_index < total_unknown)
		 && pr_by_class[class][pr_index].score == score)
	    pr_index++;
	  pr_index_high = pr_index;

#if 0
	  if (10 > pr_index)
	    correct_count += 10 - pr_index;
#endif

	  /* Note that we're including the test document here in the
	     stats.  Average the correct counts over a window of at
	     least ACTIVE_PR_WINDOW_SIZE documents centered on the tie
	     range, clipped to the chart bounds. */
	  for (pri = MAX (0, MIN (pr_index_low,
				  ((pr_index_low + pr_index_high
				    - active_pr_window_size) / 2)));
	       pri < MIN (MAX (pr_index_high,
			       ((pr_index_high + pr_index_low
				 + active_pr_window_size) / 2)),
			  total_unknown);
	       pri++)
	    {
	      correct_count += pr_by_class[class][pri].correct;
	      num_docs_in_window++;
	    }
	  prob_by_ci[class] =
	    (double) correct_count / ((double) num_docs_in_window);
	}

      /* Normalize the probs to sum to one. */
      for (ci = 0; ci < num_classes; ci++)
	total += prob_by_ci[ci];
      for (hi = 0; hi < num_classes; hi++)
	scores[scorei].scores[member][hi].weight =
	  prob_by_ci[scores[scorei].scores[member][hi].di] / total;
    }
  }
}

/* Return the entropy of the words in the document WV. */
float
active_document_entropy (bow_wv *wv)
{
  float ret = 0;
  float wv_word_count = 0;
  int wvi;
  float pr_w;

  for (wvi = 0; wvi < wv->num_entries; wvi++)
    wv_word_count += wv->entry[wvi].count;

  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      pr_w = wv->entry[wvi].count / wv_word_count;
      ret -= pr_w * log (pr_w);
    }
  return ret;
}


/* Select-method routines. */

/* Comparison function for sorting on the selection criterion;
   sorts highest weight first. */
int
active_scores_compare (const void *x, const void *y)
{
  if (((active_scores *)x)->weight > ((active_scores *)y)->weight)
    return -1;
  else if (((active_scores *)x)->weight == ((active_scores *)y)->weight)
    return 0;
  else
    return 1;
}

/* Select docs with the highest KL-divergence to the mean. */
void
active_select_qbc (bow_barrel *doc_barrel, active_scores *scores,
		   int num_to_add, int total_unknown, int committee_size)
{
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double *mean_class_dist;
  double mean_class_sum;
  int committee;
  int class;
  int k;

  mean_class_dist = alloca (sizeof (double) * num_classes);

  /* Calculate the committee's disagreement over the class labels of
     each document, where Class and Committee are random variables,
     and put this in SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      scores[k].weight = 0;

      /* Initialize the mean class distribution for this document. */
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] = 0;
      for (committee = 0; committee < committee_size; committee++)
	for (class = 0; class < num_classes; class++)
	  mean_class_dist[scores[k].scores[committee][class].di] +=
	    scores[k].scores[committee][class].weight;

      /* Each member's distribution sums to one, so the totals should
	 sum to the committee size. */
      mean_class_sum = 0;
      for (class = 0; class < num_classes; class++)
	mean_class_sum += mean_class_dist[class];
      assert (mean_class_sum > committee_size * 0.999);
      assert (mean_class_sum < committee_size * 1.001);
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] /= mean_class_sum;

      /* Set WEIGHT to the KL-divergence-to-the-mean averaged over all
	 committee members. */
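      /* Concretely, for member m with class distribution P_m and
	 committee mean P_bar, the loop below accumulates
	     WEIGHT += (1/committee_size)
		       * sum_c P_m(c) * log (P_m(c) / P_bar(c)),
	 i.e. the average of D(P_m || P_bar) over the members; classes
	 with essentially zero probability are skipped, since
	 p * log p -> 0 as p -> 0. */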
      for (committee = 0; committee < committee_size; committee++)
	{
	  for (class = 0; class < bow_barrel_num_classes (doc_barrel);
	       class++)
	    {
	      if (1e-100 < scores[k].scores[committee][class].weight)
		{
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     * scores[k].scores[committee][class].weight
		     * log (mean_class_dist[scores[k].scores[committee][class].di]
			    / scores[k].scores[committee][class].weight));
		  if (scores[k].weight < -0.1)
		    bow_error ("scores[k].weight < -0.1: %.20f, %.20f",
			       scores[k].weight,
			       log (mean_class_dist[scores[k].scores[committee][class].di]
				    / scores[k].scores[committee][class].weight));
		}
	    }
	}

      /* KL divergence must be greater than or equal to 0. */
      if (scores[k].weight < -0.1)
	bow_error ("scores[k].weight < -0.1: %.20f", scores[k].weight);
    }

  /* Negate all weights if we want the lowest KL divergences instead. */
  if (active_qbc_low_kl)
    {
      for (k = 0; k < total_unknown; k++)
	scores[k].weight = -1 * scores[k].weight;
    }

  /* Sort based on weight. */
  qsort (scores, total_unknown, sizeof (active_scores),
	 active_scores_compare);

  /* Change the doc type of those with the highest selection weight. */
  for (k = 0; k < num_to_add; k++)
    {
      bow_cdoc *doc;

      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);
      assert (doc);
      assert (doc->type == bow_doc_unlabeled);
      bow_verbosify (bow_progress, "Labeling %s, weight %f",
		     doc->filename, scores[k].weight);
      for (committee = 0; committee < committee_size; committee++)
	bow_verbosify (bow_progress, " [(%d, %f) (%d, %f)]",
		       scores[k].scores[committee][0].di,
		       scores[k].scores[committee][0].weight,
		       scores[k].scores[committee][1].di,
		       scores[k].scores[committee][1].weight);
      bow_verbosify (bow_progress, "\n");
      doc->type = bow_doc_train;
    }
  return;
}

/* Select docs with the highest weighted KL-divergence to the mean. */
void
active_select_weighted_kl (bow_barrel *doc_barrel, active_scores *scores,
			   int num_to_add, int total_unknown,
			   int committee_size)
{
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double mean_class_dist[num_classes];
  double mean_class_sum;
  double **nb_scores;
  int committee;
  int class;
  int k;
  bow_cdoc *cdoc;
  double nb_scores_sum;
  double nb_scores_max;
  int si;			/* an index into the sorted list of scores */

  assert (num_to_add < total_unknown);
  assert (em_cross_entropy == 1);

  /* Allocate space to store Naive Bayes scores. */
  nb_scores = alloca (sizeof (double *) * committee_size);
  for (committee = 0; committee < committee_size; committee++)
    nb_scores[committee] = alloca (sizeof (double) * num_classes);

  /* Calculate the weighted KL divergence of the class labels and put
     this in SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      /* Fill in the Naive Bayes scores array for this K'th document. */
      cdoc = bow_array_entry_at_index (doc_barrel->cdocs, scores[k].di);
      for (committee = 0; committee < committee_size; committee++)
	{
	  /* Undo the document length normalization. */
	  for (si = 0; si < num_classes; si++)
	    nb_scores[committee][scores[k].scores[committee][si].di] =
	      (scores[k].scores[committee][si].weight
	       * (cdoc->word_count + 1));

	  /* Rescale the scores so the maximum is zero. */
	  nb_scores_max = -DBL_MAX;
	  for (class = 0; class < num_classes; class++)
	    if (nb_scores_max < nb_scores[committee][class])
	      nb_scores_max = nb_scores[committee][class];
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] -= nb_scores_max;

	  /* Take the exponent of the scores to make them
	     probabilities. */
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] = exp (nb_scores[committee][class]);

	  /* Normalize them so they sum to one. */
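	  /* The max-shift, exponentiation, and the normalization below
	     together form the standard log-sum-exp pattern: they
	     recover per-class probabilities from log scores without
	     exp() underflowing on very negative values.  (A standalone
	     sketch of this pattern appears after active_select_dkl
	     below.) */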
	  nb_scores_sum = 0;
	  for (class = 0; class < num_classes; class++)
	    nb_scores_sum += nb_scores[committee][class];
	  assert (nb_scores_sum > 0);
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] /= nb_scores_sum;
	}

      /* Initialize the mean class distribution for this document. */
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] = 0;
      for (committee = 0; committee < committee_size; committee++)
	for (class = 0; class < num_classes; class++)
	  mean_class_dist[class] += nb_scores[committee][class];
      mean_class_sum = 0;
      for (class = 0; class < num_classes; class++)
	mean_class_sum += mean_class_dist[class];
      assert (mean_class_sum > committee_size * 0.999);
      assert (mean_class_sum < committee_size * 1.001);
      for (class = 0; class < num_classes; class++)
	mean_class_dist[class] /= mean_class_sum;

      /* Set WEIGHT to the KL-divergence-to-the-mean averaged over all
	 committee members. */
      scores[k].weight = 0;
      for (committee = 0; committee < committee_size; committee++)
	{
	  for (si = 0; si < bow_barrel_num_classes (doc_barrel); si++)
	    {
	      class = scores[k].scores[committee][si].di;
	      if (1e-100 < nb_scores[committee][class])
		{
		  /* xxx Change this back to regular old WKL! */
#define UNSUPERVISED_DENSITY 1
#if UNSUPERVISED_DENSITY
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     /* scale by kl-div of this document to this class */
		     * nb_scores[committee][class]
		     * log (mean_class_dist[class]
			    / nb_scores[committee][class]));
#elif 1
		  /* Used for the ICML submission. */
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     /* scale by kl-div of this document to this class */
		     * exp (scores[k].scores[committee][si].weight
			    + cdoc->normalizer)
		     * nb_scores[committee][class]
		     * log (mean_class_dist[class]
			    / nb_scores[committee][class]));
#else
		  scores[k].weight -=
		    ((1.0 / committee_size)
		     * (cdoc->word_count + 1)
		     /* scale by perplexity of this document in this
			class */
		     * exp (scores[k].scores[committee][si].weight)
		     * nb_scores[committee][class]
		     * log (mean_class_dist[class]
			    / nb_scores[committee][class]));
#endif
		}
	    }
	}

#if UNSUPERVISED_DENSITY
      /* Scale the score by the document density. */
      scores[k].weight *= cdoc->prior;
#endif

      /* KL divergence must be greater than or equal to 0. */
      if (scores[k].weight < -0.1)
	bow_error ("scores[k].weight < -0.1: %.20f", scores[k].weight);
    }

  /* Sort based on weight. */
  qsort (scores, total_unknown, sizeof (active_scores),
	 active_scores_compare);

  /* Change the doc type of those with the highest selection weight. */
  for (k = 0; k < num_to_add; k++)
    {
      bow_cdoc *doc;

      doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di);
      assert (doc);
      assert (doc->type == bow_doc_unlabeled);
      bow_verbosify (bow_progress, "Labeling %s, weight %f",
		     doc->filename, scores[k].weight);
      for (committee = 0; committee < committee_size; committee++)
	bow_verbosify (bow_progress, " [(%d, %f) (%d, %f)]",
		       scores[k].scores[committee][0].di,
		       scores[k].scores[committee][0].weight,
		       scores[k].scores[committee][1].di,
		       scores[k].scores[committee][1].weight);
      bow_verbosify (bow_progress, "\n");
      doc->type = bow_doc_train;
    }
  return;
}
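
/* The following is a minimal, self-contained sketch (not part of
   libbow) of the "KL divergence to the mean" score that the selectors
   in this file accumulate into SCORES->WEIGHT, stripped of the
   per-term weightings.  DIST is one committee member's class
   distribution and MEAN_DIST the committee mean; both sum to one.
   The helper name is hypothetical, and it assumes <math.h> is already
   included, since this file calls log (). */
static double
kl_to_mean_sketch (const double *dist, const double *mean_dist,
		   int num_classes)
{
  double kl = 0;
  int class;

  for (class = 0; class < num_classes; class++)
    if (dist[class] > 1e-100)	/* skip classes with essentially no mass */
      kl += dist[class] * log (dist[class] / mean_dist[class]);
  return kl;			/* always >= 0, matching the checks above */
}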
/* Select docs with the highest weighted KL-divergence to the mean.
   Needs cross-entropy scores! */
void
active_select_dkl (bow_barrel *doc_barrel, active_scores *scores,
		   int num_to_add, int total_unknown, int committee_size)
{
  int num_classes = bow_barrel_num_classes (doc_barrel);
  double mean_class_dist[num_classes];
  double mean_class_sum;
  double **nb_scores;
  int committee;
  int class;
  int k;
  bow_cdoc *cdoc;
  double nb_scores_sum;
  double nb_scores_max;
  int si;			/* an index into the sorted list of scores */

  assert (num_to_add < total_unknown);
  assert (em_cross_entropy == 1);

  /* Allocate space to store Naive Bayes scores. */
  nb_scores = alloca (sizeof (double *) * committee_size);
  for (committee = 0; committee < committee_size; committee++)
    nb_scores[committee] = alloca (sizeof (double) * num_classes);

  /* Calculate the weighted KL divergence of the class labels and put
     this in SCORES->WEIGHT. */
  for (k = 0; k < total_unknown; k++)
    {
      /* Fill in the Naive Bayes scores array for this K'th document. */
      cdoc = bow_array_entry_at_index (doc_barrel->cdocs, scores[k].di);
      for (committee = 0; committee < committee_size; committee++)
	{
	  /* Undo the document length normalization. */
	  for (si = 0; si < num_classes; si++)
	    nb_scores[committee][scores[k].scores[committee][si].di] =
	      (scores[k].scores[committee][si].weight
	       * (cdoc->word_count + 1));

	  /* Rescale the scores so the maximum is zero. */
	  nb_scores_max = -DBL_MAX;
	  for (class = 0; class < num_classes; class++)
	    if (nb_scores_max < nb_scores[committee][class])
	      nb_scores_max = nb_scores[committee][class];
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] -= nb_scores_max;

	  /* Take the exponent of the scores to make them
	     probabilities. */
	  for (class = 0; class < num_classes; class++)
	    nb_scores[committee][class] = exp (nb_scores[committee][class]);
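
/* Below is a minimal, self-contained sketch (not part of libbow) of
   the rescale / exponentiate / normalize pipeline that the committee
   loop above is performing: the log-sum-exp pattern for turning log
   scores into a probability distribution.  The helper name is
   hypothetical; it assumes <math.h> and <float.h> are already
   included, since this file uses exp () and DBL_MAX. */
static void
log_scores_to_probs_sketch (double *log_scores, int num_classes)
{
  double max = -DBL_MAX;
  double sum = 0;
  int class;

  /* Shift by the maximum so exp () cannot underflow to all zeros;
     the largest entry becomes exp (0) = 1. */
  for (class = 0; class < num_classes; class++)
    if (log_scores[class] > max)
      max = log_scores[class];
  for (class = 0; class < num_classes; class++)
    {
      log_scores[class] = exp (log_scores[class] - max);
      sum += log_scores[class];
    }
  /* Normalize in place so the entries sum to one. */
  for (class = 0; class < num_classes; class++)
    log_scores[class] /= sum;
}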