📄 active.c
字号:
/* Normalize them so they sum to one. */ nb_scores_sum = 0; for (class = 0; class < num_classes; class++) nb_scores_sum += nb_scores[committee][class]; assert (nb_scores_sum > 0); for (class = 0; class < num_classes; class++) nb_scores[committee][class] /= nb_scores_sum; } /* Initialize the mean class distribution for this document. */ for (class = 0; class < num_classes; class++) mean_class_dist[class] = 0; for (committee = 0; committee < committee_size; committee++) for (class = 0; class < num_classes; class++) mean_class_dist[class] += nb_scores[committee][class]; mean_class_sum = 0; for (class = 0; class < num_classes; class++) mean_class_sum += mean_class_dist[class]; assert (mean_class_sum > committee_size * 0.999); assert (mean_class_sum < committee_size * 1.001); for (class = 0; class < num_classes; class++) mean_class_dist[class] /= mean_class_sum; /* Set WEIGHT to KL-divergence-to-the-mean averaged over all committee members. */ scores[k].weight = 0; for (committee = 0; committee < committee_size; committee++) { for (si = 0; si < bow_barrel_num_classes (doc_barrel); si++) { class = scores[k].scores[committee][si].di; if (1e-100 < nb_scores[committee][class]) { scores[k].weight -= ((1.0 / committee_size) /* scale by kl-div of this document to this class */ * nb_scores[committee][class] * log (mean_class_dist[class] / nb_scores[committee][class])); } } } /* Scale the score by the document density. */ scores[k].weight *= cdoc->prior; /* KL divergence must be greater than or equal to 0 */ if (scores[k].weight < -0.1) bow_error("scores[k].weight < -0.1: %.20f", scores[k].weight); } /* Sort based on weight */ qsort (scores, total_unknown, sizeof (active_scores), active_scores_compare); /* Change doc types of those with highest entropy*/ for (k = 0; k < num_to_add ; k++) { bow_cdoc *doc; doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di); assert (doc); assert (doc->type == bow_doc_unlabeled); bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename, scores[k].weight); for (committee=0; committee < committee_size; committee++) bow_verbosify(bow_progress, " [(%d, %f) (%d, %f)]", scores[k].scores[committee][0].di, scores[k].scores[committee][0].weight, scores[k].scores[committee][1].di, scores[k].scores[committee][1].weight); bow_verbosify(bow_progress, "\n"); doc->type = bow_doc_train; } return;}/* select docs with the highest vote entropy (Dagan and Engelson) */voidactive_select_vote_entropy (bow_barrel *doc_barrel, active_scores *scores, int num_to_add, int total_unknown, int committee_size){ int num_classes = bow_barrel_num_classes (doc_barrel); double *mean_class_dist; double mean_class_sum; int committee; int class; int k; int si; mean_class_dist = alloca (sizeof (double) * num_classes); /* Calculate the entropy of the class labels, H(Class|d,Committee), where Class and Committee are random varibles, and put this in SCORES->WEIGHT. */ for (k = 0; k < total_unknown; k++) { scores[k].weight = 0; /* Initialize the scores to be 'votes' */ for (committee = 0; committee < committee_size; committee++) { scores[k].scores[committee][0].weight = 1.0; for (si = 1; si < num_classes; si++) scores[k].scores[committee][si].weight = 0.0; } /* Initialize the mean class distribution for this document. */ for (class = 0; class < num_classes; class++) mean_class_dist[class] = 0; for (committee = 0; committee < committee_size; committee++) for (class = 0; class < num_classes; class++) mean_class_dist[scores[k].scores[committee][class].di] += scores[k].scores[committee][class].weight; mean_class_sum = 0; for (class = 0; class < num_classes; class++) mean_class_sum += mean_class_dist[class]; assert (mean_class_sum > committee_size * 0.999); assert (mean_class_sum < committee_size * 1.001); for (class = 0; class < num_classes; class++) mean_class_dist[class] /= mean_class_sum; /* Calculate the entropy of the mean class distribution */ for (class = 0; class < bow_barrel_num_classes (doc_barrel); class++) { if (1e-100 < mean_class_dist[class]) { scores[k].weight -= (mean_class_dist[class] * log (mean_class_dist[class])); } } /* Entropy must be greater than or equal to 0 */ if (scores[k].weight < -0.1) bow_error("scores[k].weight < -0.1: %.20f", scores[k].weight); } /* Sort based on weight */ qsort (scores, total_unknown, sizeof (active_scores), active_scores_compare); /* Change doc types of those with highest entropy*/ for (k = 0; k < num_to_add; ) { int z; double top_score; int n; int j; /* find how many top ranked docs have same score */ top_score = scores[k].weight; for (z=k; z < total_unknown && scores[z].weight == top_score ; z++); /* add all with top score if won't max it out */ if (z < num_to_add) { for (n=k; n<z; n++, k++) { bow_cdoc *doc; doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di); assert (doc); assert (doc->type == bow_doc_unlabeled); bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename, scores[n].weight); for (committee=0; committee < committee_size; committee++) bow_verbosify(bow_progress, " %d", scores[n].scores[committee][0].di); bow_verbosify(bow_progress, "\n"); doc->type = bow_doc_train; } } else { /* need to randomly select some of the docs for labeling */ for (j=0, n=k; n < num_to_add; j++) { int si = (rand() % (z-k)) + k; int doci; bow_cdoc *doc; doci = scores[si].di; doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci); assert (doc); if (doc->type == bow_doc_unlabeled) { doc->type = bow_doc_train; bow_verbosify (bow_progress, "Labeling %s, weight %f", doc->filename, scores[si].weight); for (committee=0; committee < committee_size; committee++) bow_verbosify(bow_progress, " %d", scores[si].scores[committee][0].di); bow_verbosify(bow_progress, "\n"); n++; } if (j > doc_barrel->cdocs->length * 1000) bow_error ("Random number generator could not find enough " "unlabeled documents to convert."); } return; } } return;}voidactive_select_uncertain (bow_barrel *doc_barrel, active_scores *scores, int num_to_add, int total_unknown, int committee_size){ int k; assert(num_to_add <= total_unknown); assert(committee_size == 1); /* Make smallest top classification better */ for (k=0; k < total_unknown; k++) { scores[k].weight = -1 * scores[k].scores[0][0].weight; } /* sort based on weight */ qsort(scores, total_unknown, sizeof (active_scores), active_scores_compare); /* change doc types */ for (k=0; k < num_to_add; k++) { bow_cdoc *doc; doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di); assert(doc); assert(doc->type == bow_doc_unlabeled); bow_verbosify(bow_progress, "Labeling %s\n", doc->filename); doc->type = bow_doc_train; } return; }voidactive_select_relevant (bow_barrel *doc_barrel, active_scores *scores, int num_to_add, int total_unknown, int committee_size){ int k; assert(num_to_add <= total_unknown); assert(committee_size == 1); for (k=0; k < total_unknown; k++) { scores[k].weight = -1 *scores[k].scores[0][0].weight; } /* sort based on weight */ qsort(scores, total_unknown, sizeof (active_scores), active_scores_compare); /* change doc types */ for (k = total_unknown - num_to_add; k < total_unknown; k++) { bow_cdoc *doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di); assert(doc); assert(doc->type == bow_doc_unlabeled); bow_verbosify(bow_progress, "Labeling %s\n", doc->filename); doc->type = bow_doc_train; } return;}void active_select_length(bow_barrel *doc_barrel, active_scores *scores, int num_to_add, int total_unknown, int committee_size){ int k; assert(num_to_add <= total_unknown); /* set weight to the document length */ for (k=0; k < total_unknown; k++) { bow_cdoc *cdoc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di); scores[k].weight = cdoc->word_count; } /* sort based on weight */ qsort(scores, total_unknown, sizeof (active_scores), active_scores_compare); /* change doc types */ for (k = 0 ; k < num_to_add; k++) { bow_cdoc *doc = bow_cdocs_di2doc (doc_barrel->cdocs, scores[k].di); assert(doc); assert(doc->type == bow_doc_unlabeled); bow_verbosify(bow_progress, "Labeling %s, weight %f\n", doc->filename, scores[k].weight); doc->type = bow_doc_train; } return;}voidactive_select_random (bow_barrel *doc_barrel, active_scores *scores, int num_to_add, int total_unknown, int committee_size){ int j; int k; bow_cdoc *doc; assert(num_to_add <= total_unknown); for (j=0, k=0; k < num_to_add; j++) { int scoresi = rand() % total_unknown; int doci; doci = scores[scoresi].di; doc = bow_cdocs_di2doc (doc_barrel->cdocs, doci); assert (doc); if (doc->type == bow_doc_unlabeled) { doc->type = bow_doc_train; bow_verbosify(bow_progress, "Labeling %s\n", doc->filename); k++; } if (j > doc_barrel->cdocs->length * 1000) bow_error ("Random number generator could not find enough " "unlabeled documents to convert."); } return;}voidactive_select_stream_ve (bow_barrel *doc_barrel, active_scores *scores, int num_to_add, int total_unknown, int committee_size){ int j; bow_cdoc *doc; int num_classes = bow_barrel_num_classes (doc_barrel); double *mean_class_dist; double mean_class_sum; int committee; int class; int k; int si; assert(num_to_add <= total_unknown); mean_class_dist = alloca (sizeof (double) * num_classes); /* Calculate the entropy of the class labels, H(Class|d,Committee), where Class and Committee are random varibles, and put this in SCORES->WEIGHT. */ for (k = 0; k < total_unknown; k++) { scores[k].weight = 0; /* Initialize the scores to be 'votes' */ for (committee = 0; committee < committee_size; committee++) { scores[k].scores[committee][0].weight = 1.0; for (si = 1; si < num_classes; si++) scores[k].scores[committee][si].weight = 0.0; } /* Initialize the mean class distribution for this document. */ for (class = 0; class < num_classes; class++) mean_class_dist[class] = 0; for (committee = 0; committee < committee_size; committee++) for (class = 0; class < num_classes; class++) mean_class_dist[scores[k].scores[committee][class].di] += scores[k].scores[committee][class].weight; mean_class_sum = 0; for (class = 0; class < num_classes; class++) mean_class_sum += mean_class_dist[class]; assert (mean_class_sum > committee_size * 0.999); assert (mean_class_sum < committee_size * 1.001); for (class = 0; class < num_classes; class++) mean_class_dist[class] /= mean_class_sum; /* Calculate the entropy of the mean class distribution */ for (class = 0; class < bow_barrel_num_classes (doc_barrel); class++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -