em.c
        counts[n % bow_em_multi_hump_neg]++;

        /* reassign the negative docs */
        for (di = 0; di < doc_barrel->cdocs->length; di++) {
          bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
          int new_class;

          if (cdoc->type != bow_doc_train || cdoc->class == binary_pos_ci)
            continue;
          assert (yet_to_find > 0);

          /* find a new class */
          for (new_class = rand () % bow_em_multi_hump_neg;
               counts[new_class] == 0;
               new_class = rand () % bow_em_multi_hump_neg);
          yet_to_find--;
          counts[new_class]--;

          /* assign it to the right hump */
          if (new_class != 0) {
            cdoc->class_probs[new_class + 1] = 1.0;
            cdoc->class_probs[binary_neg_ci] = 0.0;
          }
        }
        assert (yet_to_find == 0);
      } else if (em_multi_hump_init == bow_em_init_spread) {
        bow_random_set_seed ();

        /* spread each negative doc randomly over neg components */
        for (di = 0; di < doc_barrel->cdocs->length; di++) {
          bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
          float total = 0;

          if (cdoc->type != bow_doc_train || cdoc->class == binary_pos_ci)
            continue;
          for (ci = 0; ci < max_new_ci; ci++) {
            if (ci == binary_pos_ci) {
              cdoc->class_probs[ci] = 0.0;
            } else {
              cdoc->class_probs[ci] = (float) (rand () % 100) + 1;
              total += cdoc->class_probs[ci];
            }
          }
          for (ci = 0; ci < max_new_ci; ci++)
            cdoc->class_probs[ci] /= total;
        }
      } else
        bow_error ("No initialization for this type");
    }

    /* set priors using just the known docs if we'll need them for
       setting class_probs */
    if (em_unlabeled_start == em_start_prior) {
      assert (num_train_docs > 0);
      assert (!bow_uniform_class_priors);
      (*doc_barrel->method->vpc_set_priors) (vpc_barrel, doc_barrel);
    } else {
      for (ci = 0; ci < max_new_ci; ci++) {
        bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
        cdoc->prior = 0.0;
      }
    }

    /* set the class probs of all the unlabeled docs to determine the
       EM starting point */
    for (di = 0; di < doc_barrel->cdocs->length; di++) {
      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);

      if (cdoc->type != bow_doc_unlabeled)
        continue;

      if (em_unlabeled_start == em_start_zero) {
        /* set class_probs as all zeros (ignore them for first M step) */
        for (ci = 0; ci < max_new_ci; ci++)
          cdoc->class_probs[ci] = 0.0;
      } else if (em_unlabeled_start == em_start_random) {
        float total = 0;

        /* if there are no labeled docs, randomly assign class probs */
        bow_random_set_seed ();
        for (ci = 0; ci < max_new_ci; ci++) {
          cdoc->class_probs[ci] = (float) (rand () % 100);
          total += cdoc->class_probs[ci];
        }
        for (ci = 0; ci < max_new_ci; ci++)
          cdoc->class_probs[ci] *= unlabeled_normalizer / total;
      } else if (em_unlabeled_start == em_start_prior) {
        /* distribute class_probs according to priors on just the known */
        assert (!bow_em_multi_hump_neg && !bow_uniform_class_priors);
        assert (num_train_docs > 0);
        for (ci = 0; ci < max_new_ci; ci++) {
          bow_cdoc *class_cdoc =
            bow_array_entry_at_index (vpc_barrel->cdocs, ci);
          cdoc->class_probs[ci] = class_cdoc->prior * unlabeled_normalizer;
        }
      } else if (em_unlabeled_start == em_start_even) {
        /* distribute class_probs evenly across all classes */
        for (ci = 0; ci < max_new_ci; ci++)
          cdoc->class_probs[ci] =
            unlabeled_normalizer / bow_barrel_num_classes (vpc_barrel);
      } else
        bow_error ("No such value for em_unlabeled_start");
    }
  }
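  /* The loop below alternates M-steps and E-steps.  It halts on one of
     four conditions, depending on the options in force: when annealing,
     it runs until em_temperature cools below 1.0; with
     perplexity-based halting, until validation-set perplexity stops
     dropping by more than 0.05; with accuracy-based halting, until
     validation-set accuracy stops improving; otherwise it runs a fixed
     number of iterations, bow_em_num_em_runs. */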
  /* let's do some EM */
  while (em_anneal
         ? em_temperature >= 1.0
         : (em_halt_using_perplexity
            ? (old_perplexity > new_perplexity
               && ABS (new_perplexity - old_perplexity) > 0.05)
            : (em_halt_using_accuracy
               ? old_accuracy < new_accuracy
               : em_runs < bow_em_num_em_runs))) {
    em_runs++;

    /* the M-step */
    bow_verbosify (bow_progress, "Making class barrel by counting words: ");

    if (vpc_barrel->wi2dvf != NULL)
      bow_wi2dvf_free (vpc_barrel->wi2dvf);

#if 0
    /* save the previous wi2dvf */
    if (prev_wi2dvf != NULL)
      bow_wi2dvf_free (prev_wi2dvf);
    prev_wi2dvf = vpc_barrel->wi2dvf;
    for (ci = 0; ci < max_new_ci; ci++) {
      bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      prev_priors[ci] = cdoc->prior;
      prev_word_counts[ci] = cdoc->word_count;
      prev_normalizers[ci] = cdoc->normalizer;
    }
#endif

    /* get a new wi2dvf structure for our class barrel */
    vpc_barrel->wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size);

    /* Initialize the WI2DVF part of the VPC_BARREL.  Sum together the
       counts and weights for individual documents, grabbing only the
       training and unlabeled documents. */
    for (wi = 0; wi < max_wi; wi++) {
      dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
      if (!dv)
        continue;

#if 0
      /* create the dv in the class barrel if there's an entry in the
         doc barrel.  This ensures that perplexity calculations happen
         correctly. */
      vpc_barrel->wi2dvf->entry[wi].dv = bow_dv_new (0);
      vpc_barrel->wi2dvf->entry[wi].seek_start = 2;
      (vpc_barrel->wi2dvf->num_words)++;
#endif

      for (dvi = 0; dvi < dv->length; dvi++) {
        bow_cdoc *cdoc;

        di = dv->entry[dvi].di;
        cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
        if (cdoc->type == bow_doc_train || cdoc->type == bow_doc_unlabeled) {
          assert (cdoc->word_count > 0);
          for (ci = 0; ci < max_new_ci; ci++) {
            /* it's important to do this even when class_prob is 0 to
               ensure that perplexity calculations happen ok. */
#if 0
            if (cdoc->class_probs[ci] > 0)
#endif
            {
              if (bow_event_model == bow_event_document_then_word)
                bow_wi2dvf_add_wi_di_count_weight
                  (&(vpc_barrel->wi2dvf), wi, ci,
                   1, /* hopelessly dummy value */
                   (cdoc->class_probs[ci]
                    * (float) dv->entry[dvi].count
                    * (float) bow_event_document_then_word_document_length
                    / (float) cdoc->word_count));
              else if (bow_event_model == bow_event_word) {
                float addition = (cdoc->class_probs[ci]
                                  * (float) dv->entry[dvi].count);
                bow_wi2dvf_add_wi_di_count_weight
                  (&(vpc_barrel->wi2dvf), wi, ci,
                   1, /* hopelessly dummy value */
                   addition);
              } else
                bow_error ("No implementation of this event model.");
            }
          }
        }
      }
      if (wi % 100 == 0)
        bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", max_wi - wi);
    }
    bow_verbosify (bow_progress, "\n");

    /* set the dv->idf, normalizer and word_count */
    bow_em_set_weights (vpc_barrel);
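    /* Conceptually, the accumulation above plus bow_em_set_weights is
       the naive Bayes M-step.  For the word event model, each class's
       count for word w is
           N(w,c) = sum over train/unlabeled docs d of P(c|d) * N(w,d),
       where P(c|d) is cdoc->class_probs[ci] (down-weighted by
       unlabeled_normalizer for unlabeled documents).  The per-class
       word distributions are then renormalized from these fractional
       counts; any smoothing is applied inside bow_em_set_weights, which
       is not shown in this excerpt. */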
    /* set priors */
    if (doc_barrel->method->vpc_set_priors && !bow_uniform_class_priors)
      (*doc_barrel->method->vpc_set_priors) (vpc_barrel, doc_barrel);

    /* If on the first EM run, and doing perturbed starting points
       (e.g. for active learning), then perturb the weights using the
       variance. */
    if (em_runs == 1 && bow_em_perturb_starting_point)
      bow_em_perturb_weights (doc_barrel, vpc_barrel);

    /* print top words by class */
    if (bow_em_print_word_vector)
      bow_em_print_log_odds_ratio (vpc_barrel, 20);

    /* Print the P(C|w) distribution to a file so that we can later
       calculate the KL-divergence between the current distribution and
       the "correct" distribution. */
    if (bow_em_print_probs)
      bow_em_print_word_distribution (vpc_barrel, em_runs,
                                      bow_barrel_num_classes (vpc_barrel));

    /* if we're ignoring the labeled data during the iterations, then
       zero out their class probs now */
    if (em_runs == 1 && em_labeled_for_start_only) {
      for (di = 0; di < doc_barrel->cdocs->length; di++) {
        bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
        if (cdoc->type == bow_doc_train)
          cdoc->class_probs[cdoc->class] = 0.0;
      }
    }

    /* OK.  We're done with our M-step.  We have a new vpc barrel to
       use.  Let's now do the E-step, and classify all our documents. */

    /* Calculate perplexity of the validation set for halting check */
    if (em_perplexity_docs) {
      old_perplexity = new_perplexity;
      new_perplexity = em_calculate_perplexity (doc_barrel, vpc_barrel);
      bow_verbosify (bow_progress, "Perplexity = %f\n", new_perplexity);
    }

    /* Calculate accuracy of the validation set for halting check */
    if (em_accuracy_docs) {
      old_accuracy = new_accuracy;
      new_accuracy = em_calculate_accuracy (doc_barrel, vpc_barrel);
      bow_verbosify (bow_progress, "Correct: %f\n", new_accuracy);
    }

    /* adjust the normalizer if we're annealing it */
    if (bow_em_anneal_normalizer) {
      float new_unlabeled_fraction;

      total_weight = ((float) num_train_docs)
        + (unlabeled_normalizer * (float) num_unlabeled_docs);
      labeled_weight_fraction = (float) num_train_docs / total_weight;

      /* increase the weight of the unlabeled data by a factor of 1.1,
         unless it's the first round; then bump it away from zero
         slightly */
      if (labeled_weight_fraction == 1.0) {
        new_labeled_fraction = 0.98;
        new_unlabeled_fraction = 0.02;
      } else {
        new_unlabeled_fraction = 1.1 * (1.0 - labeled_weight_fraction);
        new_labeled_fraction = 1.0 - new_unlabeled_fraction;
      }
      unlabeled_normalizer = ((num_train_docs / new_labeled_fraction)
                              - num_train_docs) / num_unlabeled_docs;

      /* halt normalizer annealing when one labeled document weighs the
         same as one unlabeled document */
      if (new_unlabeled_fraction >= 1.0 || unlabeled_normalizer >= 1.0) {
        unlabeled_normalizer = 1.0;
        bow_em_anneal_normalizer = 0;
        em_runs = 1;
      }
      assert (unlabeled_normalizer >= 0 && unlabeled_normalizer <= 1);
      bow_verbosify (bow_progress,
                     "Updating total labeled weight to %f (normalizer = %f).\n",
                     new_labeled_fraction, unlabeled_normalizer);
    }

    /* only do the E-step if not the last round */
    if (em_anneal
        ? 1
        : (em_halt_using_perplexity
           ? (old_perplexity > new_perplexity
              && ABS (new_perplexity - old_perplexity) > 0.05)
           : (em_halt_using_accuracy
              ? old_accuracy < new_accuracy
              : em_runs < bow_em_num_em_runs))) {
      /* now classify the unknown documents */
      bow_verbosify (bow_progress, "\nClassifying unlabeled documents: ");

      /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to
         free.  Create the heap from which we'll get WV's. */
      query_wv = NULL;
      hits = alloca (sizeof (bow_score) * max_new_ci);
      num_tested = 0;
      test_heap = bow_test_new_heap (doc_barrel);
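      /* E-step proper: bow_barrel_score ranks all max_new_ci classes
         for each document pulled off the heap.  Depending on
         em_stat_method, class_probs then become either a hard
         assignment to the single best class ("simple") or the full
         posterior from the naive Bayes scores ("nb_score"), in both
         cases scaled by unlabeled_normalizer so an unlabeled document
         can count for less than a labeled one. */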
      /* Loop once for each unlabeled document. */
      while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                     bow_cdoc_next_em_doc)) != -1) {
        doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
        bow_wv_set_weights (query_wv, vpc_barrel);
        bow_wv_normalize_weights (query_wv, vpc_barrel);
        actual_num_hits = bow_barrel_score (vpc_barrel, query_wv, hits,
                                            max_new_ci, (int) NULL);
        assert (actual_num_hits == max_new_ci);

        if (em_stat_method == simple) {
          /* set the class probs to 1 for the maximally likely class */
          for (ci = 0; ci < max_new_ci; ci++)
            doc_cdoc->class_probs[ci] = 0.0;
          doc_cdoc->class_probs[hits[0].di] = unlabeled_normalizer;
        } else if (em_stat_method == nb_score) {
          /* set the class probs to the naive Bayes score */
          for (hi = 0; hi < actual_num_hits; hi++)
            doc_cdoc->class_probs[hits[hi].di] =
              unlabeled_normalizer * hits[hi].weight;

          /* this is a neg training doc.  Zero out the pos component. */
          if (bow_em_multi_hump_neg > 1 && doc_cdoc->type == bow_doc_train) {
            double new_total = 0;

            doc_cdoc->class_probs[binary_pos_ci] = 0;
            for (ci = 0; ci < max_new_ci; ci++)
              new_total += doc_cdoc->class_probs[ci];
            if (new_total != 0) {
              for (ci = 0; ci < max_new_ci; ci++)
                doc_cdoc->class_probs[ci] =
                  unlabeled_normalizer * doc_cdoc->class_probs[ci] / new_total;
            } else {
              /* blech.  we got hosed on roundoff. */
              for (ci = 0; ci < max_new_ci; ci++)
                doc_cdoc->class_probs[ci] =
                  (float) unlabeled_normalizer / ((float) max_new_ci - 1.0);
              doc_cdoc->class_probs[binary_pos_ci] = 0;