📄 em.c
            }
          }
        }
        else
          bow_error ("No method for this type.");

        if (num_tested % 100 == 0)
          bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", num_tested);
        num_tested++;
      }
      bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d\n", num_tested);
    }

    /* Lower the temperature if doing DA */
    if (em_anneal)
    {
      em_temperature *= em_temp_reduction;

      /* if temperature hits bottom, finish up */
      if (em_temperature < 1.0)
      {
        em_temperature = 1.0;
        em_anneal = 0;
        em_runs = 1;
      }
      bow_verbosify (bow_progress, "Lowering temperature to %f\n",
                     em_temperature);
    }
  }

  /* don't free class_probs for now.  Need them if doing LOO */
#if 0
  /* fix back up the doc barrel... dealloc class_probs (wrong size!) */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
  {
    bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    bow_free (cdoc->class_probs);
    cdoc->class_probs = NULL;
  }
#endif

#if 0
  /* if halting by perplexity reduction, return the previous round's
     barrel */
  if (em_halt_using_perplexity)
  {
    bow_wi2dvf_free (vpc_barrel->wi2dvf);
    vpc_barrel->wi2dvf = prev_wi2dvf;
    for (ci = 0; ci < max_new_ci; ci++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      cdoc->prior = prev_priors[ci];
      cdoc->word_count = prev_word_counts[ci];
      cdoc->normalizer = prev_normalizers[ci];
    }
  }
#endif

  bow_em_making_barrel = 0;
  return vpc_barrel;
}
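/* The geometric cooling schedule above multiplies EM_TEMPERATURE by
   EM_TEMP_REDUCTION once per round and drops back to plain EM when the
   temperature reaches 1.0.  The helper below is an illustrative sketch
   of that schedule, not part of the original em.c; it just counts how
   many rounds the loop above will spend annealing.  For example, an
   initial temperature of 200 with a reduction factor of 0.9 cools for
   51 rounds, since 0.9^51 * 200 < 1.0 <= 0.9^50 * 200. */
static int
em_da_cooling_rounds (double initial_temperature, double temp_reduction)
{
  int rounds = 0;
  double t = initial_temperature;

  assert (temp_reduction > 0.0 && temp_reduction < 1.0);
  while (t > 1.0)
  {
    t *= temp_reduction;
    rounds++;
  }
  return rounds;
}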
/* Calculate the perplexity of specified documents */
double
em_calculate_perplexity (bow_barrel *doc_barrel, bow_barrel *class_barrel)
{
  bow_dv_heap *test_heap;      /* we'll extract test WV's from here */
  bow_wv *query_wv;
  int di;                      /* a document index */
  bow_score *hits;
  int num_hits_to_retrieve = bow_barrel_num_classes (class_barrel);
  int actual_num_hits;
  bow_cdoc *doc_cdoc;
  double log_prob_of_data = 0;
  double *class_probs;
  int hi;
  int ci;
  double rescaler;
  double scores_sum;
  double num_data_words = 0;
  int num_tested = 0;
  int wvi;
  bow_dv *dv;

  /* turn this on so scoring knows to return perplexities */
  bow_em_calculating_perplexity = 1;

  bow_verbosify (bow_progress, "\nCalculating perplexity: ");

  /* Create the heap from which we'll get WV's.  Initialize QUERY_WV
     so BOW_HEAP_NEXT_WV() knows not to try to free. */
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);
  class_probs = alloca (sizeof (double) * num_hits_to_retrieve);
  test_heap = bow_test_new_heap (doc_barrel);
  query_wv = NULL;

  /* Loop once for each validation document. */
  while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                 em_perplexity_docs)) != -1)
  {
    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    bow_wv_set_weights (query_wv, class_barrel);
    bow_wv_normalize_weights (query_wv, class_barrel);
    actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits,
                                        num_hits_to_retrieve,
                                        (em_perplexity_loo
                                         ? (int) doc_cdoc->class_probs
                                         : (int) NULL));
    assert (actual_num_hits == num_hits_to_retrieve);

    /* calculate class probabilities by normalizing scores and adding
       in the class priors */
    {
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] = 2;
      for (hi = 0; hi < num_hits_to_retrieve; hi++)
        class_probs[hits[hi].di] = hits[hi].weight;

      /* check they all got set ok */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        assert (class_probs[ci] != 2);

      /* add in the class priors */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
      {
        bow_cdoc *cdoc = bow_array_entry_at_index (class_barrel->cdocs, ci);
        class_probs[ci] += log (cdoc->prior);
      }

      /* Rescale the class_probs making them all 0 or negative, so
         that exp() will work well, especially around the
         higher-probability classes. */
      rescaler = -DBL_MAX;
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        if (class_probs[ci] > rescaler)
          rescaler = class_probs[ci];
      /* RESCALER is now the maximum of the class_probs. */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] -= rescaler;

      /* Use exp() on the class_probs to get probabilities from
         log-probabilities. */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] = exp (class_probs[ci]);

      /* If multi-hump neg, zero out the positive class */
      if (doc_cdoc->type == bow_doc_train
          && bow_em_multi_hump_neg > 1
          && doc_cdoc->class != binary_pos_ci)
        class_probs[binary_pos_ci] = 0;

      /* Normalize the class_probs so they all sum to one. */
      scores_sum = 0;
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        scores_sum += class_probs[ci];
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] /= scores_sum;
    }

    /* add in the contribution of this document.  For training docs,
       only count the contribution of their class, since the class
       label is known. */
    if (doc_cdoc->type != bow_doc_train
        || (doc_cdoc->type == bow_doc_train
            && bow_em_multi_hump_neg > 1
            && doc_cdoc->class != binary_pos_ci))
    {
      for (hi = 0; hi < num_hits_to_retrieve; hi++)
        log_prob_of_data += class_probs[hits[hi].di] * hits[hi].weight;
    }
    else
    {
      for (hi = 0; hi < num_hits_to_retrieve; hi++)
      {
        if (hits[hi].di == doc_cdoc->class)
        {
          log_prob_of_data += hits[hi].weight;
          break;
        }
      }
    }

#if 0
    if (bow_event_model == bow_event_document_then_word)
      assert (query_wv->normalizer
              == bow_event_document_then_word_document_length);
    num_data_words += query_wv->normalizer;
#endif

    /* calculate the number of words shared between the model and the
       doc */
    for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      dv = bow_wi2dvf_dv (class_barrel->wi2dvf, query_wv->entry[wvi].wi);
      if (!dv)
        continue;
      num_data_words += query_wv->entry[wvi].weight;
    }

    if (num_tested % 100 == 0)
      bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", num_tested);
    num_tested++;
  }

  bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d\n", num_tested);
  bow_verbosify (bow_progress, "Docs = %d, Words = %f, l(data) = %f\n",
                 num_tested, num_data_words, log_prob_of_data);

  /* convert log prob to perplexity and return */
  bow_em_calculating_perplexity = 0;
  return exp (-log_prob_of_data / num_data_words);
}
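/* The perplexity returned above is exp of the negative average
   per-word log-likelihood, exp (-l(data) / N).  The helper below is an
   illustrative restatement of that final step, not part of the
   original em.c.  As a sanity check: a model that assigns each of the
   N words probability 1/k gives l(data) = N * log (1.0 / k), so the
   expression reduces to exactly k. */
static double
em_perplexity_from_log_prob (double log_prob_of_data, double num_data_words)
{
  return exp (-log_prob_of_data / num_data_words);
}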
/* Calculate the accuracy of the barrel on the test set */
float
em_calculate_accuracy (bow_barrel *doc_barrel, bow_barrel *class_barrel)
{
  bow_dv_heap *test_heap;      /* we'll extract test WV's from here */
  bow_wv *query_wv;
  int di;                      /* a document index */
  bow_score *hits;
  int num_hits_to_retrieve = 1;
  int actual_num_hits;
  bow_cdoc *doc_cdoc;
  int num_tested = 0;
  int num_correct = 0;

  /* Create the heap from which we'll get WV's.  Initialize QUERY_WV
     so BOW_TEST_NEXT_WV() knows not to try to free. */
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);
  test_heap = bow_test_new_heap (doc_barrel);
  query_wv = NULL;

  /* Loop once for each test document. */
  while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                 em_accuracy_docs)) != -1)
  {
    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    bow_wv_set_weights (query_wv, class_barrel);
    bow_wv_normalize_weights (query_wv, class_barrel);
    actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits,
                                        num_hits_to_retrieve,
                                        (em_accuracy_loo
                                         ? (int) doc_cdoc->class_probs
                                         : (int) NULL));
    assert (actual_num_hits == num_hits_to_retrieve);
    if (doc_cdoc->class == hits[0].di)
      num_correct++;
    num_tested++;
  }

  return (((float) num_correct) / ((float) num_tested));
}

/* Run test trials, outputting results to TEST_FP.  The results are
   intended to be read and processed by the Perl script
   ./rainbow-stats. */
void
bow_em_compare_to_nb (bow_barrel *doc_barrel)
{
  bow_dv_heap *test_heap;      /* we'll extract test WV's from here */
  bow_wv *query_wv;
  int di;                      /* a document index */
  bow_score *hits;
  int num_hits_to_retrieve = bow_barrel_num_classes (doc_barrel);
  int actual_num_hits;
  int hi;                      /* hit index */
  bow_cdoc *doc_cdoc;
  bow_cdoc *class_cdoc;
  FILE *test_fp = stdout;
  bow_barrel *class_barrel;

  /* Re-create the vector-per-class barrel in accordance with the new
     train/test settings. */
  doc_barrel->method = (rainbow_method *) bow_method_at_name ("naivebayes");
  class_barrel = bow_barrel_new_vpc_with_weights (doc_barrel);

  /* Create the heap from which we'll get WV's.  Initialize QUERY_WV
     so BOW_TEST_NEXT_WV() knows not to try to free. */
  test_heap = bow_test_new_heap (doc_barrel);
  query_wv = NULL;
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);

  fprintf (test_fp, "#0\n");

  /* Loop once for each test document. */
  while ((di = bow_test_next_wv (test_heap, doc_barrel, &query_wv)) != -1)
  {
    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    class_cdoc = bow_array_entry_at_index (class_barrel->cdocs,
                                           doc_cdoc->class);
    bow_wv_set_weights (query_wv, class_barrel);
    bow_wv_normalize_weights (query_wv, class_barrel);
    actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits,
                                        num_hits_to_retrieve, -1);
    assert (actual_num_hits == num_hits_to_retrieve);
    fprintf (test_fp, "%s %s ", doc_cdoc->filename,
             filename_to_classname (class_cdoc->filename));
    for (hi = 0; hi < actual_num_hits; hi++)
    {
      class_cdoc = bow_array_entry_at_index (class_barrel->cdocs,
                                             hits[hi].di);
      fprintf (test_fp, "%s:%.*g ",
               filename_to_classname (class_cdoc->filename),
               bow_score_print_precision, hits[hi].weight);
    }
    fprintf (test_fp, "\n");
  }

  bow_barrel_free (class_barrel);
  doc_barrel->method = (rainbow_method *) bow_method_at_name ("em");
}
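/* Each line written above for rainbow-stats has the form

     <doc filename> <true class> <class>:<score> <class>:<score> ...

   with one <class>:<score> pair per class, highest-scoring class
   first.  A hypothetical example line (filename, class names and
   scores are invented for illustration, not taken from a real run):

     /data/talk.religion.misc/84203 talk.religion.misc talk.religion.misc:0.91 alt.atheism:0.09
*/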
/* Function to assign `Naive Bayes'-style weights to each element of
   each document vector. */
void
bow_em_print_log_odds_ratio (bow_barrel *barrel, int num_to_print)
{
  int ci;
  bow_cdoc *cdoc;
  int wi;                      /* a "word index" into WI2DVF */
  int max_wi;                  /* the highest "word index" in WI2DVF. */
  bow_dv *dv;                  /* the "document vector" at index WI */
  int dvi;                     /* an index into the DV */
  int weight_setting_num_words = 0;
  int total_num_words = 0;
  struct lorth
  {
    int wi;
    float lor;
  } lors[barrel->cdocs->length][num_to_print];
  int wci;

  bow_error ("Can't use this while normalizer is being used for "
             "non-integral word_count");

  /* We assume that we have already called BOW_BARREL_NEW_VPC() on
     BARREL, so BARREL already has one document per class. */
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  for (ci = 0; ci < barrel->cdocs->length; ci++)
    for (wci = 0; wci < num_to_print; wci++)
    {
      lors[ci][wci].lor = 0.0;
      lors[ci][wci].wi = -1;
    }

  /* assume that word_count, normalizer are already set */

  /* Calculate the total number of occurrences of each word; store
     this in DV->IDF. */
  for (wi = 0; wi < max_wi; wi++)
  {
    dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
    if (dv == NULL)
      continue;
    dv->idf = 0;
    for (dvi = 0; dvi < dv->length; dvi++)
    {
      /* Is cdoc used for anything? - Jason */
      cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di);
      total_num_words += dv->entry[dvi].weight;
      dv->idf += dv->entry[dvi].weight;
    }
  }

  bow_verbosify (bow_progress, "Calculating word weights: ");

  /* Set the weights in the BARREL's WI2DVF so that they are equal to
     P(w|C), the probability of a word given a class. */
  for (wi = 0; wi < max_wi; wi++)
  {
    double pr_w = 0.0;

    dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

    if (wi % 100 == 0)
      bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", wi);

    /* If the model doesn't know about this word, skip it. */
    if (dv == NULL)
      continue;

    pr_w = ((double) dv->idf) / total_num_words;

    /* Now loop through all the elements, setting their weights */
    for (dvi = 0; dvi < dv->length; dvi++)
    {
      double pr_w_c;
      double pr_w_not_c;
      double log_likelihood_ratio;

      cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di);

      /* Here CDOC->WORD_COUNT is the total number of words in the
         class.  We use Laplace estimation.  (The first assignment is
         immediately overwritten by the second.) */
      pr_w_c = ((double) dv->entry[dvi].weight
                / (cdoc->word_count + cdoc->normalizer));
      pr_w_c = (((double) dv->entry[dvi].weight + 1)
                / (cdoc->word_count + barrel->wi2dvf->num_words));
      /* The original listing breaks off in the middle of the next
         statement; the continuation below is a plausible
         reconstruction, assuming the same add-one smoothing applied
         over the complement classes, and is not verified against the
         original file. */
      pr_w_not_c = ((dv->idf - dv->entry[dvi].weight
                     + barrel->cdocs->length - 1)
                    / (total_num_words - cdoc->word_count
                       + (barrel->wi2dvf->num_words
                          * (barrel->cdocs->length - 1))));
      log_likelihood_ratio = log (pr_w_c / pr_w_not_c);
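/* A worked instance of the add-one (Laplace) estimate used above,
   P(w|c) = (N(w,c) + 1) / (N(c) + |V|), with invented numbers: a word
   occurring N(w,c) = 3 times in a class of N(c) = 100 words, under a
   vocabulary of |V| = 900 words, gets P(w|c) = (3 + 1) / (100 + 900)
   = 0.004.  The quantity this function ranks words by is the log odds
   ratio log (P(w|c) / P(w|not c)), which is largest for words that
   are strong evidence for class c. */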