📄 naivebayes.c
字号:
/* Initialize the SCORES to the class prior probabilities. */
  /* NOTE(review): "PROBABILIES" below is misspelled, but it is runtime
     output, so it is left untouched here. */
  if (bow_print_word_scores)
    printf ("%s\n", "(CLASS PRIOR PROBABILIES)");
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      if (bow_uniform_class_priors)
	/* Uniform prior means each class has probability 1/#classes,
	   i.e. a log-prior of -log(#classes). */
	scores[ci] = - log (barrel->cdocs->length);
      else
	{
#if 0
	  /* For now forget about this little detail, because rainbow-h
	     trips up on it. */
	  /* LOO_CLASS is not implemented for cases in which we are not
	     doing uniform class priors. */
	  assert (loo_class == -1);
#endif
	  /* A zero or out-of-range prior would make the log() below
	     blow up. */
	  assert (cdoc->prior > 0.0f && cdoc->prior <= 1.0f);
	  scores[ci] = log (cdoc->prior);
	}
      /* Guard against log() having returned (near) -infinity. */
      assert (scores[ci] > -FLT_MAX + 1.0e5);
      if (bow_print_word_scores)
	printf ("%16s %-40s %10.9f\n", "",
		(strrchr (cdoc->filename, '/') ? : cdoc->filename),
		scores[ci]);
    }

  /* Loop over each word in the word vector QUERY_WV, putting its
     contribution into SCORES. */
  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      int wi;			/* the word index for the word at WVI */
      bow_dv *dv;		/* the "document vector" for the word WI */

      /* Get information about this word. */
      wi = query_wv->entry[wvi].wi;
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it. */
      if (!dv)
	continue;

      if (bow_print_word_scores)
	printf ("%-30s (queryweight=%.8f)\n",
		bow_int2word (wi),
		query_wv->entry[wvi].weight * query_wv->normalizer);

      /* RESCALER will track the minimum class score updated for this
	 word; it is used after the class loop to shift scores back
	 toward zero. */
      rescaler = DBL_MAX;

      /* Loop over all classes, putting this word's (WI's) contribution
	 into SCORES. */
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
	{
	  bow_cdoc *cdoc;
	  cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
	  assert (cdoc->type == model);

	  /* Assign PR_W_C to P(w|C), either using a DV entry, or, if
	     there is no DV entry for this class, using M-estimate
	     smoothing. */

	  /* DV entries are ordered by their .di index; advance DVI to
	     the first entry at or past class CI. */
	  if (dv)
	    while (dvi < dv->length && dv->entry[dvi].di < ci)
	      dvi++;

	  if (dv && dvi < dv->length && dv->entry[dvi].di == ci)
	    {
	      /* This class has a count for word WI. */
	      if (loo_class == ci)
		{
		  /* Leave-one-out: subtract the query document's own
		     counts from the model before estimating P(w|C). */
		  /* xxx This is not exactly right, because
		     BARREL->WI2DVF->NUM_WORDS might have changed with
		     the removal of QUERY_WV's document. */
		  pr_w_c = ((float)
			    ((M_EST_M * M_EST_P)
			     + dv->entry[dvi].count
			     - query_wv->entry[wvi].count)
			    / (M_EST_M + cdoc->word_count
			       - query_wv->entry[wvi].count));
		  if (pr_w_c <= 0)
		    bow_error ("A negative word probability was calculated. "
			       "This can happen if you are using\n"
			       "--test-files-loo and the test files are "
			       "not being lexed in the same way as they\n"
			       "were when the model was built");
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	      else
		{
		  pr_w_c = ((float)
			    ((M_EST_M * M_EST_P) + dv->entry[dvi].count)
			    / (M_EST_M + cdoc->word_count));
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	    }
	  else
	    {
	      /* No DV entry for this class: the word count is zero, so
		 the M-estimate reduces to its smoothing term alone. */
	      if (loo_class == ci)
		{
		  /* xxx This is not exactly right, because
		     BARREL->WI2DVF->NUM_WORDS might have changed with
		     the removal of QUERY_WV's document. */
		  pr_w_c = ((M_EST_M * M_EST_P)
			    / (M_EST_M + cdoc->word_count
			       - query_wv->entry[wvi].count));
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	      else
		{
		  pr_w_c = ((M_EST_M * M_EST_P)
			    / (M_EST_M + cdoc->word_count));
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	    }
	  assert (pr_w_c > 0 && pr_w_c <= 1);

	  log_pr_tf = log (pr_w_c);
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  /* Take into consideration the number of times it occurs in
	     the query document. */
	  log_pr_tf *= query_wv->entry[wvi].count;
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  scores[ci] += log_pr_tf;

	  if (bow_print_word_scores)
	    printf (" %8.2e %7.2f %-40s %10.9f\n",
		    pr_w_c, log_pr_tf,
		    (strrchr (cdoc->filename, '/') ? : cdoc->filename),
		    scores[ci]);

	  /* Keep track of the minimum score updated for this word. */
	  if (rescaler > scores[ci])
	    rescaler = scores[ci];
	}

      /* Loop over all classes, re-scaling SCORES so that they don't
	 get so small we loose floating point resolution.  This scaling
	 always keeps all SCORES positive. */
      if (rescaler < 0)
	{
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      /* Add to SCORES to bring them close to zero.  RESCALER
		 is expected to often be less than zero here. */
	      /* xxx If this doesn't work, we could keep track of the
		 min and the max, and sum by their average. */
	      scores[ci] += -rescaler;
	      assert (scores[ci] > -DBL_MAX + 1.0e5
		      && scores[ci] < DBL_MAX - 1.0e5);
	    }
	}
    }
  /* Now SCORES[] contains a (unnormalized) log-probability for each
     class. */

  /* Rescale the SCORE one last time, this time making them all 0 or
     negative, so that exp() will work well, especially around the
     higher-probability classes. */
  {
    rescaler = -DBL_MAX;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      if (scores[ci] > rescaler)
	rescaler = scores[ci];
    /* RESCALER is now the maximum of the SCORES. */
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      scores[ci] -= rescaler;
  }

  /* Use exp() on the SCORES to get probabilities from
     log-probabilities. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      new_score = exp (scores[ci]);
      /* assert (new_score > 0 && new_score < DBL_MAX - 1.0e5); */
      scores[ci] = new_score;
    }

  /* Normalize the SCORES so they all sum to one. */
  {
    double scores_sum = 0;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      scores_sum += scores[ci];
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      {
	scores[ci] /= scores_sum;
	/* assert (scores[ci] > 0); */
      }
  }

  /* Return the SCORES by putting them (and the `class indices') into
     SCORES in sorted order. */
  {
    num_scores = 0;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      {
	/* NOTE(review): if BSCORES_LEN were ever 0, the second operand
	   below would read bscores[-1]; presumably callers guarantee
	   BSCORES_LEN >= 1 -- confirm. */
	if (num_scores < bscores_len
	    || bscores[num_scores-1].weight < scores[ci])
	  {
	    /* We are going to put this score and CI into SCORES
	       because either: (1) there is empty space in SCORES, or
	       (2) SCORES[CI] is larger than the smallest score there
	       currently. */
	    int dsi;		/* an index into SCORES */
	    if (num_scores < bscores_len)
	      num_scores++;
	    dsi = num_scores - 1;
	    /* Shift down all the entries that are smaller than
	       SCORES[CI] */
	    for (; dsi > 0 && bscores[dsi-1].weight < scores[ci]; dsi--)
	      bscores[dsi] = bscores[dsi-1];
	    /* Insert the new score */
	    bscores[dsi].weight = scores[ci];
	    bscores[dsi].di = ci;
	  }
      }
  }
  return num_scores;
}

/* Parameters for the naive Bayes method: class priors are estimated
   from the data (not uniform), and final scores are normalized to sum
   to one. */
bow_params_naivebayes bow_naivebayes_params =
{
  bow_no,			/* no uniform priors */
  bow_yes,			/* normalize_scores */
};

/* The naive Bayes method vtable registered with the Bow framework. */
bow_method bow_method_naivebayes =
{
  "naivebayes",
  bow_naivebayes_set_weights,
  0,				/* no weight scaling function */
  NULL,				/* bow_barrel_normalize_weights_by_summing, */
  bow_barrel_new_vpc_merge_then_weight,
  bow_barrel_set_vpc_priors_by_counting,
  bow_naivebayes_score,
  bow_wv_set_weights_to_count,
  NULL,				/* no need for extra weight normalization */
  &bow_naivebayes_params
};

/* Register the method (and its argp command-line options) automatically
   at program startup, via GCC's constructor attribute. */
void _register_method_naivebayes () __attribute__ ((constructor));
void _register_method_naivebayes ()
{
  bow_method_register_with_name (&bow_method_naivebayes, "naivebayes");
  bow_argp_add_child (&naivebayes_argp_child);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -