/* naivebayes.c */
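
/* NOTE: this is a mid-file fragment; the original file's top (copyright
   notice and #include list) is not shown.  The code below relies at
   least on the headers listed here -- an assumption for making the
   fragment self-contained, not the original file's include list. */
#include <bow/libbow.h>
#include <assert.h>
#include <math.h>   /* log(), rint() */
#include <stdio.h>  /* printf(), fprintf() */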
  else if (bow_smoothing_method == bow_smoothing_goodturing)
    {
      assert (barrel == bow_naivebayes_goodturing_barrel);
      /* don't adjust if above k */
      if (num_wi_ci > bow_smoothing_goodturing_k)
        pr_w_c = num_wi_ci / num_w_ci;
      /* if zero, just grab the stored weight */
      else if (num_wi_ci == 0)
        pr_w_c = bow_naivebayes_goodturing_discounts[ci][0];
      /* else adjust by discount factor */
      else
        pr_w_c = (bow_naivebayes_goodturing_discounts[ci][(int) num_wi_ci]
                  * num_wi_ci / num_w_ci);
    }
  else if (bow_smoothing_method == bow_smoothing_dirichlet)
    {
      pr_w_c = ((num_wi_ci + bow_naivebayes_dirichlet_alphas[wi])
                / (num_w_ci + bow_naivebayes_dirichlet_total));
    }
  else
    {
      bow_error ("Naivebayes does not implement smoothing method %d",
                 bow_smoothing_method);
      pr_w_c = 0;  /* to avoid gcc warning */
    }
#if 0
  if (pr_w_c <= 0)
    bow_error ("A negative word probability was calculated. "
               "This can happen if you are using\n"
               "--test-files-loo and the test files are "
               "not being lexed in the same way as they\n"
               "were when the model was built");
  assert (pr_w_c > 0 && pr_w_c <= 1);
#endif
  return pr_w_c;
}

double
bow_naivebayes_total_word_count_for_ci (bow_barrel *class_barrel, int ci)
{
  double ret = 0;
  int max_wi, wi, dvi;
  bow_dv *dv;

  max_wi = MIN (class_barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (class_barrel->wi2dvf, wi);
      for (dvi = 0; dv && dvi < dv->length; dvi++)
        if (dv->entry[dvi].di == ci)
          ret += dv->entry[dvi].weight;
    }
  return ret;
}

void
bow_naivebayes_print_word_probabilities_for_class (bow_barrel *barrel,
                                                   const char *classname)
{
  int wi;
  int ci = bow_str2int_no_add (barrel->classnames, classname);
  double pr_w;

  assert (ci >= 0);
  for (wi = 0; wi < barrel->wi2dvf->size; wi++)
    {
      pr_w = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0, NULL, NULL);
      if (pr_w >= 0)
        printf ("%20.18f %s\n", pr_w, bow_int2word (wi));
    }
  printf ("%-30s %10.8f\n", "total_count",
          bow_naivebayes_total_word_count_for_ci (barrel, ci));
}

bow_wa *
bow_naivebayes_new_odds_ratio_for_ci (bow_barrel *barrel, int the_ci)
{
  bow_wa *ret;
  int wi;
  int ci;
  int max_wi;
  bow_cdoc *cdoc;
  double pr_wi_c;
  double pr_wi_not_c;
  double class_prior_ratio;
  double pr_wi;
  double pr_not_wi;
  double ig;
  bow_dv *dv;
  int dvi;

  cdoc = bow_array_entry_at_index (barrel->cdocs, the_ci);
  class_prior_ratio = cdoc->prior / (1.0 - cdoc->prior);
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  ret = bow_wa_new (max_wi + 2);
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      /* If the model doesn't know about this word, skip it. */
      if (dv == NULL)
        continue;
      pr_wi_c = 0;
      pr_wi_not_c = 0;
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
        {
          if (the_ci == ci)
            pr_wi_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                               &dv, &dvi);
          else
            pr_wi_not_c += bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                                    &dv, &dvi);
        }
      pr_wi = pr_wi_c + pr_wi_not_c;
      pr_not_wi = (1 - pr_wi);
#if 0
      ig = (-(pr_wi * log (pr_wi) + pr_not_wi * log (pr_not_wi))
            + ((pr_wi_c * log (pr_wi_c)
                + (1 - pr_wi_c) * log (1 - pr_wi_c))));
#endif
      ig = pr_wi_c * log (pr_wi_c / pr_wi_not_c);
      bow_wa_append (ret, wi, ig);
    }
  bow_wa_sort (ret);
  return ret;
}
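
/* A minimal standalone sketch (not part of libbow) of the ranking score
   computed in bow_naivebayes_new_odds_ratio_for_ci above: for a word w
   and target class c it is P(w|c) * log (P(w|c) / P(w|~c)), where P(w|~c)
   is the sum of P(w|ci) over all other classes, as in the loop above.
   The function name and plain-scalar interface are hypothetical, for
   illustration only. */
static double
example_weighted_log_odds (double pr_w_c, double pr_w_not_c)
{
  /* Guard against log(0) or division by zero; the real loop above
     relies on smoothing to keep both probabilities positive. */
  if (pr_w_c <= 0 || pr_w_not_c <= 0)
    return 0;
  return pr_w_c * log (pr_w_c / pr_w_not_c);
}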
/* Print the top N words by odds ratio for each class. */
void
bow_naivebayes_print_odds_ratio_for_all_classes (bow_barrel *barrel, int n)
{
  int ci;
  bow_cdoc *cdoc;
  bow_wa *wa;

  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      wa = bow_naivebayes_new_odds_ratio_for_ci (barrel, ci);
      fprintf (stderr, "%s [%d words]\n", cdoc->filename, cdoc->word_count);
      bow_wa_fprintf (wa, stderr, n);
      bow_wa_free (wa);
    }
}

void
bow_naivebayes_print_odds_ratio_for_class (bow_barrel *barrel,
                                           const char *classname)
{
  int wi;
  int the_ci;
  int ci;
  int max_wi;
  bow_cdoc *cdoc;
  double pr_wi_c;
  double pr_wi_not_c;
  double class_prior_ratio;
  bow_dv *dv;
  int dvi;

  the_ci = bow_str2int_no_add (barrel->classnames, classname);
  if (the_ci == -1)
    bow_error ("%s: Classname `%s' not found", __PRETTY_FUNCTION__,
               classname);
  cdoc = bow_array_entry_at_index (barrel->cdocs, the_ci);
  class_prior_ratio = cdoc->prior / (1.0 - cdoc->prior);
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      /* If the model doesn't know about this word, skip it. */
      if (dv == NULL)
        continue;
      pr_wi_c = 0;
      pr_wi_not_c = 0;
      for (ci = 0, dvi = 0; ci < bow_barrel_num_classes (barrel); ci++)
        {
          if (the_ci == ci)
            pr_wi_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                               &dv, &dvi);
          else
            pr_wi_not_c += bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                                    &dv, &dvi);
        }
      printf ("%.10f %s\n", pr_wi_c * log (pr_wi_c / pr_wi_not_c),
              bow_int2word (wi));
    }
}

/* Get the total number of terms in each class; store this in
   CDOC->WORD_COUNT. */
void
bow_naivebayes_set_cdoc_word_count_from_wi2dvf_weights (bow_barrel *barrel)
{
  int ci;
  bow_cdoc *cdoc;
  int wi, max_wi;
  bow_dv *dv;
  int dvi;
  int num_classes = bow_barrel_num_classes (barrel);
  double num_words_per_ci[num_classes];

  for (ci = 0; ci < num_classes; ci++)
    num_words_per_ci[ci] = 0;
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (dv == NULL)
        continue;
      for (dvi = 0; dvi < dv->length; dvi++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs,
                                           dv->entry[dvi].di);
          ci = dv->entry[dvi].di;
          assert (ci < num_classes);
          num_words_per_ci[ci] += dv->entry[dvi].weight;
        }
    }
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      cdoc->word_count = (int) rint (num_words_per_ci[ci]);
    }
}
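
/* A minimal standalone sketch (not part of libbow) of the Dirichlet-
   smoothed estimate used in the bow_smoothing_dirichlet branch of
   bow_naivebayes_pr_wi_ci earlier in this file:
     P(w|c) = (N(w,c) + alpha_w) / (N(c) + sum_w' alpha_w')
   where N(c) is the per-class word total accumulated by functions like
   the one above.  With all alpha_w = 1 this reduces to Laplace
   smoothing.  The function name and arguments are hypothetical, for
   illustration only. */
static double
example_dirichlet_pr_w_c (double num_wi_ci, double num_w_ci,
                          double alpha_wi, double alpha_total)
{
  return (num_wi_ci + alpha_wi) / (num_w_ci + alpha_total);
}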
/* Function to assign `Naive Bayes'-style weights to each element of
   each document vector. */
void
bow_naivebayes_set_weights (bow_barrel *barrel)
{
  int ci;
  bow_cdoc *cdoc;
  int wi;      /* a "word index" into WI2DVF */
  int max_wi;  /* the highest "word index" in WI2DVF. */
  bow_dv *dv;  /* the "document vector" at index WI */
  int dvi;     /* an index into the DV */
  int weight_setting_num_words = 0;
  double *pr_all_w_c = alloca (barrel->cdocs->length * sizeof (double));
  double pr_w_c;
  int total_num_words = 0;
  /* Gather the word count here instead of directly in CDOC->WORD_COUNT
     so we avoid round-off error with each increment.  Remember,
     CDOC->WORD_COUNT is an int! */
  float num_words_per_ci[bow_barrel_num_classes (barrel)];
  int barrel_is_empty = 0;

  /* We assume that we have already called BOW_BARREL_NEW_VPC() on
     BARREL, so BARREL already has one-document-per-class. */
#if 0
  assert (!strcmp (barrel->method->name, "naivebayes")
          || !strcmp (barrel->method->name, "crossentropy")
          || !strcmp (barrel->method->name, "active"));
#endif
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* The CDOC->PRIOR should have been set in bow_barrel_new_vpc();
     verify it. */
  /* Get the total number of unique terms in each class; store this in
     CDOC->NORMALIZER. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      assert (cdoc->prior >= 0);
      pr_all_w_c[ci] = 0;
      cdoc->normalizer = 0;
      num_words_per_ci[ci] = 0;
    }

  /* Set the CDOC->WORD_COUNT for each class.  If we are using a
     document (binomial) model, then we'll just use the value of
     WORD_COUNT set in bow_barrel_new_vpc(), which is the total number
     of *documents* in the class, not the number of words. */
  /* Calculate P(w); store this in DV->IDF. */
  if (bow_event_model != bow_event_document)
    {
      /* Get the total number of terms in each class; store this in
         CDOC->WORD_COUNT. */
      /* Calculate the total number of unique words, and make sure it
         is the same as BARREL->WI2DVF->NUM_WORDS. */
      int num_unique_words = 0;

      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          num_unique_words++;
          dv->idf = 0.0;
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              cdoc = bow_array_entry_at_index (barrel->cdocs,
                                               dv->entry[dvi].di);
              ci = dv->entry[dvi].di;
              num_words_per_ci[ci] += dv->entry[dvi].weight;
              cdoc->normalizer++;
              dv->idf += dv->entry[dvi].weight;
              total_num_words += dv->entry[dvi].weight;
            }
        }
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
          cdoc->word_count = (int) rint (num_words_per_ci[ci]);
        }
      assert (num_unique_words == barrel->wi2dvf->num_words);

      /* Normalize the DV->IDF to sum to one across all words, so it
         is P(w). */
      if (total_num_words)
        {
          for (wi = 0; wi < max_wi; wi++)
            {
              dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
              if (dv == NULL)
                continue;
              dv->idf /= total_num_words;
            }
        }
      else
        {
          barrel_is_empty = 1;
          bow_verbosify (bow_progress, "Zero words in class barrel\n");
        }
    }

  /* Initialize smoothing methods, if necessary. */
  if (bow_smoothing_method == bow_smoothing_goodturing)
    bow_naivebayes_initialize_goodturing (barrel);
  else if (bow_smoothing_method == bow_smoothing_dirichlet)
    {
      bow_naivebayes_load_dirichlet_alphas ();
      bow_naivebayes_initialize_dirichlet_smoothing (barrel);
    }

  if (bow_event_model != bow_event_document && !barrel_is_empty)
    {
      /* Now loop through all the classes, verifying that the word
         probabilities in each class sum to one. */
      total_num_words = 0;
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          /* If the model doesn't know about this word, skip it. */
          if (dv == NULL)
            continue;
          for (ci = 0; ci < barrel->cdocs->length; ci++)
            {
              pr_w_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                                NULL, NULL);
              cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
              assert (pr_w_c <= 1);
              pr_all_w_c[ci] += pr_w_c;
            }
          weight_setting_num_words++;
        }
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          /* Is this too much round-off error to expect? */
          assert (pr_all_w_c[ci] < 1.01 && pr_all_w_c[ci] > 0.99);
        }
    }
#if 0
  fprintf (stderr, "wi2dvf num_words %d, weight-setting num_words %d\n",
           barrel->wi2dvf->num_words, weight_setting_num_words);
#endif
}

#define IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR 999.99

int
bow_naivebayes_score (bow_barrel *barrel, bow_wv *query_wv,
                      bow_score *bscores, int bscores_len, int loo_class)
{
  double *scores;  /* will become prob(class), indexed over CI */
  int ci;          /* a "class index" (document index) */
  int wvi;         /* an index into the entries of QUERY_WV. */
  int dvi;         /* an index into a "document vector" */