naivebayes.c
  double pr_w_c;		/* P(w|C), prob a word is in a class */
  double log_pr_tf;		/* log(P(w|C)^TF), ditto, log() of it */
  double rescaler;		/* Rescale SCORES by this after each word */
  double new_score;		/* a temporary holder */
  int num_scores = 0;		/* number of entries placed in SCORES */
  int num_words_in_query = 0;
  double pr_w_d;		/* P(w|d) */
  double h_w_d;			/* entropy of P(W|d) */
  int wi;
  int hi;
  int max_wi;
  double query_wv_total_weight;

  /* Binomial event model with LOO processing doesn't work yet. */
  assert (bow_event_model != bow_event_document || loo_class == -1);

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* Allocate space to store scores for *all* classes (documents). */
  scores = alloca (barrel->cdocs->length * sizeof (double));

  /* Instead of multiplying probabilities, we will sum up
     log-probabilities (so we don't lose floating point resolution),
     and then take the exponent of them to get probabilities back. */

  /* Initialize the SCORES to the class prior probabilities. */
  if (bow_print_word_scores)
    printf ("%s\n", "(CLASS PRIOR PROBABILITIES)");

  /* Clear the NAME fields of the result array. */
  for (hi = 0; hi < bscores_len; hi++)
    bscores[hi].name = NULL;

  /* Initialize log-probabilities to 0 (unless class prior is zero). */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      if (cdoc->prior == 0)
	scores[ci] = IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR;
      else
	scores[ci] = 0;
    }

  /* Get the total number of words in this query.  The leading `1 ||'
     makes this run unconditionally, not just for leave-one-out
     evaluation or cross-entropy scoring. */
  if (1 || loo_class >= 0 || naivebayes_cross_entropy)
    {
      bow_dv *dv;
      num_words_in_query = 0;
      for (wvi = 0; wvi < query_wv->num_entries; wvi++)
	{
	  /* Only count those words that are in the model's vocabulary. */
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, query_wv->entry[wvi].wi);
	  if (dv)
	    num_words_in_query += query_wv->entry[wvi].count;
	}
    }

  /* Set the weights of the QUERY_WV, according to the event model. */
  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      if (bow_event_model == bow_event_document_then_word)
	query_wv->entry[wvi].weight =
	  bow_event_document_then_word_document_length
	  * ((float) query_wv->entry[wvi].count) / num_words_in_query;
      else
	query_wv->entry[wvi].weight = query_wv->entry[wvi].count;
    }
  if (bow_event_model == bow_event_document_then_word)
    query_wv_total_weight = bow_event_document_then_word_document_length;
  else
    query_wv_total_weight = num_words_in_query;

  /* Put the contribution of the words into SCORES.  If we are using
     the document event model, then loop over all words in the
     vocabulary; otherwise, just loop over all the words in the
     QUERY_WV document. */
  h_w_d = 0;
  for (wvi = 0, wi = 0;
       ((bow_event_model == bow_event_document)
	? (wi < max_wi)
	: (wvi < query_wv->num_entries));
       ((bow_event_model == bow_event_document)
	? (wi++)
	: (wvi++)))
    {
      bow_dv *dv;		/* the "document vector" for the word WI */

      /* Get information about this word.  Align WI and WVI in ways
	 that depend on whether we are looping over all words in the
	 vocabulary or over words in the query. */
      if (bow_event_model == bow_event_document)
	{
	  if (query_wv->entry[wvi].wi < wi && wvi < query_wv->num_entries)
	    {
	      assert (query_wv->entry[wvi].wi == wi - 1);
	      wvi++;
	    }
	}
      else
	{
	  wi = query_wv->entry[wvi].wi;
	}
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it. */
      if (!dv)
	continue;

      if (wi == query_wv->entry[wvi].wi && query_wv->num_entries)
	{
	  pr_w_d = ((double) query_wv->entry[wvi].count) / num_words_in_query;
	  h_w_d -= pr_w_d * log (pr_w_d);
	}

      if (bow_print_word_scores)
	printf ("%-30s (queryweight=%.8f)\n",
		bow_int2word (wi),
		query_wv->entry[wvi].weight * query_wv->normalizer);

      rescaler = DBL_MAX;

      /* Loop over all classes, putting this word's (WI's)
	 contribution into SCORES. */
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
	{
	  if (scores[ci] == IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	    continue;
	  pr_w_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, loo_class,
					    query_wv->entry[wvi].weight,
					    query_wv_total_weight,
					    &dv, &dvi);
	  /* If this is a word that does not occur in the document,
	     then use the probability it does not occur in the class.
	     This occurs only if we are using the document event
	     model. */
	  if (query_wv->num_entries == 0 || wi != query_wv->entry[wvi].wi)
	    pr_w_c = 1.0 - pr_w_c;
	  assert (pr_w_c > 0 && pr_w_c <= 1);

	  /* Put the probability in log-space. */
	  log_pr_tf = log (pr_w_c);
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  /* Take into consideration the number of times it occurs in
	     the query document. */
	  if (bow_event_model != bow_event_document)
	    log_pr_tf *= query_wv->entry[wvi].weight;
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  scores[ci] += log_pr_tf;

	  if (bow_print_word_scores)
	    {
	      bow_cdoc *cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
	      printf (" %8.2e %7.2f %-40s %10.9f\n",
		      pr_w_c, log_pr_tf,
		      (strrchr (cdoc->filename, '/') ? : cdoc->filename),
		      scores[ci]);
	    }

	  /* Keep track of the minimum score updated for this word. */
	  if (rescaler > scores[ci])
	    rescaler = scores[ci];
	}

      /* Loop over all classes, re-scaling SCORES so that they don't
	 get so small we lose floating point resolution.  This
	 scaling always keeps all SCORES positive. */
      if (naivebayes_rescale_scores
	  && rescaler < 0
	  && !naivebayes_score_returns_doc_pr)
	{
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      /* Add to SCORES to bring them close to zero.  RESCALER
		 is expected to often be less than zero here. */
	      /* xxx If this doesn't work, we could keep track of the
		 min and the max, and sum by their average. */
	      if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
		scores[ci] += -rescaler;
	      assert (scores[ci] > -DBL_MAX + 1.0e5
		      && scores[ci] < DBL_MAX - 1.0e5);
	    }
	}
    }

  /* Now SCORES[] contains an (unnormalized) log-probability of the
     document for each class. */

  /* Anneal the probability. */
  if (bow_naivebayes_anneal_temperature != 1)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	  {
#if 0
	    scores[ci] /= (query_wv_total_weight
			   + bow_naivebayes_anneal_temperature);
#elif 0
	    scores[ci] /= 1 + log (query_wv_total_weight + 1);
#elif 0
	    scores[ci] /= ((pow (query_wv_total_weight, 0.9) + 1) / 3);
#elif 1
	    scores[ci] /= ((query_wv_total_weight + 1) / 7);
#else
	    scores[ci] /= bow_naivebayes_anneal_temperature;
#endif
	    assert (scores[ci] > -FLT_MAX + 1.0e5);
	  }
    }

  /* Incorporate the class prior. */
  if (!naivebayes_score_returns_doc_pr && !bow_uniform_class_priors)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  bow_cdoc *cdoc;
	  cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
	  assert (cdoc->prior >= 0.0f && cdoc->prior <= 1.0f);
	  if (cdoc->prior == 0)
	    assert (scores[ci] == IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR);
	  else
	    scores[ci] += log (cdoc->prior);
	  assert (scores[ci] > -FLT_MAX + 1.0e5);
	  if (bow_print_word_scores)
	    printf ("%16s %-40s %10.9f\n",
		    "",
		    (strrchr (cdoc->filename, '/') ? : cdoc->filename),
		    scores[ci]);
	}
    }

  /* Rescale the SCORES one last time, this time making them all -2
     or more negative, so that exp() will work well, especially
     around the higher-probability classes. */
  if (naivebayes_final_rescale_scores && !naivebayes_score_returns_doc_pr)
    {
      rescaler = -DBL_MAX;
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	if (scores[ci] > rescaler
	    && scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	  rescaler = scores[ci];
      rescaler += 2.0;
      /* rescaler += 2.5; */
      /* RESCALER is now the maximum of the SCORES. */
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	  scores[ci] -= rescaler;
    }

  if (naivebayes_cross_entropy)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  scores[ci] /= (num_words_in_query + 1);
	  /* This makes it into KL divergence:
	     scores[ci] += h_w_d; */
	}
    }
  else if (naivebayes_binary_scoring)
    {
      int low_score_index;
      assert (barrel->cdocs->length == 2);
      if (scores[0] <= scores[1])
	low_score_index = 0;
      else
	low_score_index = 1;
      if (scores[1-low_score_index] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	scores[1-low_score_index] = -1.0 * scores[low_score_index];
    }
  else if (!naivebayes_return_log_pr)
    {
      if (naivebayes_normalize_log)
	{
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      assert (scores[ci] < 0);
	      /* scores[ci] = -1.0 / scores[ci]; */
	      scores[ci] = -1.0 / (scores[ci] * scores[ci] * scores[ci]);
	      /* scores[ci] = 1.0 / pow (-scores[ci], 2.7); */
	    }
	}
      else
	{
	  /* Use exp() on the SCORES to get probabilities from
	     log-probabilities. */
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      new_score = exp (scores[ci]);
	      /* assert (new_score > 0 && new_score < DBL_MAX - 1.0e5); */
	      if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
		scores[ci] = new_score;
	    }
	}

      /* Normalize the SCORES so they all sum to one. */
      if (!naivebayes_score_returns_doc_pr)
	{
	  double scores_sum = 0;
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	      scores_sum += scores[ci];
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
		scores[ci] /= scores_sum;
	      /* assert (scores[ci] > 0); */
	    }
	}
    }

  if (naivebayes_score_unsorted)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  bscores[ci].weight = scores[ci];
	}
    }
  else
    {
      /* Return the SCORES by putting them (and the `class indices')
	 into BSCORES in sorted order. */
      num_scores = 0;
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  if (scores[ci] == IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	    scores[ci] = -DBL_MAX;
	  if (num_scores < bscores_len
	      || bscores[num_scores-1].weight < scores[ci])
	    {
	      /* We are going to put this score and CI into BSCORES
		 because either (1) there is empty space in BSCORES,
		 or (2) SCORES[CI] is larger than the smallest score
		 there currently. */
	      int dsi;		/* an index into BSCORES */
	      if (num_scores < bscores_len)
		num_scores++;
	      dsi = num_scores - 1;
	      /* Shift down all the entries that are smaller than
		 SCORES[CI]. */
	      for (; dsi > 0 && bscores[dsi-1].weight < scores[ci]; dsi--)
		bscores[dsi] = bscores[dsi-1];
	      /* Insert the new score. */
	      bscores[dsi].weight = scores[ci];
	      bscores[dsi].di = ci;
	    }
	}
    }

  return num_scores;
}

bow_params_naivebayes bow_naivebayes_params =
{
  bow_no,			/* no uniform priors */
  bow_yes,			/* normalize_scores */
};

rainbow_method bow_method_naivebayes =
{
  "naivebayes",
  bow_naivebayes_set_weights,
  0,				/* no weight scaling function */
  NULL,				/* bow_barrel_normalize_weights_by_summing, */
  bow_barrel_new_vpc_merge_then_weight,
  bow_barrel_set_vpc_priors_by_counting,
  bow_naivebayes_score,
  bow_wv_set_weights_to_count,
  NULL,				/* no need for extra weight normalization */
  bow_barrel_free,
  &bow_naivebayes_params
};

void _register_method_naivebayes () __attribute__ ((constructor));
void _register_method_naivebayes ()
{
  bow_method_register_with_name ((bow_method*)&bow_method_naivebayes,
				 "naivebayes",
				 sizeof (rainbow_method),
				 &naivebayes_argp_child);
  bow_argp_add_child (&naivebayes_argp_child);
}
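
For reference, the numerical core of the routine above (sum tf * log P(w|c) per class, add log P(c), shift by the maximum score before exp() so the exponentials stay in range, then normalize so the posteriors sum to one) can be seen in isolation in the minimal standalone sketch below. It is not libbow code: the class and vocabulary sizes, the toy counts, and the Laplace-smoothed pr_w_c() helper are all invented for illustration; in the real code the per-word estimate comes from bow_naivebayes_pr_wi_ci() and the shifting is done incrementally via RESCALER.

/* Minimal standalone sketch of log-space naive Bayes scoring.
   All data here is toy data, invented for illustration only. */
#include <float.h>
#include <math.h>
#include <stdio.h>

#define NUM_CLASSES 2
#define VOCAB_SIZE  4

/* Toy training counts: word_counts[c][w] is how often word W was
   observed in class C. */
static const int word_counts[NUM_CLASSES][VOCAB_SIZE] = {
  { 8, 1, 3, 0 },
  { 1, 6, 0, 5 }
};
static const double class_prior[NUM_CLASSES] = { 0.5, 0.5 };

/* Laplace-smoothed P(w|c); the smoothing keeps log() finite even
   for words never seen in a class. */
static double pr_w_c (int w, int c)
{
  int total = 0, wi;
  for (wi = 0; wi < VOCAB_SIZE; wi++)
    total += word_counts[c][wi];
  return (1.0 + word_counts[c][w]) / (double) (VOCAB_SIZE + total);
}

int main (void)
{
  const int query_tf[VOCAB_SIZE] = { 2, 0, 1, 0 };  /* toy query */
  double scores[NUM_CLASSES];
  double max_score = -DBL_MAX;
  double sum = 0;
  int ci, wi;

  /* Work in log-space, as the code above does, so a product of many
     small probabilities doesn't underflow to zero. */
  for (ci = 0; ci < NUM_CLASSES; ci++)
    {
      scores[ci] = log (class_prior[ci]);
      for (wi = 0; wi < VOCAB_SIZE; wi++)
	scores[ci] += query_tf[wi] * log (pr_w_c (wi, ci));
      if (scores[ci] > max_score)
	max_score = scores[ci];
    }

  /* Shift by the max before exp(), the same job the final RESCALER
     pass does above, then normalize so the posteriors sum to one. */
  for (ci = 0; ci < NUM_CLASSES; ci++)
    sum += (scores[ci] = exp (scores[ci] - max_score));
  for (ci = 0; ci < NUM_CLASSES; ci++)
    printf ("P(class %d | query) = %f\n", ci, scores[ci] / sum);

  return 0;
}

Compiled with `gcc sketch.c -lm`, this prints one posterior per class. Subtracting the maximum leaves the normalized result unchanged while guaranteeing the largest argument to exp() is exactly zero, which is the same motivation given in the comments above for keeping the scores near zero.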