naivebayes.c
/* Weight-setting and scoring implementation for Naive-Bayes classification */

/* Copyright (C) 1997 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <math.h>
#include <argp/argp.h>

/* Command-line options specific to NaiveBayes */

/* Default value for option "naivebayes-m-est-m".  When zero, use
   size-of-vocabulary instead. */
static int naivebayes_argp_m_est_m = 0;

/* The integer or single char used to represent this command-line
   option.  Make sure it is unique across all libbow and rainbow. */
#define NB_M_EST_M_KEY 3001

static struct argp_option naivebayes_options[] =
{
  {"naivebayes-m-est-m", NB_M_EST_M_KEY, "M", 0,
   "When using `m'-estimates for smoothing in NaiveBayes, use M as the "
   "value for `m'.  The default is the size of vocabulary."},
  {0, 0}
};

error_t
naivebayes_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case NB_M_EST_M_KEY:
      naivebayes_argp_m_est_m = atoi (arg);
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static const struct argp naivebayes_argp =
{
  naivebayes_options,
  naivebayes_parse_opt
};

static struct argp_child naivebayes_argp_child =
{
  &naivebayes_argp,   /* This child's argp structure */
  0,                  /* flags for child */
  0,                  /* optional header in help message */
  0                   /* arbitrary group number for ordering */
};

/* End of command-line options specific to NaiveBayes */

/* For changing the weight of unseen words.  I really should implement
   `deleted interpolation'. */
/* M_EST_P summed over all words in the vocabulary must equal 1.0! */
#if 1
/* This is the special case of the M-estimate that is `Laplace smoothing' */
#define M_EST_M  (naivebayes_argp_m_est_m      \
                  ? naivebayes_argp_m_est_m    \
                  : barrel->wi2dvf->num_words)
#define M_EST_P  (1.0 / barrel->wi2dvf->num_words)
#else
/* This is a version of M-estimates where the value of M depends on
   the number of words in the class. */
#define M_EST_M  (cdoc->word_count                          \
                  ? (((float)barrel->wi2dvf->num_words)     \
                     / cdoc->word_count)                    \
                  : 1.0)
#define M_EST_P  (1.0 / barrel->wi2dvf->num_words)
#endif
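/* [Illustrative sketch; not part of the original libbow source.]
   The macros above implement the m-estimate
       Pr(w|c) = (N(w,c) + m*p) / (N(c) + m),
   where N(w,c) is the count of word w in class c, N(c) is the total
   word count of class c, and p is the prior probability of w.  With
   the defaults m = |V| (vocabulary size) and p = 1/|V|, this reduces
   to Laplace smoothing: (N(w,c) + 1) / (N(c) + |V|).  For example,
   with N(w,c) = 3, N(c) = 100 and |V| = 50000, we get 4/50100, about
   8.0e-5.  A minimal standalone version, taking the counts as plain
   arguments instead of reading them from a barrel: */
static double
m_estimate_pr_w_c (double n_w_c, double n_c, double m, double p)
{
  /* Words unseen in the class (n_w_c == 0) still get mass m*p. */
  return (n_w_c + m * p) / (n_c + m);
}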
/* Function to assign `Naive Bayes'-style weights to each element of
   each document vector. */
void
bow_naivebayes_set_weights (bow_barrel *barrel)
{
  int ci;
  bow_cdoc *cdoc;
  int wi;                      /* a "word index" into WI2DVF */
  int max_wi;                  /* the highest "word index" in WI2DVF. */
  bow_dv *dv;                  /* the "document vector" at index WI */
  int dvi;                     /* an index into the DV */
  int weight_setting_num_words = 0;
  float *pr_all_w_c = alloca (barrel->cdocs->length * sizeof (float));
  float pr_w_c;

  /* We assume that we have already called BOW_BARREL_NEW_VPC() on
     BARREL, so BARREL already has one-document-per-class. */
  assert (!strcmp (barrel->method->name, "naivebayes")
          || !strcmp (barrel->method->name, "crossentropy"));

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* The CDOC->PRIOR should have been set in bow_barrel_new_vpc();
     verify it. */
  /* Get the total number of unique terms in each class; store this in
     CDOC->NORMALIZER. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      assert (cdoc->prior >= 0);
      pr_all_w_c[ci] = 0;
      cdoc->normalizer = 0;
    }

#if 0
  /* For Shumeet, make all counts either 0 or 1. */
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (dv == NULL)
        continue;
      for (dvi = 0; dvi < dv->length; dvi++)
        {
          assert (dv->entry[dvi].count);
          dv->entry[dvi].count = 1;
        }
    }
  /* And set uniform priors */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      cdoc->prior = 1.0;
    }
#endif

  /* If BOW_BINARY_WORD_COUNTS is true, then we'll just use the value
     of WORD_COUNT set in bow_barrel_new_vpc(), which is the total
     number of *documents* in the class, not the number of words. */
  if (!bow_binary_word_counts)
    {
      /* Get the total number of terms in each class; store this in
         CDOC->WORD_COUNT. */
      /* Calculate the total number of unique words, and make sure it
         is the same as BARREL->WI2DVF->NUM_WORDS. */
      int num_unique_words = 0;

      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
          cdoc->word_count = 0;
        }
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          num_unique_words++;
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              cdoc = bow_array_entry_at_index (barrel->cdocs,
                                               dv->entry[dvi].di);
              cdoc->word_count += dv->entry[dvi].count;
              cdoc->normalizer++;
            }
        }
      assert (num_unique_words == barrel->wi2dvf->num_words);
    }

  /* Set the weights in the BARREL's WI2DVF so that they are equal to
     P(w|C), the probability of a word given a class. */
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it. */
      if (dv == NULL)
        continue;

      /* Now loop through all the classes, setting their weights */
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
          while (dvi < dv->length && dv->entry[dvi].di < ci)
            dvi++;
          if (dv && dvi < dv->length && dv->entry[dvi].di == ci)
            {
              pr_w_c = ((float)
                        ((M_EST_M * M_EST_P) + dv->entry[dvi].count)
                        / (M_EST_M + cdoc->word_count));
              dv->entry[dvi].weight = pr_w_c;
            }
          else
            {
              pr_w_c = ((M_EST_M * M_EST_P)
                        / (M_EST_M + cdoc->word_count));
            }
          assert (pr_w_c <= 1);
          pr_all_w_c[ci] += pr_w_c;
        }
      weight_setting_num_words++;
      /* Set the IDF.  NaiveBayes doesn't use it; make it have no effect. */
      dv->idf = 1.0;
    }

  /* Check that [Sum_w Pr(w|c)] sums to 1 for all classes. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      /* Is this too much round-off error to expect? */
      assert (pr_all_w_c[ci] < 1.01 && pr_all_w_c[ci] > 0.99);
    }

#if 0
  fprintf (stderr, "wi2dvf num_words %d, weight-setting num_words %d\n",
           barrel->wi2dvf->num_words, weight_setting_num_words);
#endif
}
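/* [Illustrative note; not part of the original libbow source.]
   Why the assert over PR_ALL_W_C above should hold: every word in the
   vocabulary gets Pr(w|c) = (N(w,c) + 1) / (N(c) + |V|) under the
   default Laplace smoothing, with N(w,c) = 0 for words unseen in the
   class.  Summing over the whole vocabulary,
       Sum_w Pr(w|c) = (Sum_w N(w,c) + |V|) / (N(c) + |V|)
                     = (N(c) + |V|) / (N(c) + |V|) = 1,
   so each PR_ALL_W_C[ci] should differ from 1 only by accumulated
   float round-off, hence the 0.99..1.01 tolerance. */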
int
bow_naivebayes_score (bow_barrel *barrel, bow_wv *query_wv,
                      bow_score *bscores, int bscores_len,
                      int loo_class)
{
  double *scores;          /* will become prob(class), indexed over CI */
  int ci;                  /* a "class index" (document index) */
  int wvi;                 /* an index into the entries of QUERY_WV. */
  int dvi;                 /* an index into a "document vector" */
  float pr_w_c;            /* P(w|C), prob a word is in a class */
  double log_pr_tf;        /* log(P(w|C)^TF), ditto, log() of it */
  double rescaler;         /* Rescale SCORES by this after each word */
  double new_score;        /* a temporary holder */
  int num_scores;          /* number of entries placed in SCORES */

  /* Allocate space to store scores for *all* classes (documents). */
  scores = alloca (barrel->cdocs->length * sizeof (double));

  /* Instead of multiplying probabilities, we will sum up
     log-probabilities (so we don't lose floating point resolution),
     and then take the exponent of them to get probabilities back. */
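/* [Illustrative sketch; not part of the original libbow source.]
   The comment above describes the standard log-space trick: sum
   log Pr(c) + Sum_w tf(w) * log Pr(w|c) per class, then convert back
   to probabilities.  A minimal version of that conversion, assuming a
   hypothetical LOG_SCORES array already filled with those per-class
   sums (exp() comes from <math.h>, included at the top of this
   file): */
static void
log_scores_to_probs (const double *log_scores, double *probs,
                     int num_classes)
{
  double max = log_scores[0];
  double sum = 0.0;
  int ci;

  /* Shift by the maximum so every exp() argument is <= 0; this
     prevents underflow without changing the normalized result. */
  for (ci = 1; ci < num_classes; ci++)
    if (log_scores[ci] > max)
      max = log_scores[ci];
  for (ci = 0; ci < num_classes; ci++)
    {
      probs[ci] = exp (log_scores[ci] - max);
      sum += probs[ci];
    }
  /* Normalize so the per-class probabilities sum to 1. */
  for (ci = 0; ci < num_classes; ci++)
    probs[ci] /= sum;
}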