/* naivebayes.c */
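
/* NOTE: this is a mid-file fragment; the original file's top (copyright
   notice and #include list) is not shown.  The code below relies at
   least on the headers listed here -- an assumption for making the
   fragment self-contained, not the original file's include list. */
#include <bow/libbow.h>
#include <assert.h>
#include <math.h>   /* log(), rint() */
#include <stdio.h>  /* printf(), fprintf() */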
  else if (bow_smoothing_method == bow_smoothing_goodturing)
    {
      assert (barrel == bow_naivebayes_goodturing_barrel);
      /* don't adjust if above k */
      if (num_wi_ci > bow_smoothing_goodturing_k)
        pr_w_c = num_wi_ci / num_w_ci;
      /* if zero, just grab the stored weight */
      else if (num_wi_ci == 0)
        pr_w_c = bow_naivebayes_goodturing_discounts[ci][0];
      /* else adjust by discount factor */
      else
        pr_w_c = (bow_naivebayes_goodturing_discounts[ci][(int) num_wi_ci]
                  * num_wi_ci / num_w_ci);
    }
  else if (bow_smoothing_method == bow_smoothing_dirichlet)
    {
      pr_w_c = ((num_wi_ci + bow_naivebayes_dirichlet_alphas[wi])
                / (num_w_ci + bow_naivebayes_dirichlet_total));
    }
  else
    {
      bow_error ("Naivebayes does not implement smoothing method %d",
                 bow_smoothing_method);
      pr_w_c = 0;  /* to avoid gcc warning */
    }
#if 0
  if (pr_w_c <= 0)
    bow_error ("A negative word probability was calculated. "
               "This can happen if you are using\n"
               "--test-files-loo and the test files are "
               "not being lexed in the same way as they\n"
               "were when the model was built");
  assert (pr_w_c > 0 && pr_w_c <= 1);
#endif
  return pr_w_c;
}

double
bow_naivebayes_total_word_count_for_ci (bow_barrel *class_barrel, int ci)
{
  double ret = 0;
  int max_wi, wi, dvi;
  bow_dv *dv;

  max_wi = MIN (class_barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (class_barrel->wi2dvf, wi);
      for (dvi = 0; dv && dvi < dv->length; dvi++)
        if (dv->entry[dvi].di == ci)
          ret += dv->entry[dvi].weight;
    }
  return ret;
}

void
bow_naivebayes_print_word_probabilities_for_class (bow_barrel *barrel,
                                                   const char *classname)
{
  int wi;
  int ci = bow_str2int_no_add (barrel->classnames, classname);
  double pr_w;

  assert (ci >= 0);
  for (wi = 0; wi < barrel->wi2dvf->size; wi++)
    {
      pr_w = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0, NULL, NULL);
      if (pr_w >= 0)
        printf ("%20.18f %s\n", pr_w, bow_int2word (wi));
    }
  printf ("%-30s %10.8f\n", "total_count",
          bow_naivebayes_total_word_count_for_ci (barrel, ci));
}

bow_wa *
bow_naivebayes_new_odds_ratio_for_ci (bow_barrel *barrel, int the_ci)
{
  bow_wa *ret;
  int wi;
  int ci;
  int max_wi;
  bow_cdoc *cdoc;
  double pr_wi_c;
  double pr_wi_not_c;
  double class_prior_ratio;
  double pr_wi;
  double pr_not_wi;
  double ig;
  bow_dv *dv;
  int dvi;

  cdoc = bow_array_entry_at_index (barrel->cdocs, the_ci);
  class_prior_ratio = cdoc->prior / (1.0 - cdoc->prior);
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  ret = bow_wa_new (max_wi + 2);
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      /* If the model doesn't know about this word, skip it. */
      if (dv == NULL)
        continue;
      pr_wi_c = 0;
      pr_wi_not_c = 0;
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
        {
          if (the_ci == ci)
            pr_wi_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                               &dv, &dvi);
          else
            pr_wi_not_c += bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                                    &dv, &dvi);
        }
      pr_wi = pr_wi_c + pr_wi_not_c;
      pr_not_wi = (1 - pr_wi);
#if 0
      ig = (-(pr_wi * log (pr_wi) + pr_not_wi * log (pr_not_wi))
            + ((pr_wi_c * log (pr_wi_c)
                + (1 - pr_wi_c) * log (1 - pr_wi_c))));
#endif
      ig = pr_wi_c * log (pr_wi_c / pr_wi_not_c);
      bow_wa_append (ret, wi, ig);
    }
  bow_wa_sort (ret);
  return ret;
}
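
/* A minimal standalone sketch (not part of libbow) of the ranking score
   computed in bow_naivebayes_new_odds_ratio_for_ci above: for a word w
   and target class c it is P(w|c) * log (P(w|c) / P(w|~c)), where P(w|~c)
   is the sum of P(w|ci) over all other classes, as in the loop above.
   The function name and plain-scalar interface are hypothetical, for
   illustration only. */
static double
example_weighted_log_odds (double pr_w_c, double pr_w_not_c)
{
  /* Guard against log(0) or division by zero; the real loop above
     relies on smoothing to keep both probabilities positive. */
  if (pr_w_c <= 0 || pr_w_not_c <= 0)
    return 0;
  return pr_w_c * log (pr_w_c / pr_w_not_c);
}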
/* Print the top N words by odds ratio for each class. */
void
bow_naivebayes_print_odds_ratio_for_all_classes (bow_barrel *barrel, int n)
{
  int ci;
  bow_cdoc *cdoc;
  bow_wa *wa;

  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      wa = bow_naivebayes_new_odds_ratio_for_ci (barrel, ci);
      fprintf (stderr, "%s [%d words]\n", cdoc->filename, cdoc->word_count);
      bow_wa_fprintf (wa, stderr, n);
      bow_wa_free (wa);
    }
}

void
bow_naivebayes_print_odds_ratio_for_class (bow_barrel *barrel,
                                           const char *classname)
{
  int wi;
  int the_ci;
  int ci;
  int max_wi;
  bow_cdoc *cdoc;
  double pr_wi_c;
  double pr_wi_not_c;
  double class_prior_ratio;
  bow_dv *dv;
  int dvi;

  the_ci = bow_str2int_no_add (barrel->classnames, classname);
  if (the_ci == -1)
    bow_error ("%s: Classname `%s' not found", __PRETTY_FUNCTION__,
               classname);
  cdoc = bow_array_entry_at_index (barrel->cdocs, the_ci);
  class_prior_ratio = cdoc->prior / (1.0 - cdoc->prior);
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      /* If the model doesn't know about this word, skip it. */
      if (dv == NULL)
        continue;
      pr_wi_c = 0;
      pr_wi_not_c = 0;
      for (ci = 0, dvi = 0; ci < bow_barrel_num_classes (barrel); ci++)
        {
          if (the_ci == ci)
            pr_wi_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                               &dv, &dvi);
          else
            pr_wi_not_c += bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                                    &dv, &dvi);
        }
      printf ("%.10f %s\n", pr_wi_c * log (pr_wi_c / pr_wi_not_c),
              bow_int2word (wi));
    }
}

/* Get the total number of terms in each class; store this in
   CDOC->WORD_COUNT. */
void
bow_naivebayes_set_cdoc_word_count_from_wi2dvf_weights (bow_barrel *barrel)
{
  int ci;
  bow_cdoc *cdoc;
  int wi, max_wi;
  bow_dv *dv;
  int dvi;
  int num_classes = bow_barrel_num_classes (barrel);
  double num_words_per_ci[num_classes];

  for (ci = 0; ci < num_classes; ci++)
    num_words_per_ci[ci] = 0;
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (dv == NULL)
        continue;
      for (dvi = 0; dvi < dv->length; dvi++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs,
                                           dv->entry[dvi].di);
          ci = dv->entry[dvi].di;
          assert (ci < num_classes);
          num_words_per_ci[ci] += dv->entry[dvi].weight;
        }
    }
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      cdoc->word_count = (int) rint (num_words_per_ci[ci]);
    }
}
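
/* A minimal standalone sketch (not part of libbow) of the Dirichlet-
   smoothed estimate used in the bow_smoothing_dirichlet branch of
   bow_naivebayes_pr_wi_ci earlier in this file:
     P(w|c) = (N(w,c) + alpha_w) / (N(c) + sum_w' alpha_w')
   where N(c) is the per-class word total accumulated by functions like
   the one above.  With all alpha_w = 1 this reduces to Laplace
   smoothing.  The function name and arguments are hypothetical, for
   illustration only. */
static double
example_dirichlet_pr_w_c (double num_wi_ci, double num_w_ci,
                          double alpha_wi, double alpha_total)
{
  return (num_wi_ci + alpha_wi) / (num_w_ci + alpha_total);
}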
/* Function to assign `Naive Bayes'-style weights to each element of
   each document vector. */
void
bow_naivebayes_set_weights (bow_barrel *barrel)
{
  int ci;
  bow_cdoc *cdoc;
  int wi;      /* a "word index" into WI2DVF */
  int max_wi;  /* the highest "word index" in WI2DVF. */
  bow_dv *dv;  /* the "document vector" at index WI */
  int dvi;     /* an index into the DV */
  int weight_setting_num_words = 0;
  double *pr_all_w_c = alloca (barrel->cdocs->length * sizeof (double));
  double pr_w_c;
  int total_num_words = 0;
  /* Gather the word count here instead of directly in CDOC->WORD_COUNT
     so we avoid round-off error with each increment.  Remember,
     CDOC->WORD_COUNT is an int! */
  float num_words_per_ci[bow_barrel_num_classes (barrel)];
  int barrel_is_empty = 0;

  /* We assume that we have already called BOW_BARREL_NEW_VPC() on
     BARREL, so BARREL already has one-document-per-class. */
#if 0
  assert (!strcmp (barrel->method->name, "naivebayes")
          || !strcmp (barrel->method->name, "crossentropy")
          || !strcmp (barrel->method->name, "active"));
#endif
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* The CDOC->PRIOR should have been set in bow_barrel_new_vpc();
     verify it. */
  /* Get the total number of unique terms in each class; store this in
     CDOC->NORMALIZER. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      assert (cdoc->prior >= 0);
      pr_all_w_c[ci] = 0;
      cdoc->normalizer = 0;
      num_words_per_ci[ci] = 0;
    }

  /* Set the CDOC->WORD_COUNT for each class.  If we are using a
     document (binomial) model, then we'll just use the value of
     WORD_COUNT set in bow_barrel_new_vpc(), which is the total number
     of *documents* in the class, not the number of words. */
  /* Calculate P(w); store this in DV->IDF. */
  if (bow_event_model != bow_event_document)
    {
      /* Get the total number of terms in each class; store this in
         CDOC->WORD_COUNT. */
      /* Calculate the total number of unique words, and make sure it
         is the same as BARREL->WI2DVF->NUM_WORDS. */
      int num_unique_words = 0;

      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          num_unique_words++;
          dv->idf = 0.0;
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              cdoc = bow_array_entry_at_index (barrel->cdocs,
                                               dv->entry[dvi].di);
              ci = dv->entry[dvi].di;
              num_words_per_ci[ci] += dv->entry[dvi].weight;
              cdoc->normalizer++;
              dv->idf += dv->entry[dvi].weight;
              total_num_words += dv->entry[dvi].weight;
            }
        }
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
          cdoc->word_count = (int) rint (num_words_per_ci[ci]);
        }
      assert (num_unique_words == barrel->wi2dvf->num_words);

      /* Normalize the DV->IDF to sum to one across all words, so it
         is P(w). */
      if (total_num_words)
        {
          for (wi = 0; wi < max_wi; wi++)
            {
              dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
              if (dv == NULL)
                continue;
              dv->idf /= total_num_words;
            }
        }
      else
        {
          barrel_is_empty = 1;
          bow_verbosify (bow_progress, "Zero words in class barrel\n");
        }
    }

  /* Initialize smoothing methods, if necessary. */
  if (bow_smoothing_method == bow_smoothing_goodturing)
    bow_naivebayes_initialize_goodturing (barrel);
  else if (bow_smoothing_method == bow_smoothing_dirichlet)
    {
      bow_naivebayes_load_dirichlet_alphas ();
      bow_naivebayes_initialize_dirichlet_smoothing (barrel);
    }

  if (bow_event_model != bow_event_document && !barrel_is_empty)
    {
      /* Now loop through all the classes, verifying that the word
         probabilities in each class sum to one. */
      total_num_words = 0;
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          /* If the model doesn't know about this word, skip it. */
          if (dv == NULL)
            continue;
          for (ci = 0; ci < barrel->cdocs->length; ci++)
            {
              pr_w_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, -1, 0, 0,
                                                NULL, NULL);
              cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
              assert (pr_w_c <= 1);
              pr_all_w_c[ci] += pr_w_c;
            }
          weight_setting_num_words++;
        }
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          /* Is this too much round-off error to expect? */
          assert (pr_all_w_c[ci] < 1.01 && pr_all_w_c[ci] > 0.99);
        }
    }
#if 0
  fprintf (stderr, "wi2dvf num_words %d, weight-setting num_words %d\n",
           barrel->wi2dvf->num_words, weight_setting_num_words);
#endif
}

#define IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR 999.99

int
bow_naivebayes_score (bow_barrel *barrel, bow_wv *query_wv,
                      bow_score *bscores, int bscores_len, int loo_class)
{
  double *scores;  /* will become prob(class), indexed over CI */
  int ci;          /* a "class index" (document index) */
  int wvi;         /* an index into the entries of QUERY_WV. */
  int dvi;         /* an index into a "document vector" */