📄 naivebayes.c
字号:
/* Initialize the SCORES to the class prior probabilities. */
  /* NOTE(review): "PROBABILIES" below is misspelled, but it is runtime
     output, so it is left untouched here. */
  if (bow_print_word_scores)
    printf ("%s\n", "(CLASS PRIOR PROBABILIES)");
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      if (bow_uniform_class_priors)
	/* Uniform prior means each class has probability 1/#classes,
	   i.e. a log-prior of -log(#classes). */
	scores[ci] = - log (barrel->cdocs->length);
      else
	{
#if 0
	  /* For now forget about this little detail, because rainbow-h
	     trips up on it. */
	  /* LOO_CLASS is not implemented for cases in which we are not
	     doing uniform class priors. */
	  assert (loo_class == -1);
#endif
	  /* A zero or out-of-range prior would make the log() below
	     blow up. */
	  assert (cdoc->prior > 0.0f && cdoc->prior <= 1.0f);
	  scores[ci] = log (cdoc->prior);
	}
      /* Guard against log() having returned (near) -infinity. */
      assert (scores[ci] > -FLT_MAX + 1.0e5);
      if (bow_print_word_scores)
	printf ("%16s %-40s %10.9f\n", "",
		(strrchr (cdoc->filename, '/') ? : cdoc->filename),
		scores[ci]);
    }

  /* Loop over each word in the word vector QUERY_WV, putting its
     contribution into SCORES. */
  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      int wi;			/* the word index for the word at WVI */
      bow_dv *dv;		/* the "document vector" for the word WI */

      /* Get information about this word. */
      wi = query_wv->entry[wvi].wi;
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it. */
      if (!dv)
	continue;

      if (bow_print_word_scores)
	printf ("%-30s (queryweight=%.8f)\n",
		bow_int2word (wi),
		query_wv->entry[wvi].weight * query_wv->normalizer);

      /* RESCALER will track the minimum class score updated for this
	 word; it is used after the class loop to shift scores back
	 toward zero. */
      rescaler = DBL_MAX;

      /* Loop over all classes, putting this word's (WI's) contribution
	 into SCORES. */
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
	{
	  bow_cdoc *cdoc;
	  cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
	  assert (cdoc->type == model);

	  /* Assign PR_W_C to P(w|C), either using a DV entry, or, if
	     there is no DV entry for this class, using M-estimate
	     smoothing. */

	  /* DV entries are ordered by their .di index; advance DVI to
	     the first entry at or past class CI. */
	  if (dv)
	    while (dvi < dv->length && dv->entry[dvi].di < ci)
	      dvi++;

	  if (dv && dvi < dv->length && dv->entry[dvi].di == ci)
	    {
	      /* This class has a count for word WI. */
	      if (loo_class == ci)
		{
		  /* Leave-one-out: subtract the query document's own
		     counts from the model before estimating P(w|C). */
		  /* xxx This is not exactly right, because
		     BARREL->WI2DVF->NUM_WORDS might have changed with
		     the removal of QUERY_WV's document. */
		  pr_w_c = ((float)
			    ((M_EST_M * M_EST_P)
			     + dv->entry[dvi].count
			     - query_wv->entry[wvi].count)
			    / (M_EST_M + cdoc->word_count
			       - query_wv->entry[wvi].count));
		  if (pr_w_c <= 0)
		    bow_error ("A negative word probability was calculated. "
			       "This can happen if you are using\n"
			       "--test-files-loo and the test files are "
			       "not being lexed in the same way as they\n"
			       "were when the model was built");
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	      else
		{
		  pr_w_c = ((float)
			    ((M_EST_M * M_EST_P) + dv->entry[dvi].count)
			    / (M_EST_M + cdoc->word_count));
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	    }
	  else
	    {
	      /* No DV entry for this class: the word count is zero, so
		 the M-estimate reduces to its smoothing term alone. */
	      if (loo_class == ci)
		{
		  /* xxx This is not exactly right, because
		     BARREL->WI2DVF->NUM_WORDS might have changed with
		     the removal of QUERY_WV's document. */
		  pr_w_c = ((M_EST_M * M_EST_P)
			    / (M_EST_M + cdoc->word_count
			       - query_wv->entry[wvi].count));
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	      else
		{
		  pr_w_c = ((M_EST_M * M_EST_P)
			    / (M_EST_M + cdoc->word_count));
		  assert (pr_w_c > 0 && pr_w_c <= 1);
		}
	    }
	  assert (pr_w_c > 0 && pr_w_c <= 1);

	  log_pr_tf = log (pr_w_c);
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  /* Take into consideration the number of times it occurs in
	     the query document. */
	  log_pr_tf *= query_wv->entry[wvi].count;
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  scores[ci] += log_pr_tf;

	  if (bow_print_word_scores)
	    printf (" %8.2e %7.2f %-40s %10.9f\n",
		    pr_w_c, log_pr_tf,
		    (strrchr (cdoc->filename, '/') ? : cdoc->filename),
		    scores[ci]);

	  /* Keep track of the minimum score updated for this word. */
	  if (rescaler > scores[ci])
	    rescaler = scores[ci];
	}

      /* Loop over all classes, re-scaling SCORES so that they don't
	 get so small we loose floating point resolution.  This scaling
	 always keeps all SCORES positive. */
      if (rescaler < 0)
	{
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      /* Add to SCORES to bring them close to zero.  RESCALER
		 is expected to often be less than zero here. */
	      /* xxx If this doesn't work, we could keep track of the
		 min and the max, and sum by their average. */
	      scores[ci] += -rescaler;
	      assert (scores[ci] > -DBL_MAX + 1.0e5
		      && scores[ci] < DBL_MAX - 1.0e5);
	    }
	}
    }
  /* Now SCORES[] contains a (unnormalized) log-probability for each
     class. */

  /* Rescale the SCORE one last time, this time making them all 0 or
     negative, so that exp() will work well, especially around the
     higher-probability classes. */
  {
    rescaler = -DBL_MAX;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      if (scores[ci] > rescaler)
	rescaler = scores[ci];
    /* RESCALER is now the maximum of the SCORES. */
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      scores[ci] -= rescaler;
  }

  /* Use exp() on the SCORES to get probabilities from
     log-probabilities. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      new_score = exp (scores[ci]);
      /* assert (new_score > 0 && new_score < DBL_MAX - 1.0e5); */
      scores[ci] = new_score;
    }

  /* Normalize the SCORES so they all sum to one. */
  {
    double scores_sum = 0;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      scores_sum += scores[ci];
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      {
	scores[ci] /= scores_sum;
	/* assert (scores[ci] > 0); */
      }
  }

  /* Return the SCORES by putting them (and the `class indices') into
     SCORES in sorted order. */
  {
    num_scores = 0;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      {
	/* NOTE(review): if BSCORES_LEN were ever 0, the second operand
	   below would read bscores[-1]; presumably callers guarantee
	   BSCORES_LEN >= 1 -- confirm. */
	if (num_scores < bscores_len
	    || bscores[num_scores-1].weight < scores[ci])
	  {
	    /* We are going to put this score and CI into SCORES
	       because either: (1) there is empty space in SCORES, or
	       (2) SCORES[CI] is larger than the smallest score there
	       currently. */
	    int dsi;		/* an index into SCORES */
	    if (num_scores < bscores_len)
	      num_scores++;
	    dsi = num_scores - 1;
	    /* Shift down all the entries that are smaller than
	       SCORES[CI] */
	    for (; dsi > 0 && bscores[dsi-1].weight < scores[ci]; dsi--)
	      bscores[dsi] = bscores[dsi-1];
	    /* Insert the new score */
	    bscores[dsi].weight = scores[ci];
	    bscores[dsi].di = ci;
	  }
      }
  }
  return num_scores;
}

/* Parameters for the naive Bayes method: class priors are estimated
   from the data (not uniform), and final scores are normalized to sum
   to one. */
bow_params_naivebayes bow_naivebayes_params =
{
  bow_no,			/* no uniform priors */
  bow_yes,			/* normalize_scores */
};

/* The naive Bayes method vtable registered with the Bow framework. */
bow_method bow_method_naivebayes =
{
  "naivebayes",
  bow_naivebayes_set_weights,
  0,				/* no weight scaling function */
  NULL,				/* bow_barrel_normalize_weights_by_summing, */
  bow_barrel_new_vpc_merge_then_weight,
  bow_barrel_set_vpc_priors_by_counting,
  bow_naivebayes_score,
  bow_wv_set_weights_to_count,
  NULL,				/* no need for extra weight normalization */
  &bow_naivebayes_params
};

/* Register the method (and its argp command-line options) automatically
   at program startup, via GCC's constructor attribute. */
void _register_method_naivebayes () __attribute__ ((constructor));
void _register_method_naivebayes ()
{
  bow_method_register_with_name (&bow_method_naivebayes, "naivebayes");
  bow_argp_add_child (&naivebayes_argp_child);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -