naivebayes.c
  double pr_w_c;		/* P(w|C), prob a word is in a class */
  double log_pr_tf;		/* log(P(w|C)^TF), ditto, log() of it */
  double rescaler;		/* Rescale SCORES by this after each word */
  double new_score;		/* a temporary holder */
  int num_scores = 0;		/* number of entries placed in SCORES */
  int num_words_in_query = 0;
  double pr_w_d;		/* P(w|d) */
  double h_w_d;			/* entropy of P(W|d) */
  int wi;
  int hi;
  int max_wi;
  double query_wv_total_weight;

  /* Binomial event model with LOO processing doesn't work yet. */
  assert (bow_event_model != bow_event_document || loo_class == -1);

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* Allocate space to store scores for *all* classes (documents). */
  scores = alloca (barrel->cdocs->length * sizeof (double));

  /* Instead of multiplying probabilities, we will sum up
     log-probabilities (so we don't lose floating point resolution),
     and then take the exponent of them to get probabilities back. */

  /* Initialize the SCORES to the class prior probabilities. */
  if (bow_print_word_scores)
    printf ("%s\n", "(CLASS PRIOR PROBABILITIES)");

  /* Clear the NAME fields of the result array. */
  for (hi = 0; hi < bscores_len; hi++)
    bscores[hi].name = NULL;

  /* Initialize log-probabilities to 0 (unless class prior is zero). */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      if (cdoc->prior == 0)
	scores[ci] = IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR;
      else
	scores[ci] = 0;
    }

  /* Get the total number of words in this query.  The leading `1 ||'
     makes this run unconditionally, not just for leave-one-out
     evaluation or cross-entropy scoring. */
  if (1 || loo_class >= 0 || naivebayes_cross_entropy)
    {
      bow_dv *dv;
      num_words_in_query = 0;
      for (wvi = 0; wvi < query_wv->num_entries; wvi++)
	{
	  /* Only count those words that are in the model's vocabulary. */
	  dv = bow_wi2dvf_dv (barrel->wi2dvf, query_wv->entry[wvi].wi);
	  if (dv)
	    num_words_in_query += query_wv->entry[wvi].count;
	}
    }

  /* Set the weights of the QUERY_WV, according to the event model. */
  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      if (bow_event_model == bow_event_document_then_word)
	query_wv->entry[wvi].weight =
	  bow_event_document_then_word_document_length
	  * ((float) query_wv->entry[wvi].count) / num_words_in_query;
      else
	query_wv->entry[wvi].weight = query_wv->entry[wvi].count;
    }
  if (bow_event_model == bow_event_document_then_word)
    query_wv_total_weight = bow_event_document_then_word_document_length;
  else
    query_wv_total_weight = num_words_in_query;

  /* Put the contribution of the words into SCORES.  If we are using
     the document event model, then loop over all words in the
     vocabulary; otherwise, just loop over all the words in the
     QUERY_WV document. */
  h_w_d = 0;
  for (wvi = 0, wi = 0;
       ((bow_event_model == bow_event_document)
	? (wi < max_wi)
	: (wvi < query_wv->num_entries));
       ((bow_event_model == bow_event_document)
	? (wi++)
	: (wvi++)))
    {
      bow_dv *dv;		/* the "document vector" for the word WI */

      /* Get information about this word.  Align WI and WVI in ways
	 that depend on whether we are looping over all words in the
	 vocabulary or over words in the query. */
      if (bow_event_model == bow_event_document)
	{
	  if (query_wv->entry[wvi].wi < wi && wvi < query_wv->num_entries)
	    {
	      assert (query_wv->entry[wvi].wi == wi - 1);
	      wvi++;
	    }
	}
      else
	{
	  wi = query_wv->entry[wvi].wi;
	}
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it. */
      if (!dv)
	continue;

      if (wi == query_wv->entry[wvi].wi && query_wv->num_entries)
	{
	  pr_w_d = ((double) query_wv->entry[wvi].count) / num_words_in_query;
	  h_w_d -= pr_w_d * log (pr_w_d);
	}

      if (bow_print_word_scores)
	printf ("%-30s (queryweight=%.8f)\n",
		bow_int2word (wi),
		query_wv->entry[wvi].weight * query_wv->normalizer);

      rescaler = DBL_MAX;

      /* Loop over all classes, putting this word's (WI's)
	 contribution into SCORES. */
      for (ci = 0, dvi = 0; ci < barrel->cdocs->length; ci++)
	{
	  if (scores[ci] == IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	    continue;
	  pr_w_c = bow_naivebayes_pr_wi_ci (barrel, wi, ci, loo_class,
					    query_wv->entry[wvi].weight,
					    query_wv_total_weight,
					    &dv, &dvi);
	  /* If this is a word that does not occur in the document,
	     then use the probability it does not occur in the class.
	     This occurs only if we are using the document event
	     model. */
	  if (query_wv->num_entries == 0 || wi != query_wv->entry[wvi].wi)
	    pr_w_c = 1.0 - pr_w_c;
	  assert (pr_w_c > 0 && pr_w_c <= 1);

	  /* Put the probability in log-space. */
	  log_pr_tf = log (pr_w_c);
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  /* Take into consideration the number of times it occurs in
	     the query document. */
	  if (bow_event_model != bow_event_document)
	    log_pr_tf *= query_wv->entry[wvi].weight;
	  assert (log_pr_tf > -FLT_MAX + 1.0e5);

	  scores[ci] += log_pr_tf;

	  if (bow_print_word_scores)
	    {
	      bow_cdoc *cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
	      printf (" %8.2e %7.2f %-40s %10.9f\n",
		      pr_w_c, log_pr_tf,
		      (strrchr (cdoc->filename, '/') ? : cdoc->filename),
		      scores[ci]);
	    }

	  /* Keep track of the minimum score updated for this word. */
	  if (rescaler > scores[ci])
	    rescaler = scores[ci];
	}

      /* Loop over all classes, re-scaling SCORES so that they don't
	 get so small we lose floating point resolution.  This
	 scaling always keeps all SCORES positive. */
      if (naivebayes_rescale_scores
	  && rescaler < 0
	  && !naivebayes_score_returns_doc_pr)
	{
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      /* Add to SCORES to bring them close to zero.  RESCALER
		 is expected to often be less than zero here. */
	      /* xxx If this doesn't work, we could keep track of the
		 min and the max, and sum by their average. */
	      if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
		scores[ci] += -rescaler;
	      assert (scores[ci] > -DBL_MAX + 1.0e5
		      && scores[ci] < DBL_MAX - 1.0e5);
	    }
	}
    }

  /* Now SCORES[] contains an (unnormalized) log-probability of the
     document for each class. */

  /* Anneal the probability. */
  if (bow_naivebayes_anneal_temperature != 1)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	  {
#if 0
	    scores[ci] /= (query_wv_total_weight
			   + bow_naivebayes_anneal_temperature);
#elif 0
	    scores[ci] /= 1 + log (query_wv_total_weight + 1);
#elif 0
	    scores[ci] /= ((pow (query_wv_total_weight, 0.9) + 1) / 3);
#elif 1
	    scores[ci] /= ((query_wv_total_weight + 1) / 7);
#else
	    scores[ci] /= bow_naivebayes_anneal_temperature;
#endif
	    assert (scores[ci] > -FLT_MAX + 1.0e5);
	  }
    }

  /* Incorporate the class prior. */
  if (!naivebayes_score_returns_doc_pr && !bow_uniform_class_priors)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  bow_cdoc *cdoc;
	  cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
	  assert (cdoc->prior >= 0.0f && cdoc->prior <= 1.0f);
	  if (cdoc->prior == 0)
	    assert (scores[ci] == IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR);
	  else
	    scores[ci] += log (cdoc->prior);
	  assert (scores[ci] > -FLT_MAX + 1.0e5);
	  if (bow_print_word_scores)
	    printf ("%16s %-40s %10.9f\n",
		    "",
		    (strrchr (cdoc->filename, '/') ? : cdoc->filename),
		    scores[ci]);
	}
    }

  /* Rescale the SCORES one last time, this time making them all -2
     or more negative, so that exp() will work well, especially
     around the higher-probability classes. */
  if (naivebayes_final_rescale_scores && !naivebayes_score_returns_doc_pr)
    {
      rescaler = -DBL_MAX;
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	if (scores[ci] > rescaler
	    && scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	  rescaler = scores[ci];
      rescaler += 2.0;
      /* rescaler += 2.5; */
      /* RESCALER is now the maximum of the SCORES. */
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	  scores[ci] -= rescaler;
    }

  if (naivebayes_cross_entropy)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  scores[ci] /= (num_words_in_query + 1);
	  /* This makes it into KL divergence:
	     scores[ci] += h_w_d; */
	}
    }
  else if (naivebayes_binary_scoring)
    {
      int low_score_index;
      assert (barrel->cdocs->length == 2);
      if (scores[0] <= scores[1])
	low_score_index = 0;
      else
	low_score_index = 1;
      if (scores[1-low_score_index] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	scores[1-low_score_index] = -1.0 * scores[low_score_index];
    }
  else if (!naivebayes_return_log_pr)
    {
      if (naivebayes_normalize_log)
	{
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      assert (scores[ci] < 0);
	      /* scores[ci] = -1.0 / scores[ci]; */
	      scores[ci] = -1.0 / (scores[ci] * scores[ci] * scores[ci]);
	      /* scores[ci] = 1.0 / pow (-scores[ci], 2.7); */
	    }
	}
      else
	{
	  /* Use exp() on the SCORES to get probabilities from
	     log-probabilities. */
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      new_score = exp (scores[ci]);
	      /* assert (new_score > 0 && new_score < DBL_MAX - 1.0e5); */
	      if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
		scores[ci] = new_score;
	    }
	}

      /* Normalize the SCORES so they all sum to one. */
      if (!naivebayes_score_returns_doc_pr)
	{
	  double scores_sum = 0;
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	      scores_sum += scores[ci];
	  for (ci = 0; ci < barrel->cdocs->length; ci++)
	    {
	      if (scores[ci] != IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
		scores[ci] /= scores_sum;
	      /* assert (scores[ci] > 0); */
	    }
	}
    }

  if (naivebayes_score_unsorted)
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  bscores[ci].weight = scores[ci];
	}
    }
  else
    {
      /* Return the SCORES by putting them (and the `class indices')
	 into BSCORES in sorted order. */
      num_scores = 0;
      for (ci = 0; ci < barrel->cdocs->length; ci++)
	{
	  if (scores[ci] == IMPOSSIBLE_SCORE_FOR_ZERO_CLASS_PRIOR)
	    scores[ci] = -DBL_MAX;
	  if (num_scores < bscores_len
	      || bscores[num_scores-1].weight < scores[ci])
	    {
	      /* We are going to put this score and CI into BSCORES
		 because either (1) there is empty space in BSCORES,
		 or (2) SCORES[CI] is larger than the smallest score
		 there currently. */
	      int dsi;		/* an index into BSCORES */
	      if (num_scores < bscores_len)
		num_scores++;
	      dsi = num_scores - 1;
	      /* Shift down all the entries that are smaller than
		 SCORES[CI]. */
	      for (; dsi > 0 && bscores[dsi-1].weight < scores[ci]; dsi--)
		bscores[dsi] = bscores[dsi-1];
	      /* Insert the new score. */
	      bscores[dsi].weight = scores[ci];
	      bscores[dsi].di = ci;
	    }
	}
    }

  return num_scores;
}

bow_params_naivebayes bow_naivebayes_params =
{
  bow_no,			/* no uniform priors */
  bow_yes,			/* normalize_scores */
};

rainbow_method bow_method_naivebayes =
{
  "naivebayes",
  bow_naivebayes_set_weights,
  0,				/* no weight scaling function */
  NULL,				/* bow_barrel_normalize_weights_by_summing, */
  bow_barrel_new_vpc_merge_then_weight,
  bow_barrel_set_vpc_priors_by_counting,
  bow_naivebayes_score,
  bow_wv_set_weights_to_count,
  NULL,				/* no need for extra weight normalization */
  bow_barrel_free,
  &bow_naivebayes_params
};

void _register_method_naivebayes () __attribute__ ((constructor));
void _register_method_naivebayes ()
{
  bow_method_register_with_name ((bow_method*)&bow_method_naivebayes,
				 "naivebayes",
				 sizeof (rainbow_method),
				 &naivebayes_argp_child);
  bow_argp_add_child (&naivebayes_argp_child);
}
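
For reference, the numerical core of the routine above (sum tf * log P(w|c) per class, add log P(c), shift by the maximum score before exp() so the exponentials stay in range, then normalize so the posteriors sum to one) can be seen in isolation in the minimal standalone sketch below. It is not libbow code: the class and vocabulary sizes, the toy counts, and the Laplace-smoothed pr_w_c() helper are all invented for illustration; in the real code the per-word estimate comes from bow_naivebayes_pr_wi_ci() and the shifting is done incrementally via RESCALER.

/* Minimal standalone sketch of log-space naive Bayes scoring.
   All data here is toy data, invented for illustration only. */
#include <float.h>
#include <math.h>
#include <stdio.h>

#define NUM_CLASSES 2
#define VOCAB_SIZE  4

/* Toy training counts: word_counts[c][w] is how often word W was
   observed in class C. */
static const int word_counts[NUM_CLASSES][VOCAB_SIZE] = {
  { 8, 1, 3, 0 },
  { 1, 6, 0, 5 }
};
static const double class_prior[NUM_CLASSES] = { 0.5, 0.5 };

/* Laplace-smoothed P(w|c); the smoothing keeps log() finite even
   for words never seen in a class. */
static double pr_w_c (int w, int c)
{
  int total = 0, wi;
  for (wi = 0; wi < VOCAB_SIZE; wi++)
    total += word_counts[c][wi];
  return (1.0 + word_counts[c][w]) / (double) (VOCAB_SIZE + total);
}

int main (void)
{
  const int query_tf[VOCAB_SIZE] = { 2, 0, 1, 0 };  /* toy query */
  double scores[NUM_CLASSES];
  double max_score = -DBL_MAX;
  double sum = 0;
  int ci, wi;

  /* Work in log-space, as the code above does, so a product of many
     small probabilities doesn't underflow to zero. */
  for (ci = 0; ci < NUM_CLASSES; ci++)
    {
      scores[ci] = log (class_prior[ci]);
      for (wi = 0; wi < VOCAB_SIZE; wi++)
	scores[ci] += query_tf[wi] * log (pr_w_c (wi, ci));
      if (scores[ci] > max_score)
	max_score = scores[ci];
    }

  /* Shift by the max before exp(), the same job the final RESCALER
     pass does above, then normalize so the posteriors sum to one. */
  for (ci = 0; ci < NUM_CLASSES; ci++)
    sum += (scores[ci] = exp (scores[ci] - max_score));
  for (ci = 0; ci < NUM_CLASSES; ci++)
    printf ("P(class %d | query) = %f\n", ci, scores[ci] / sum);

  return 0;
}

Compiled with `gcc sketch.c -lm`, this prints one posterior per class. Subtracting the maximum leaves the normalized result unchanged while guaranteeing the largest argument to exp() is exactly zero, which is the same motivation given in the comments above for keeping the scores near zero.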