📄 em.c
            }
          }
        }
        else
          bow_error ("No method for this type.");

        if (num_tested % 100 == 0)
          bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", num_tested);
        num_tested++;
      }
      bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d\n", num_tested);
    }

    /* Lower the temperature if doing DA */
    if (em_anneal)
    {
      em_temperature *= em_temp_reduction;

      /* if temperature hits bottom, finish up */
      if (em_temperature < 1.0)
      {
        em_temperature = 1.0;
        em_anneal = 0;
        em_runs = 1;
      }
      bow_verbosify (bow_progress, "Lowering temperature to %f\n",
                     em_temperature);
    }
  }

  /* don't free class_probs for now.  Need them if doing LOO */
#if 0
  /* fix back up the doc barrel... dealloc class_probs (wrong size!) */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
  {
    bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    bow_free (cdoc->class_probs);
    cdoc->class_probs = NULL;
  }
#endif

#if 0
  /* if halting by perplexity reduction, return the previous round's
     barrel */
  if (em_halt_using_perplexity)
  {
    bow_wi2dvf_free (vpc_barrel->wi2dvf);
    vpc_barrel->wi2dvf = prev_wi2dvf;
    for (ci = 0; ci < max_new_ci; ci++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      cdoc->prior = prev_priors[ci];
      cdoc->word_count = prev_word_counts[ci];
      cdoc->normalizer = prev_normalizers[ci];
    }
  }
#endif

  bow_em_making_barrel = 0;
  return vpc_barrel;
}
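/* The geometric cooling schedule above multiplies EM_TEMPERATURE by
   EM_TEMP_REDUCTION once per round and drops back to plain EM when the
   temperature reaches 1.0.  The helper below is an illustrative sketch
   of that schedule, not part of the original em.c; it just counts how
   many rounds the loop above will spend annealing.  For example, an
   initial temperature of 200 with a reduction factor of 0.9 cools for
   51 rounds, since 0.9^51 * 200 < 1.0 <= 0.9^50 * 200. */
static int
em_da_cooling_rounds (double initial_temperature, double temp_reduction)
{
  int rounds = 0;
  double t = initial_temperature;

  assert (temp_reduction > 0.0 && temp_reduction < 1.0);
  while (t > 1.0)
  {
    t *= temp_reduction;
    rounds++;
  }
  return rounds;
}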
/* Calculate the perplexity of specified documents */
double
em_calculate_perplexity (bow_barrel *doc_barrel, bow_barrel *class_barrel)
{
  bow_dv_heap *test_heap;      /* we'll extract test WV's from here */
  bow_wv *query_wv;
  int di;                      /* a document index */
  bow_score *hits;
  int num_hits_to_retrieve = bow_barrel_num_classes (class_barrel);
  int actual_num_hits;
  bow_cdoc *doc_cdoc;
  double log_prob_of_data = 0;
  double *class_probs;
  int hi;
  int ci;
  double rescaler;
  double scores_sum;
  double num_data_words = 0;
  int num_tested = 0;
  int wvi;
  bow_dv *dv;

  /* turn this on so scoring knows to return perplexities */
  bow_em_calculating_perplexity = 1;

  bow_verbosify (bow_progress, "\nCalculating perplexity: ");

  /* Create the heap from which we'll get WV's.  Initialize QUERY_WV
     so BOW_HEAP_NEXT_WV() knows not to try to free. */
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);
  class_probs = alloca (sizeof (double) * num_hits_to_retrieve);
  test_heap = bow_test_new_heap (doc_barrel);
  query_wv = NULL;

  /* Loop once for each validation document. */
  while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                 em_perplexity_docs)) != -1)
  {
    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    bow_wv_set_weights (query_wv, class_barrel);
    bow_wv_normalize_weights (query_wv, class_barrel);
    actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits,
                                        num_hits_to_retrieve,
                                        (em_perplexity_loo
                                         ? (int) doc_cdoc->class_probs
                                         : (int) NULL));
    assert (actual_num_hits == num_hits_to_retrieve);

    /* calculate class probabilities by normalizing scores and adding
       in the class priors */
    {
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] = 2;
      for (hi = 0; hi < num_hits_to_retrieve; hi++)
        class_probs[hits[hi].di] = hits[hi].weight;

      /* check they all got set ok */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        assert (class_probs[ci] != 2);

      /* add in the class priors */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
      {
        bow_cdoc *cdoc = bow_array_entry_at_index (class_barrel->cdocs, ci);
        class_probs[ci] += log (cdoc->prior);
      }

      /* Rescale the class_probs making them all 0 or negative, so
         that exp() will work well, especially around the
         higher-probability classes. */
      rescaler = -DBL_MAX;
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        if (class_probs[ci] > rescaler)
          rescaler = class_probs[ci];
      /* RESCALER is now the maximum of the class_probs. */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] -= rescaler;

      /* Use exp() on the class_probs to get probabilities from
         log-probabilities. */
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] = exp (class_probs[ci]);

      /* If multi-hump neg, zero out the positive class */
      if (doc_cdoc->type == bow_doc_train
          && bow_em_multi_hump_neg > 1
          && doc_cdoc->class != binary_pos_ci)
        class_probs[binary_pos_ci] = 0;

      /* Normalize the class_probs so they all sum to one. */
      scores_sum = 0;
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        scores_sum += class_probs[ci];
      for (ci = 0; ci < num_hits_to_retrieve; ci++)
        class_probs[ci] /= scores_sum;
    }

    /* add in the contribution of this document.  For training docs,
       only count the contribution of their class, since the class
       label is known. */
    if (doc_cdoc->type != bow_doc_train
        || (doc_cdoc->type == bow_doc_train
            && bow_em_multi_hump_neg > 1
            && doc_cdoc->class != binary_pos_ci))
    {
      for (hi = 0; hi < num_hits_to_retrieve; hi++)
        log_prob_of_data += class_probs[hits[hi].di] * hits[hi].weight;
    }
    else
    {
      for (hi = 0; hi < num_hits_to_retrieve; hi++)
      {
        if (hits[hi].di == doc_cdoc->class)
        {
          log_prob_of_data += hits[hi].weight;
          break;
        }
      }
    }

#if 0
    if (bow_event_model == bow_event_document_then_word)
      assert (query_wv->normalizer
              == bow_event_document_then_word_document_length);
    num_data_words += query_wv->normalizer;
#endif

    /* calculate the number of words shared between the model and the
       doc */
    for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      dv = bow_wi2dvf_dv (class_barrel->wi2dvf, query_wv->entry[wvi].wi);
      if (!dv)
        continue;
      num_data_words += query_wv->entry[wvi].weight;
    }

    if (num_tested % 100 == 0)
      bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", num_tested);
    num_tested++;
  }

  bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d\n", num_tested);
  bow_verbosify (bow_progress, "Docs = %d, Words = %f, l(data) = %f\n",
                 num_tested, num_data_words, log_prob_of_data);

  /* convert log prob to perplexity and return */
  bow_em_calculating_perplexity = 0;
  return exp (-log_prob_of_data / num_data_words);
}
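/* The perplexity returned above is exp of the negative average
   per-word log-likelihood, exp (-l(data) / N).  The helper below is an
   illustrative restatement of that final step, not part of the
   original em.c.  As a sanity check: a model that assigns each of the
   N words probability 1/k gives l(data) = N * log (1.0 / k), so the
   expression reduces to exactly k. */
static double
em_perplexity_from_log_prob (double log_prob_of_data, double num_data_words)
{
  return exp (-log_prob_of_data / num_data_words);
}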
/* Calculate the accuracy of the barrel on the test set */
float
em_calculate_accuracy (bow_barrel *doc_barrel, bow_barrel *class_barrel)
{
  bow_dv_heap *test_heap;      /* we'll extract test WV's from here */
  bow_wv *query_wv;
  int di;                      /* a document index */
  bow_score *hits;
  int num_hits_to_retrieve = 1;
  int actual_num_hits;
  bow_cdoc *doc_cdoc;
  int num_tested = 0;
  int num_correct = 0;

  /* Create the heap from which we'll get WV's.  Initialize QUERY_WV
     so BOW_TEST_NEXT_WV() knows not to try to free. */
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);
  test_heap = bow_test_new_heap (doc_barrel);
  query_wv = NULL;

  /* Loop once for each test document. */
  while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                 em_accuracy_docs)) != -1)
  {
    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    bow_wv_set_weights (query_wv, class_barrel);
    bow_wv_normalize_weights (query_wv, class_barrel);
    actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits,
                                        num_hits_to_retrieve,
                                        (em_accuracy_loo
                                         ? (int) doc_cdoc->class_probs
                                         : (int) NULL));
    assert (actual_num_hits == num_hits_to_retrieve);
    if (doc_cdoc->class == hits[0].di)
      num_correct++;
    num_tested++;
  }

  return (((float) num_correct) / ((float) num_tested));
}

/* Run test trials, outputting results to TEST_FP.  The results are
   intended to be read and processed by the Perl script
   ./rainbow-stats. */
void
bow_em_compare_to_nb (bow_barrel *doc_barrel)
{
  bow_dv_heap *test_heap;      /* we'll extract test WV's from here */
  bow_wv *query_wv;
  int di;                      /* a document index */
  bow_score *hits;
  int num_hits_to_retrieve = bow_barrel_num_classes (doc_barrel);
  int actual_num_hits;
  int hi;                      /* hit index */
  bow_cdoc *doc_cdoc;
  bow_cdoc *class_cdoc;
  FILE *test_fp = stdout;
  bow_barrel *class_barrel;

  /* Re-create the vector-per-class barrel in accordance with the new
     train/test settings. */
  doc_barrel->method = (rainbow_method *) bow_method_at_name ("naivebayes");
  class_barrel = bow_barrel_new_vpc_with_weights (doc_barrel);

  /* Create the heap from which we'll get WV's.  Initialize QUERY_WV
     so BOW_TEST_NEXT_WV() knows not to try to free. */
  test_heap = bow_test_new_heap (doc_barrel);
  query_wv = NULL;
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);

  fprintf (test_fp, "#0\n");

  /* Loop once for each test document. */
  while ((di = bow_test_next_wv (test_heap, doc_barrel, &query_wv)) != -1)
  {
    doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
    class_cdoc = bow_array_entry_at_index (class_barrel->cdocs,
                                           doc_cdoc->class);
    bow_wv_set_weights (query_wv, class_barrel);
    bow_wv_normalize_weights (query_wv, class_barrel);
    actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits,
                                        num_hits_to_retrieve, -1);
    assert (actual_num_hits == num_hits_to_retrieve);
    fprintf (test_fp, "%s %s ", doc_cdoc->filename,
             filename_to_classname (class_cdoc->filename));
    for (hi = 0; hi < actual_num_hits; hi++)
    {
      class_cdoc = bow_array_entry_at_index (class_barrel->cdocs,
                                             hits[hi].di);
      fprintf (test_fp, "%s:%.*g ",
               filename_to_classname (class_cdoc->filename),
               bow_score_print_precision, hits[hi].weight);
    }
    fprintf (test_fp, "\n");
  }

  bow_barrel_free (class_barrel);
  doc_barrel->method = (rainbow_method *) bow_method_at_name ("em");
}
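/* Each line written above for rainbow-stats has the form

     <doc filename> <true class> <class>:<score> <class>:<score> ...

   with one <class>:<score> pair per class, highest-scoring class
   first.  A hypothetical example line (filename, class names and
   scores are invented for illustration, not taken from a real run):

     /data/talk.religion.misc/84203 talk.religion.misc talk.religion.misc:0.91 alt.atheism:0.09
*/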
/* Function to assign `Naive Bayes'-style weights to each element of
   each document vector. */
void
bow_em_print_log_odds_ratio (bow_barrel *barrel, int num_to_print)
{
  int ci;
  bow_cdoc *cdoc;
  int wi;                      /* a "word index" into WI2DVF */
  int max_wi;                  /* the highest "word index" in WI2DVF. */
  bow_dv *dv;                  /* the "document vector" at index WI */
  int dvi;                     /* an index into the DV */
  int weight_setting_num_words = 0;
  int total_num_words = 0;
  struct lorth
  {
    int wi;
    float lor;
  } lors[barrel->cdocs->length][num_to_print];
  int wci;

  bow_error ("Can't use this while normalizer is being used for "
             "non-integral word_count");

  /* We assume that we have already called BOW_BARREL_NEW_VPC() on
     BARREL, so BARREL already has one document per class. */
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  for (ci = 0; ci < barrel->cdocs->length; ci++)
    for (wci = 0; wci < num_to_print; wci++)
    {
      lors[ci][wci].lor = 0.0;
      lors[ci][wci].wi = -1;
    }

  /* assume that word_count, normalizer are already set */

  /* Calculate the total number of occurrences of each word; store
     this in DV->IDF. */
  for (wi = 0; wi < max_wi; wi++)
  {
    dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
    if (dv == NULL)
      continue;
    dv->idf = 0;
    for (dvi = 0; dvi < dv->length; dvi++)
    {
      /* Is cdoc used for anything? - Jason */
      cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di);
      total_num_words += dv->entry[dvi].weight;
      dv->idf += dv->entry[dvi].weight;
    }
  }

  bow_verbosify (bow_progress, "Calculating word weights: ");

  /* Set the weights in the BARREL's WI2DVF so that they are equal to
     P(w|C), the probability of a word given a class. */
  for (wi = 0; wi < max_wi; wi++)
  {
    double pr_w = 0.0;

    dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

    if (wi % 100 == 0)
      bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", wi);

    /* If the model doesn't know about this word, skip it. */
    if (dv == NULL)
      continue;

    pr_w = ((double) dv->idf) / total_num_words;

    /* Now loop through all the elements, setting their weights */
    for (dvi = 0; dvi < dv->length; dvi++)
    {
      double pr_w_c;
      double pr_w_not_c;
      double log_likelihood_ratio;

      cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di);

      /* Here CDOC->WORD_COUNT is the total number of words in the
         class.  We use Laplace estimation.  (The first assignment is
         immediately overwritten by the second.) */
      pr_w_c = ((double) dv->entry[dvi].weight
                / (cdoc->word_count + cdoc->normalizer));
      pr_w_c = (((double) dv->entry[dvi].weight + 1)
                / (cdoc->word_count + barrel->wi2dvf->num_words));
      /* The original listing breaks off in the middle of the next
         statement; the continuation below is a plausible
         reconstruction, assuming the same add-one smoothing applied
         over the complement classes, and is not verified against the
         original file. */
      pr_w_not_c = ((dv->idf - dv->entry[dvi].weight
                     + barrel->cdocs->length - 1)
                    / (total_num_words - cdoc->word_count
                       + (barrel->wi2dvf->num_words
                          * (barrel->cdocs->length - 1))));
      log_likelihood_ratio = log (pr_w_c / pr_w_not_c);
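/* A worked instance of the add-one (Laplace) estimate used above,
   P(w|c) = (N(w,c) + 1) / (N(c) + |V|), with invented numbers: a word
   occurring N(w,c) = 3 times in a class of N(c) = 100 words, under a
   vocabulary of |V| = 900 words, gets P(w|c) = (3 + 1) / (100 + 900)
   = 0.004.  The quantity this function ranks words by is the log odds
   ratio log (P(w|c) / P(w|not c)), which is largest for words that
   are strong evidence for class c. */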