
em.c

A text-classification system developed by McCallum at Carnegie Mellon University.
Language: C
Page 1 of 5
                    / (total_num_words - cdoc->word_count
                       + (barrel->wi2dvf->num_words
                          * (barrel->cdocs->length - 1))));
          log_likelihood_ratio = log (pr_w_c / pr_w_not_c);
          wci = num_to_print - 1;
          while (wci >= 0
                 && (lors[dv->entry[dvi].di][wci].lor
                     < pr_w_c * log_likelihood_ratio))
            wci--;
          if (wci < num_to_print - 1)
            {
              int new_wci = wci + 1;
              /* Shift lower-scoring entries down to make room. */
              for (wci = num_to_print - 1; wci > new_wci; wci--)
                {
                  lors[dv->entry[dvi].di][wci].lor =
                    lors[dv->entry[dvi].di][wci - 1].lor;
                  lors[dv->entry[dvi].di][wci].wi =
                    lors[dv->entry[dvi].di][wci - 1].wi;
                }
              lors[dv->entry[dvi].di][new_wci].lor =
                pr_w_c * log_likelihood_ratio;
              lors[dv->entry[dvi].di][new_wci].wi = wi;
            }
        }
      weight_setting_num_words++;
      /* Set the IDF.  KL doesn't use it; make it have no effect. */
      dv->idf = 1.0;
    }

  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      bow_verbosify (bow_progress, "\n%s\n",
                     filename_to_classname (cdoc->filename));
      for (wci = 0; wci < num_to_print; wci++)
        fprintf (stderr, "%1.4f %s\n", lors[ci][wci].lor,
                 bow_int2word (lors[ci][wci].wi));
    }
}

/* Print the P(C|w) distribution to a file so that we can later
   calculate the KL-divergence between the current distribution
   and the "correct" distribution. */
void
bow_em_print_word_distribution (bow_barrel *vpc_barrel, int em_runs,
                                int num_classes)
{
  char filename[1024];
  FILE *fp;
  const char *word;
  int wi;
  bow_dv *dv;
  int c;                        /* a class index */
  float total_word_count;
  int dvi;

  /* Open the file. */
  sprintf (filename, "pcw%02d", em_runs);
  fp = bow_fopen (filename, "w");

  /* Print the distribution for each word in the VPC_BARREL. */
  for (wi = 0; wi < vpc_barrel->wi2dvf->size; wi++)
    {
      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
      if (!dv)
        continue;
      word = bow_int2word (wi);
      fprintf (fp, "%s ", word);
      total_word_count = 0;
      for (dvi = 0; dvi < dv->length; dvi++)
        total_word_count += dv->entry[dvi].weight;
      /* Print the probability for each class; don't smooth.
         (Check DVI's bounds before dereferencing the entry.) */
      for (c = 0, dvi = 0; c < num_classes; c++)
        {
          while (dvi < dv->length && dv->entry[dvi].di < c)
            dvi++;
          if (dvi < dv->length && dv->entry[dvi].di == c)
            fprintf (fp, "%g ",
                     dv->entry[dvi].weight / total_word_count);
          else
            fprintf (fp, "0 ");
        }
      fprintf (fp, "\n");
    }
  fclose (fp);
}
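/* Illustrative sketch (not part of libbow; the name is hypothetical):
   the "pcwNN" file written above exists so that two P(C|w) rows can
   later be compared by KL-divergence,
   KL(p||q) = sum_c p[c] * log (p[c] / q[c]).
   Assumes both rows are already parsed into arrays that each sum to
   one, and that q[c] > 0 wherever p[c] > 0; zero entries of P
   contribute nothing.  Uses log() from <math.h>, which this file
   already requires. */
static double
example_kl_divergence (const double *p, const double *q, int num_classes)
{
  int c;
  double kl = 0.0;

  for (c = 0; c < num_classes; c++)
    if (p[c] > 0)
      kl += p[c] * log (p[c] / q[c]);
  return kl;
}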
/* Set the class prior probabilities by counting the number of
   documents of each class.  Note that this counts all training and
   unlabeled docs, and that we're doing an m-estimate by starting
   each class out with one pseudo-document. */
void
bow_em_set_priors_using_class_probs (bow_barrel *vpc_barrel,
                                     bow_barrel *doc_barrel)
{
  float prior_sum = 0;
  int ci;
  int max_ci = vpc_barrel->cdocs->length - 1;
  int di;

  /* Initialize each prior to the m-estimate pseudo-count of one. */
  for (ci = 0; ci <= max_ci; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      cdoc->prior = 1;
    }
  /* prior_sum = max_ci; */

  /* Add in the document class_probs. */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    {
      bow_cdoc *doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
      bow_cdoc *vpc_cdoc;

      if (doc_cdoc->type == bow_doc_train
          || doc_cdoc->type == bow_doc_unlabeled)
        {
          /* Note that class probs correspond to CLASS barrel class
             indices. */
          for (ci = 0; ci <= max_ci; ci++)
            {
              vpc_cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
              vpc_cdoc->prior += doc_cdoc->class_probs[ci];
            }
        }
    }

  /* Sum them all. */
  for (ci = 0; ci <= max_ci; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      assert (cdoc->prior == cdoc->prior);      /* fails only on NaN */
      prior_sum += cdoc->prior;
    }

  /* Normalize to set the prior. */
  for (ci = 0; ci <= max_ci; ci++)
    {
      bow_cdoc *cdoc;
      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      if (prior_sum != 0)
        cdoc->prior /= prior_sum;
      else
        /* Uniform fallback over all MAX_CI+1 classes. */
        cdoc->prior = 1.0 / (float) (max_ci + 1);
      assert (cdoc->prior > 0.0 && cdoc->prior < 1.0);
    }
}
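/* Illustrative sketch (not part of libbow; the name and signature are
   hypothetical): the prior-setting recipe above in isolation.  Each
   class starts with a pseudo-count of one document (the m-estimate),
   accumulates the fractional class membership of every document, and
   is then normalized to sum to one. */
static void
example_set_priors (double *prior, int num_classes,
                    float **class_probs, int num_docs)
{
  int ci, di;
  double sum = 0.0;

  for (ci = 0; ci < num_classes; ci++)
    prior[ci] = 1.0;                    /* one pseudo-document per class */
  for (di = 0; di < num_docs; di++)
    for (ci = 0; ci < num_classes; ci++)
      prior[ci] += class_probs[di][ci]; /* fractional membership */
  for (ci = 0; ci < num_classes; ci++)
    sum += prior[ci];
  for (ci = 0; ci < num_classes; ci++)
    prior[ci] /= sum;                   /* now a proper distribution */
}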
/* Return the probability of word WI in class CI.
   If LOO_CLASS_PROBS is not NULL, then we are doing
   leave-one-out document evaluation; LOO_CLASS_PROBS are the probs
   of the classes from which the document has been removed.
   LOO_WI_COUNT is the number of WI'th words in the document.
   LOO_W_COUNT is the total number of words in the document.
   The last two arguments help this function avoid searching for the
   right entry in the DV from the beginning each time: LAST_DV is a
   pointer to the DV to use, and LAST_DVI is a pointer to the index
   into LAST_DV that is guaranteed to have a class index less than
   CI. */
double
bow_em_pr_wi_ci (bow_barrel *barrel,
                 int wi, int ci,
                 float *loo_class_probs,
                 float loo_wi_count, float loo_w_count,
                 bow_dv **last_dv, int *last_dvi)
{
  bow_dv *dv;
  bow_cdoc *cdoc;
  float num_wi_ci;      /* the number of times WI occurs in the class */
  float num_w_ci;       /* the number of words in the class */
  int dvi;
  double m_est_m;
  double m_est_p;
  double pr_w_c;

  cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
  if (last_dv && *last_dv)
    {
      dv = *last_dv;
      dvi = *last_dvi;
      /* No, not always true: assert (dv->entry[dvi].di <= ci); */
    }
  else
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      dvi = 0;
      if (last_dv)
        *last_dv = dv;
    }

  /* If the model doesn't know about this word, return -1. */
  if (!dv)
    return -1.0;

  /* Find the index of the entry for this class. */
  while (dvi < dv->length && dv->entry[dvi].di < ci)
    dvi++;
  /* Remember this index value for future calls to this function. */
  if (last_dvi)
    *last_dvi = dvi;

  if (dvi < dv->length && dv->entry[dvi].di == ci)
    {
      /* There is an entry in DV for class CI. */
      num_wi_ci = dv->entry[dvi].weight;
    }
  else
    {
      /* There is no entry in DV for class CI. */
      num_wi_ci = 0;
      if (loo_class_probs && loo_class_probs[ci] > 0)
        bow_error ("There should be data for WI,CI");
    }
  num_w_ci = cdoc->normalizer;
  assert (num_wi_ci >= 0 && num_w_ci >= 0);

  /* For leave-one-out evaluation, subtract the held-out document's
     fractional counts from the class totals. */
  if (loo_class_probs != NULL && loo_class_probs[ci] > 0)
    {
      float reduction;

      reduction = loo_class_probs[ci] * loo_wi_count;
      num_wi_ci -= reduction;
      reduction = loo_class_probs[ci] * loo_w_count;
      num_w_ci -= reduction;
      /* Be a little flexible with round-off error; floats hold only
         about seven significant digits. */
#if 1
      if (num_wi_ci < 0 && num_wi_ci >= -0.00001)
        num_wi_ci = 0;
      if (num_w_ci < 0 && num_w_ci >= -0.01)
        num_w_ci = 0;
#endif
      if (!(num_wi_ci >= 0 && num_w_ci >= 0))
        bow_error ("foo %g %g\n", num_wi_ci, num_w_ci);
    }

  if (bow_event_model == bow_event_document)
    {
      /* This corresponds to adding two training pseudo-data points:
         one that has all features, and one that has no features. */
      pr_w_c = (num_wi_ci + 1) / (num_w_ci + 2);
    }
  else if (bow_smoothing_method == bow_smoothing_laplace
           || bow_smoothing_method == bow_smoothing_mestimate)
    {
      /* xxx This is not exactly right, because
         BARREL->WI2DVF->NUM_WORDS might have changed with the
         removal of QUERY_WV's document. */
      if (naivebayes_argp_m_est_m == 0
          || bow_smoothing_method == bow_smoothing_laplace)
        m_est_m = barrel->wi2dvf->num_words;
      else
        m_est_m = naivebayes_argp_m_est_m;
      m_est_p = 1.0 / barrel->wi2dvf->num_words;
      pr_w_c = ((num_wi_ci + m_est_m * m_est_p)
                / (num_w_ci + m_est_m));
    }
  else if (bow_smoothing_method == bow_smoothing_wittenbell)
    {
      bow_error ("Can't use WittenBell while normalizer is "
                 "word_count substitute");
      /* Here CDOC->NORMALIZER is the number of unique terms in the
         class. */
      if (num_wi_ci > 0)
        pr_w_c = num_wi_ci / (num_w_ci + cdoc->normalizer);
      else
        {
          if (cdoc->word_count)
            /* There is training data for this class. */
            pr_w_c = (cdoc->normalizer
                      / ((num_w_ci + cdoc->normalizer)
                         * (barrel->wi2dvf->num_words - cdoc->normalizer)));
          else
            /* There is no training data for this class. */
            pr_w_c = 1.0 / barrel->wi2dvf->num_words;
        }
    }
  else if (bow_smoothing_method == bow_smoothing_dirichlet)
    {
      pr_w_c = ((num_wi_ci + bow_naivebayes_dirichlet_alphas[wi])
                / (num_w_ci + bow_naivebayes_dirichlet_total));
    }
  else
    {
      bow_error ("EM does not implement smoothing method %d",
                 bow_smoothing_method);
      pr_w_c = 0;               /* to avoid a gcc warning */
    }

  if (pr_w_c <= 0)
    bow_error ("A negative word probability was calculated. "
               "This can happen if you are using\n"
               "--test-files-loo and the test files are "
               "not being lexed in the same way as they\n"
               "were when the model was built");
  assert (pr_w_c > 0 && pr_w_c <= 1);
  return pr_w_c;
}
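/* Illustrative sketch (not part of libbow; the name is hypothetical):
   the Laplace/m-estimate smoothing branch above in isolation.  With
   m equal to the vocabulary size and p = 1/vocabulary_size, this
   reduces to Laplace smoothing,
   P(w|c) = (N(w,c) + 1) / (N(c) + |V|). */
static double
example_m_estimate (double num_wi_ci, double num_w_ci,
                    double m, double vocab_size)
{
  double p = 1.0 / vocab_size;  /* uniform prior over the vocabulary */
  return (num_wi_ci + m * p) / (num_w_ci + m);
}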
/* Set the DV->IDF, NORMALIZER and WORD_COUNT. */
/* Function to assign `Naive Bayes'-style weights to each element of
   each document vector. */
void
bow_em_set_weights (bow_barrel *barrel)
{
  int ci;
  bow_cdoc *cdoc;
  int wi;               /* a "word index" into WI2DVF */
  int max_wi;           /* the highest "word index" in WI2DVF */
  bow_dv *dv;           /* the "document vector" at index WI */
  int dvi;              /* an index into the DV */
  int weight_setting_num_words = 0;
  double *pr_all_w_c = alloca (barrel->cdocs->length * sizeof (double));
  double pr_w_c;
  double total_num_words = 0;
  /* Gather the word counts here instead of directly in
     CDOC->WORD_COUNT so we avoid round-off error with each
     increment.  Remember, CDOC->WORD_COUNT is an int! */
  float num_words_per_ci[200];
  int barrel_is_empty = 0;

  assert (bow_barrel_num_classes (barrel) < 200);

  /* We assume that we have already called BOW_BARREL_NEW_VPC() on
     BARREL, so BARREL already has one document per class. */
#if 0
  assert (!strcmp (barrel->method->name, "naivebayes")
          || !strcmp (barrel->method->name, "crossentropy")
          || !strcmp (barrel->method->name, "active"));
#endif

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* The CDOC->PRIOR should have been set in bow_barrel_new_vpc();
     verify it. */
  /* Get the total number of unique terms in each class; store this
     in CDOC->NORMALIZER. */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      assert (cdoc->prior >= 0);
      pr_all_w_c[ci] = 0;
      cdoc->normalizer = 0;
      num_words_per_ci[ci] = 0;
    }

  /* If we are using a document (binomial) model, then we'll just use
     the value of WORD_COUNT set in bow_barrel_new_vpc(), which is
     the total number of *documents* in the class, not the number of
     words. */

  /* Calculate P(w); store this in DV->IDF. */
  if (bow_event_model != bow_event_document)
    {
      /* Get the total number of terms in each class; store this in
         CDOC->NORMALIZER as a non-integral value. */
      /* No longer done: calculate the total number of unique words,
         and make sure it is the same as BARREL->WI2DVF->NUM_WORDS. */
      int num_unique_words = 0;

      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          num_unique_words++;
          dv->idf = 0.0;
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              cdoc = bow_array_entry_at_index (barrel->cdocs,
                                               dv->entry[dvi].di);
              ci = dv->entry[dvi].di;
              num_words_per_ci[ci] += dv->entry[dvi].weight;
#if 0
              /* Inactive while NORMALIZER is a WORD_COUNT substitute. */
              cdoc->normalizer++;
#endif
              dv->idf += dv->entry[dvi].weight;
              total_num_words += dv->entry[dvi].weight;
            }
        }
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
          cdoc->normalizer = num_words_per_ci[ci];
#if 0
          cdoc->word_count = (int) rint (num_words_per_ci[ci]);
#endif
        }
      assert (num_unique_words == barrel->wi2dvf->num_words);

      /* Normalize the DV->IDF to sum to one across all words, so it
         is P(w). */
      if (total_num_words)
        {
          for (wi = 0; wi < max_wi; wi++)
            {
              dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
              if (dv == NULL)
                continue;
              dv->idf /= total_num_words;
            }
        }
      else
        {
          barrel_is_empty = 1;
          bow_verbosify (bow_progress, "Zero words in class barrel\n");
        }
    }
#if 0
  /* Initialize Good-Turing smoothing. */
  if (bow_smoothing_method == bow_smoo
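/* Illustrative sketch (not part of libbow; the name is hypothetical):
   the P(w) computation that bow_em_set_weights stores in DV->IDF, in
   isolation -- accumulate a count per word, then divide by the grand
   total so the values sum to one across the vocabulary. */
static void
example_normalize_word_counts (double *count, int num_words)
{
  int wi;
  double total = 0.0;

  for (wi = 0; wi < num_words; wi++)
    total += count[wi];
  if (total > 0)                        /* guard against an empty barrel */
    for (wi = 0; wi < num_words; wi++)
      count[wi] /= total;               /* count[wi] is now P(w = wi) */
}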
