📄 em.c

📁 卡内基梅隆大学MaCallum开发的文本分类系统
💻 C
📖 第 1 页 / 共 5 页
字号:
doublebow_gamma_distribution (int ia){  int j;  double am, e, s, v1, v2, x, y;  assert (ia >= 1) ;  if (ia < 6)     {      x = 1.0;      for (j = 1; j <= ia; j++)	x *= bow_random_01 ();      x = - log (x);    }  else    {      do	{	  do	    {	      do		{		  v1 = 2.0 * bow_random_01 () - 1.0;		  v2 = 2.0 * bow_random_01 () - 1.0;		}	      while (v1 * v1 + v2 * v2 > 1.0);	      y = v2 / v1;	      am = ia - 1;	      s = sqrt (2.0 * am + 1.0);	      x = s * y + am;	    }	  while (x <= 0.0);	  e = (1.0 + y * y) * exp (am * log (x/am) - s * y);	}      while (bow_random_01 () > e);    }  return x;}/* Change the weights by sampling from the multinomial distribution   specified by the training data.  Start from the current values of   the DV WEIGHTS.  Typically this would be called after iteration 1   of EM, before the unlabeled documents were included in the   WEIGHTS. */voidbow_em_perturb_weights (bow_barrel *doc_barrel, bow_barrel *vpc_barrel){  double variance;  double num_words_per_ci[bow_barrel_num_classes (vpc_barrel)];  int ci, wi, dvi, max_wi;  bow_dv *dv;  bow_cdoc *cdoc;  double pr_w_c;  if (bow_em_perturb_starting_point == bow_em_perturb_none)    return;  bow_random_set_seed ();  max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ());  /* Perturb the counts (which are stored in WEIGHT) */  for (wi = 0; wi < max_wi; wi++)    {      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);      if (!dv)	continue;      for (dvi = 0; dvi < dv->length; dvi++)	{	  /* WEIGHT can be zero if the prob of a class for the doc	     that had this word was zero */	  if (bow_em_perturb_starting_point 	      == bow_em_perturb_with_gaussian)	    {	      if (0 != dv->entry[dvi].weight)		{		  cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, 						   dv->entry[dvi].di);		  pr_w_c = dv->entry[dvi].weight / cdoc->normalizer;		  variance = cdoc->normalizer * pr_w_c * (1 - pr_w_c);		  dv->entry[dvi].weight =		    bow_em_gaussian (dv->entry[dvi].weight, variance);		  if (dv->entry[dvi].weight < 0)		    dv->entry[dvi].weight = 0;		}	    }	  else if (bow_em_perturb_starting_point		   == bow_em_perturb_with_dirichlet)	    {	      dv->entry[dvi].weight =		bow_gamma_distribution (dv->entry[dvi].weight + 1);	      /* The +1 is assuming we are using LaPlace smoothing */	      /* xxx I hope that we are still multiplying weights by		 200 (for a length 200 document), otherwise weight		 will always get rounded down into nothing, because		 bow_gamma_distribution only takes int's */	    }	}    }  /* Reset the CDOC->WORD_COUNT and CDOC->NORMALIZER */  for (ci = 0; ci < bow_barrel_num_classes (vpc_barrel); ci++)    {      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);      cdoc->normalizer = 0;      num_words_per_ci[ci] = 0;    }  for (wi = 0; wi < max_wi; wi++)    {      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);      if (!dv)	continue;      for (dvi = 0; dvi < dv->length; dvi++)	{	  /* WEIGHT can be zero if the prob of a class for the doc	     that had this word was zero */	  if (0 != dv->entry[dvi].weight)	    {	      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, 					       dv->entry[dvi].di);	      num_words_per_ci[dv->entry[dvi].di] += 		dv->entry[dvi].weight;#if 0	      /* Now using normalizer for non-int word_count */	      cdoc->normalizer++;#endif	    }	}    }  for (ci = 0; ci < bow_barrel_num_classes (vpc_barrel); ci++)    {      bow_cdoc *cdoc =	bow_array_entry_at_index (vpc_barrel->cdocs, ci);      cdoc->normalizer = num_words_per_ci[ci];#if 0      cdoc->word_count = (int) rint(num_words_per_ci[ci]);      assert (cdoc->word_count >= 0);#endif      assert (cdoc->normalizer >= 0);    }}  /* Create a class barrel with EM-style clustering on unlabeled   docs */bow_barrel *bow_em_new_vpc_with_weights (bow_barrel *doc_barrel){  bow_barrel *vpc_barrel;   /* the vector-per-class barrel */  int wi;                   /* word index */  int max_wi;               /* max word index */  int dvi;                  /* document vector index */  int ci;                   /* class index */  bow_dv *dv;               /* document vector */  int di;                   /* document index */  int binary_neg_ci = -1;  bow_dv_heap *test_heap=NULL;	/* we'll extract test WV's from here */  bow_wv *query_wv;  bow_score *hits;  int actual_num_hits;  int hi;		     /* hit index */  bow_cdoc *doc_cdoc;  int num_tested;  int em_runs = 0;  int num_train_docs = 0;  int num_unlabeled_docs = 0;  int max_new_ci;  int max_old_ci;  int (* bow_cdoc_next_em_doc)(bow_cdoc *) = bow_cdoc_is_unlabeled;  double old_perplexity = DBL_MAX;  double new_perplexity = DBL_MAX / 2;  double old_accuracy = -2;  double new_accuracy = -1;  /*bow_wi2dvf *prev_wi2dvf = NULL;*/  /*float prev_priors[200];*/  /*int prev_word_counts[200];*/  /*float prev_normalizers[200];*/  float total_weight;  float labeled_weight_fraction;  float new_labeled_fraction;	    /* some sanity checks first */  assert(200 > bow_barrel_num_classes(doc_barrel));  assert(200 > bow_em_multi_hump_neg + 1);  assert (!bow_em_multi_hump_neg || 	  (bow_em_binary_case && em_stat_method == nb_score));	    assert (!strcmp(doc_barrel->method->name, "em") ||	  !strcmp(doc_barrel->method->name, "active"));  assert (doc_barrel->classnames);  assert (!(bow_em_perturb_starting_point && em_anneal));  assert (em_stat_method == nb_score || bow_em_multi_hump_neg == 0);  assert (bow_em_multi_hump_neg == 0 || em_labeled_for_start_only == 0);  /* this option is broken */  assert (!em_halt_using_perplexity);  /* initialize some variables */  bow_em_making_barrel = 1;  if (bow_smoothing_method == bow_smoothing_dirichlet)    bow_naivebayes_load_dirichlet_alphas ();  max_old_ci = bow_barrel_num_classes(doc_barrel);  if (bow_em_multi_hump_neg)    max_new_ci = bow_em_multi_hump_neg + 1;  else     max_new_ci = max_old_ci;  if (bow_em_multi_hump_neg > 1)    bow_cdoc_next_em_doc = bow_cdoc_is_multi_hump_doc;    max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ());  /*  assert(doc_barrel->wi2dvf->size == bow_num_words ()); */  /* remove words from vocab if using only the unlabeled vocab */  if (em_set_vocab_from_unlabeled)    {      int removed = 0;      int kept = 0;      for (wi = 0; wi < max_wi; wi++)	{	  int found = 0;	  	  bow_dv *dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);	  if (!dv)	    continue;	  dvi = 0;	  while (dvi < dv->length)	    {	      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 							 dv->entry[dvi].di);	      	      if (cdoc->type == bow_doc_unlabeled)		{		  found = 1;		  break;		}	      dvi++;	    }	  	  if (!found) 	    {	      bow_wi2dvf_hide_wi (doc_barrel->wi2dvf, wi);	      removed++;	    }	  else	    kept++;	}      bow_verbosify (bow_progress, "Removed %d words using unlabeled data; %d remaining\n", 		     removed, kept);    }  /* Count the number of training and unlabeled documents */  for (di=0; di < doc_barrel->cdocs->length; di++)    {      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);	      if (cdoc->type == bow_doc_train)	num_train_docs++;      else if (cdoc->type == bow_doc_unlabeled)	num_unlabeled_docs++;    }  /* Identify the binary positive and negative class */  if (bow_em_binary_case)    {      assert (em_binary_pos_classname != NULL);      assert (em_binary_neg_classname != NULL);      for (ci = 0; ci < max_old_ci; ci++)	{	  if (em_binary_pos_classname != NULL &&	      -1 == binary_pos_ci &&	      !strcmp(em_binary_pos_classname, 		      filename_to_classname		      (bow_barrel_classname_at_index (doc_barrel, ci))))	    {	      binary_pos_ci = ci;	    }	  	  if (em_binary_neg_classname != NULL &&	      -1 == binary_neg_ci &&	      !strcmp(em_binary_neg_classname, 		      filename_to_classname		      (bow_barrel_classname_at_index (doc_barrel, ci))))	    {	      binary_neg_ci = ci;	    }	}      if (binary_pos_ci == -1)	bow_error ("No such binary positive class %s.", 		   em_binary_pos_classname);            if (binary_neg_ci == -1)	bow_error ("No such binary negative class %s.", 		   em_binary_neg_classname);    }  /* should the free function be a real one? */  /* Create an empty barrel; we fill it with vector-per-class     data and return it. */  vpc_barrel = bow_barrel_new (doc_barrel->wi2dvf->size,			       doc_barrel->cdocs->length-1,			       doc_barrel->cdocs->entry_size,			       doc_barrel->cdocs->free_func);   vpc_barrel->method = doc_barrel->method;  vpc_barrel->classnames = bow_int4str_new (0);  /* setup the cdoc structure for the class barrel, except for the     word counts and normalizer, which we'll do later.  */  for (ci = 0; ci < max_old_ci; ci++)    {      bow_cdoc cdoc;            /* create the cdoc structure */      cdoc.type = bow_doc_train;      cdoc.normalizer = -0.0f; /* just a temporary measure */      cdoc.word_count = 0; /* just a temporary measure */      cdoc.filename = strdup (bow_barrel_classname_at_index (doc_barrel, 							     ci));      bow_barrel_add_classname(vpc_barrel, cdoc.filename);      if (!cdoc.filename)	bow_error ("Memory exhausted.");      cdoc.class_probs = NULL;      cdoc.class = ci;      bow_array_append (vpc_barrel->cdocs, &cdoc);    }  /* if multi-hump, then add a cdoc for each of the other negative     humps as well */  if (bow_em_multi_hump_neg)    {      for (ci = max_old_ci; ci < max_new_ci; ci++)	{	  bow_cdoc cdoc;	  char *name = bow_malloc (sizeof (char) * 				   (strlen(em_binary_neg_classname) + 10));	  cdoc.type = bow_doc_train;	  cdoc.normalizer = 0.0f; /* just a temporary measure */	  cdoc.word_count = 0; /* just a temporary measure */	  sprintf(name, "%s%d", em_binary_neg_classname, ci);	  	  cdoc.filename = name;	  bow_barrel_add_classname(vpc_barrel, cdoc.filename);	  if (!cdoc.filename)	    bow_error ("Memory exhausted.");	  cdoc.class_probs = NULL;	  cdoc.class = ci;	  bow_array_append (vpc_barrel->cdocs, &cdoc);	}    }	    /* if we're comparing to naivebayes, do that now */  if (em_compare_to_nb == 1)    bow_em_compare_to_nb(doc_barrel);    /* Set word_count for docs correctly.  Do this after comparing to NB b/c     making a NB class barrel messes with the word counts.  */  {    /* Create the heap from which we'll get WV's. */    query_wv = NULL;    test_heap = bow_test_new_heap (doc_barrel);        /* Loop once for each document. */    while (-1 != (di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, 					 bow_cdoc_yes)))      {	int word_count = 0;	int wvi;	doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 					     di);	for (wvi = 0; wvi < query_wv->num_entries; wvi++)	  word_count += query_wv->entry[wvi].count;	doc_cdoc->word_count = word_count;      }  }  /* initialize the EM starting point */  {    /* cycle through the document barrel and make sure that each       document has a correctly initialized class_probs structure.       set class_probs of train docs.  Note that these class_probs       indexes are indexes into the NEW class indexes not the OLD       ones!*/    for (di=0; di < doc_barrel->cdocs->length; di++)      {	bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);		if (!cdoc->class_probs)	  cdoc->class_probs = bow_malloc (sizeof (float) * max_new_ci);		/* initialize the class_probs to all zeros */	for (ci=0; ci < max_new_ci; ci++)	  cdoc->class_probs[ci] = 0.0;		/* if it's a known doc, set its class_probs that way */	if (cdoc->type == bow_doc_train)	  cdoc->class_probs[cdoc->class] = 1.0;      }    /* redistribute class probs of negative docs if multi-hump */    if (bow_em_multi_hump_neg)      {	if (em_multi_hump_init == bow_em_init_spiked)	  {	    int counts[500];	    int n;	    int yet_to_find = 0; 	    assert (bow_em_multi_hump_neg < 500);	    /* Count the number of negative documents */	    for (di=0; di < doc_barrel->cdocs->length; di++)	      {		bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);				if (cdoc->class == binary_neg_ci)		  yet_to_find++;	      }	    /* set the number of docs per negative hump */	    assert(yet_to_find >= bow_em_multi_hump_neg);	    for (n=0; n < bow_em_multi_hump_neg; n++)	      counts[n] = 0;	    for (n=0; n < yet_to_find; n++)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -