em.c
        counts[n % bow_em_multi_hump_neg]++;

        /* reassign the negative docs */
        for (di = 0; di < doc_barrel->cdocs->length; di++) {
          bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
          int new_class;

          if (cdoc->type != bow_doc_train || cdoc->class == binary_pos_ci)
            continue;
          assert (yet_to_find > 0);

          /* find a new class */
          for (new_class = rand () % bow_em_multi_hump_neg;
               counts[new_class] == 0;
               new_class = rand () % bow_em_multi_hump_neg);
          yet_to_find--;
          counts[new_class]--;

          /* assign it to the right hump */
          if (new_class != 0) {
            cdoc->class_probs[new_class + 1] = 1.0;
            cdoc->class_probs[binary_neg_ci] = 0.0;
          }
        }
        assert (yet_to_find == 0);
      } else if (em_multi_hump_init == bow_em_init_spread) {
        bow_random_set_seed ();

        /* spread each negative doc randomly over neg components */
        for (di = 0; di < doc_barrel->cdocs->length; di++) {
          bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
          float total = 0;

          if (cdoc->type != bow_doc_train || cdoc->class == binary_pos_ci)
            continue;
          for (ci = 0; ci < max_new_ci; ci++) {
            if (ci == binary_pos_ci) {
              cdoc->class_probs[ci] = 0.0;
            } else {
              cdoc->class_probs[ci] = (float) (rand () % 100) + 1;
              total += cdoc->class_probs[ci];
            }
          }
          for (ci = 0; ci < max_new_ci; ci++)
            cdoc->class_probs[ci] /= total;
        }
      } else
        bow_error ("No initialization for this type");
    }

    /* set priors using just the known docs if we'll need them for
       setting class_probs */
    if (em_unlabeled_start == em_start_prior) {
      assert (num_train_docs > 0);
      assert (!bow_uniform_class_priors);
      (*doc_barrel->method->vpc_set_priors) (vpc_barrel, doc_barrel);
    } else {
      for (ci = 0; ci < max_new_ci; ci++) {
        bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
        cdoc->prior = 0.0;
      }
    }

    /* set the class probs of all the unlabeled docs to determine the
       EM starting point */
    for (di = 0; di < doc_barrel->cdocs->length; di++) {
      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);

      if (cdoc->type != bow_doc_unlabeled)
        continue;

      if (em_unlabeled_start == em_start_zero) {
        /* set class_probs as all zeros (ignore them for first M step) */
        for (ci = 0; ci < max_new_ci; ci++)
          cdoc->class_probs[ci] = 0.0;
      } else if (em_unlabeled_start == em_start_random) {
        float total = 0;

        /* if there are no labeled docs, randomly assign class probs */
        bow_random_set_seed ();
        for (ci = 0; ci < max_new_ci; ci++) {
          cdoc->class_probs[ci] = (float) (rand () % 100);
          total += cdoc->class_probs[ci];
        }
        for (ci = 0; ci < max_new_ci; ci++)
          cdoc->class_probs[ci] *= unlabeled_normalizer / total;
      } else if (em_unlabeled_start == em_start_prior) {
        /* distribute class_probs according to priors on just the known */
        assert (!bow_em_multi_hump_neg && !bow_uniform_class_priors);
        assert (num_train_docs > 0);
        for (ci = 0; ci < max_new_ci; ci++) {
          bow_cdoc *class_cdoc =
            bow_array_entry_at_index (vpc_barrel->cdocs, ci);
          cdoc->class_probs[ci] = class_cdoc->prior * unlabeled_normalizer;
        }
      } else if (em_unlabeled_start == em_start_even) {
        /* distribute class_probs evenly across all classes */
        for (ci = 0; ci < max_new_ci; ci++)
          cdoc->class_probs[ci] =
            unlabeled_normalizer / bow_barrel_num_classes (vpc_barrel);
      } else
        bow_error ("No such value for em_unlabeled_start");
    }
  }
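  /* The loop below alternates M-steps and E-steps.  It halts on one of
     four conditions, depending on the options in force: when annealing,
     it runs until em_temperature cools below 1.0; with
     perplexity-based halting, until validation-set perplexity stops
     dropping by more than 0.05; with accuracy-based halting, until
     validation-set accuracy stops improving; otherwise it runs a fixed
     number of iterations, bow_em_num_em_runs. */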
  /* let's do some EM */
  while (em_anneal
         ? em_temperature >= 1.0
         : (em_halt_using_perplexity
            ? (old_perplexity > new_perplexity
               && ABS (new_perplexity - old_perplexity) > 0.05)
            : (em_halt_using_accuracy
               ? old_accuracy < new_accuracy
               : em_runs < bow_em_num_em_runs))) {
    em_runs++;

    /* the M-step */
    bow_verbosify (bow_progress, "Making class barrel by counting words: ");

    if (vpc_barrel->wi2dvf != NULL)
      bow_wi2dvf_free (vpc_barrel->wi2dvf);

#if 0
    /* save the previous wi2dvf */
    if (prev_wi2dvf != NULL)
      bow_wi2dvf_free (prev_wi2dvf);
    prev_wi2dvf = vpc_barrel->wi2dvf;
    for (ci = 0; ci < max_new_ci; ci++) {
      bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      prev_priors[ci] = cdoc->prior;
      prev_word_counts[ci] = cdoc->word_count;
      prev_normalizers[ci] = cdoc->normalizer;
    }
#endif

    /* get a new wi2dvf structure for our class barrel */
    vpc_barrel->wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size);

    /* Initialize the WI2DVF part of the VPC_BARREL.  Sum together the
       counts and weights for individual documents, grabbing only the
       training and unlabeled documents. */
    for (wi = 0; wi < max_wi; wi++) {
      dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
      if (!dv)
        continue;

#if 0
      /* create the dv in the class barrel if there's an entry in the
         doc barrel.  This ensures that perplexity calculations happen
         correctly. */
      vpc_barrel->wi2dvf->entry[wi].dv = bow_dv_new (0);
      vpc_barrel->wi2dvf->entry[wi].seek_start = 2;
      (vpc_barrel->wi2dvf->num_words)++;
#endif

      for (dvi = 0; dvi < dv->length; dvi++) {
        bow_cdoc *cdoc;

        di = dv->entry[dvi].di;
        cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
        if (cdoc->type == bow_doc_train || cdoc->type == bow_doc_unlabeled) {
          assert (cdoc->word_count > 0);
          for (ci = 0; ci < max_new_ci; ci++) {
            /* it's important to do this even when class_prob is 0 to
               ensure that perplexity calculations happen ok. */
#if 0
            if (cdoc->class_probs[ci] > 0)
#endif
            {
              if (bow_event_model == bow_event_document_then_word)
                bow_wi2dvf_add_wi_di_count_weight
                  (&(vpc_barrel->wi2dvf), wi, ci,
                   1, /* hopelessly dummy value */
                   (cdoc->class_probs[ci]
                    * (float) dv->entry[dvi].count
                    * (float) bow_event_document_then_word_document_length
                    / (float) cdoc->word_count));
              else if (bow_event_model == bow_event_word) {
                float addition = (cdoc->class_probs[ci]
                                  * (float) dv->entry[dvi].count);
                bow_wi2dvf_add_wi_di_count_weight
                  (&(vpc_barrel->wi2dvf), wi, ci,
                   1, /* hopelessly dummy value */
                   addition);
              } else
                bow_error ("No implementation of this event model.");
            }
          }
        }
      }
      if (wi % 100 == 0)
        bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", max_wi - wi);
    }
    bow_verbosify (bow_progress, "\n");

    /* set the dv->idf, normalizer and word_count */
    bow_em_set_weights (vpc_barrel);
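    /* Conceptually, the accumulation above plus bow_em_set_weights is
       the naive Bayes M-step.  For the word event model, each class's
       count for word w is
           N(w,c) = sum over train/unlabeled docs d of P(c|d) * N(w,d),
       where P(c|d) is cdoc->class_probs[ci] (down-weighted by
       unlabeled_normalizer for unlabeled documents).  The per-class
       word distributions are then renormalized from these fractional
       counts; any smoothing is applied inside bow_em_set_weights, which
       is not shown in this excerpt. */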
    /* set priors */
    if (doc_barrel->method->vpc_set_priors && !bow_uniform_class_priors)
      (*doc_barrel->method->vpc_set_priors) (vpc_barrel, doc_barrel);

    /* If on the first EM run, and doing perturbed starting points
       (e.g. for active learning), then perturb the weights using the
       variance. */
    if (em_runs == 1 && bow_em_perturb_starting_point)
      bow_em_perturb_weights (doc_barrel, vpc_barrel);

    /* print top words by class */
    if (bow_em_print_word_vector)
      bow_em_print_log_odds_ratio (vpc_barrel, 20);

    /* Print the P(C|w) distribution to a file so that we can later
       calculate the KL-divergence between the current distribution and
       the "correct" distribution. */
    if (bow_em_print_probs)
      bow_em_print_word_distribution (vpc_barrel, em_runs,
                                      bow_barrel_num_classes (vpc_barrel));

    /* if we're ignoring the labeled data during the iterations, then
       zero out their class probs now */
    if (em_runs == 1 && em_labeled_for_start_only) {
      for (di = 0; di < doc_barrel->cdocs->length; di++) {
        bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
        if (cdoc->type == bow_doc_train)
          cdoc->class_probs[cdoc->class] = 0.0;
      }
    }

    /* OK.  We're done with our M-step.  We have a new vpc barrel to
       use.  Let's now do the E-step, and classify all our documents. */

    /* Calculate perplexity of the validation set for halting check */
    if (em_perplexity_docs) {
      old_perplexity = new_perplexity;
      new_perplexity = em_calculate_perplexity (doc_barrel, vpc_barrel);
      bow_verbosify (bow_progress, "Perplexity = %f\n", new_perplexity);
    }

    /* Calculate accuracy of the validation set for halting check */
    if (em_accuracy_docs) {
      old_accuracy = new_accuracy;
      new_accuracy = em_calculate_accuracy (doc_barrel, vpc_barrel);
      bow_verbosify (bow_progress, "Correct: %f\n", new_accuracy);
    }

    /* adjust the normalizer if we're annealing it */
    if (bow_em_anneal_normalizer) {
      float new_unlabeled_fraction;

      total_weight = ((float) num_train_docs)
        + (unlabeled_normalizer * (float) num_unlabeled_docs);
      labeled_weight_fraction = (float) num_train_docs / total_weight;

      /* increase the weight of the unlabeled data by a factor of 1.1,
         unless it's the first round; then bump it away from zero
         slightly */
      if (labeled_weight_fraction == 1.0) {
        new_labeled_fraction = 0.98;
        new_unlabeled_fraction = 0.02;
      } else {
        new_unlabeled_fraction = 1.1 * (1.0 - labeled_weight_fraction);
        new_labeled_fraction = 1.0 - new_unlabeled_fraction;
      }
      unlabeled_normalizer = ((num_train_docs / new_labeled_fraction)
                              - num_train_docs) / num_unlabeled_docs;

      /* halt normalizer annealing when one labeled document weighs the
         same as one unlabeled document */
      if (new_unlabeled_fraction >= 1.0 || unlabeled_normalizer >= 1.0) {
        unlabeled_normalizer = 1.0;
        bow_em_anneal_normalizer = 0;
        em_runs = 1;
      }
      assert (unlabeled_normalizer >= 0 && unlabeled_normalizer <= 1);
      bow_verbosify (bow_progress,
                     "Updating total labeled weight to %f (normalizer = %f).\n",
                     new_labeled_fraction, unlabeled_normalizer);
    }

    /* only do the E-step if not the last round */
    if (em_anneal
        ? 1
        : (em_halt_using_perplexity
           ? (old_perplexity > new_perplexity
              && ABS (new_perplexity - old_perplexity) > 0.05)
           : (em_halt_using_accuracy
              ? old_accuracy < new_accuracy
              : em_runs < bow_em_num_em_runs))) {
      /* now classify the unknown documents */
      bow_verbosify (bow_progress, "\nClassifying unlabeled documents: ");

      /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to
         free.  Create the heap from which we'll get WV's. */
      query_wv = NULL;
      hits = alloca (sizeof (bow_score) * max_new_ci);
      num_tested = 0;
      test_heap = bow_test_new_heap (doc_barrel);
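      /* E-step proper: bow_barrel_score ranks all max_new_ci classes
         for each document pulled off the heap.  Depending on
         em_stat_method, class_probs then become either a hard
         assignment to the single best class ("simple") or the full
         posterior from the naive Bayes scores ("nb_score"), in both
         cases scaled by unlabeled_normalizer so an unlabeled document
         can count for less than a labeled one. */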
      /* Loop once for each unlabeled document. */
      while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                     bow_cdoc_next_em_doc)) != -1) {
        doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
        bow_wv_set_weights (query_wv, vpc_barrel);
        bow_wv_normalize_weights (query_wv, vpc_barrel);
        actual_num_hits = bow_barrel_score (vpc_barrel, query_wv, hits,
                                            max_new_ci, (int) NULL);
        assert (actual_num_hits == max_new_ci);

        if (em_stat_method == simple) {
          /* set the class probs to 1 for the maximally likely class */
          for (ci = 0; ci < max_new_ci; ci++)
            doc_cdoc->class_probs[ci] = 0.0;
          doc_cdoc->class_probs[hits[0].di] = unlabeled_normalizer;
        } else if (em_stat_method == nb_score) {
          /* set the class probs to the naive Bayes score */
          for (hi = 0; hi < actual_num_hits; hi++)
            doc_cdoc->class_probs[hits[hi].di] =
              unlabeled_normalizer * hits[hi].weight;

          /* this is a neg training doc.  Zero out the pos component. */
          if (bow_em_multi_hump_neg > 1 && doc_cdoc->type == bow_doc_train) {
            double new_total = 0;

            doc_cdoc->class_probs[binary_pos_ci] = 0;
            for (ci = 0; ci < max_new_ci; ci++)
              new_total += doc_cdoc->class_probs[ci];
            if (new_total != 0) {
              for (ci = 0; ci < max_new_ci; ci++)
                doc_cdoc->class_probs[ci] =
                  unlabeled_normalizer * doc_cdoc->class_probs[ci] / new_total;
            } else {
              /* blech.  we got hosed on roundoff. */
              for (ci = 0; ci < max_new_ci; ci++)
                doc_cdoc->class_probs[ci] =
                  (float) unlabeled_normalizer / ((float) max_new_ci - 1.0);
              doc_cdoc->class_probs[binary_pos_ci] = 0;