maxent.c
          doc_dv->entry[doc_dvi].count;
    }

  /* Find the maximum f#, which bounds the degree of the update polynomial */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    {
      bow_cdoc *doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);

      if (doc_cdoc->type != bow_doc_train)
        continue;
      for (ci = 0; ci < max_ci; ci++)
        if (f_sharp[di][ci] > max_f_sharp)
          max_f_sharp = f_sharp[di][ci];
    }
  max_f_sharp++;

  /* Allocate space for the coefficients */
  for (ci = 0; ci < max_ci; ci++)
    coefficients[ci] = bow_malloc (sizeof (double) * max_f_sharp);

  /* Initialize the coefficients */
  for (ci = 0; ci < max_ci; ci++)
    for (fi = 0; fi < max_f_sharp; fi++)
      coefficients[ci][fi] = 0.0;

  /* Initialize the newton_poly structure */
  newton_poly = bow_malloc (sizeof (maxent_polynomial)
                            + sizeof (maxent_coefficient) * max_f_sharp);
  newton_poly->size = max_f_sharp;

  /* Let's start some maximum entropy iteration */
  while (maxent_logprob_docs
         ? new_log_prob > old_log_prob
         : (maxent_halt_accuracy_docs
            ? new_accuracy > old_accuracy
            : rounds < maxent_num_iterations))
    {
      int num_tested;

      rounds++;

      /* Classify all the training documents, and store the class
         membership probs in each document's cdoc->class_probs. */
      query_wv = NULL;
      hits = alloca (sizeof (bow_score) * max_ci);
      num_tested = 0;
      test_heap = bow_test_new_heap (doc_barrel);
      log_prob_model = 0;

      /* Loop once for each training document, scoring it and
         recording its class probs. */
      while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                     maxent_iteration_docs))
             != -1)
        {
          double *double_class_probs;

          num_tested++;
          doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
          bow_wv_set_weights (query_wv, vpc_barrel);
          bow_wv_normalize_weights (query_wv, vpc_barrel);
          actual_num_hits = bow_barrel_score (vpc_barrel, query_wv, hits,
                                              max_ci, -1);
          assert (actual_num_hits == max_ci);
          log_prob_model += log (hits[doc_cdoc->class].weight);
          double_class_probs = (double *) doc_cdoc->class_probs;

          /* Record the scores in class_probs */
          for (ci = 0; ci < max_ci; ci++)
            {
              double_class_probs[ci] = hits[ci].weight;
              assert (double_class_probs[ci]);
            }
        }

      /* Report the training log prob and the accuracy on the
         accuracy-reporting documents */
      if (maxent_accuracy_docs)
        bow_verbosify (bow_progress,
                       "%4d Training Log Prob: %f Selected Correct: %f\n",
                       rounds, log_prob_model,
                       maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                  maxent_accuracy_docs, 1));

      bow_verbosify (bow_progress, "Updating lambdas : ");

      /* Now calculate a new lambda for each word feature. */
      for (wi = 0; wi < max_wi; wi++)
        {
          constraint_dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);
          if (wi % 100 == 0)
            bow_verbosify (bow_progress, "\b\b\b\b\b\b\b%7d", wi);
          if (!constraint_dv)
            continue;
          dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
          lambda_dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
          assert (dv && lambda_dv);

          /* Collect the coefficients for all classes simultaneously */
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              bow_cdoc *cdoc;
              double *double_class_probs;

              di = dv->entry[dvi].di;
              cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
              if (cdoc->type != bow_doc_train)
                continue;
              double_class_probs = (double *) cdoc->class_probs;
              for (constraint_dvi = 0;
                   constraint_dvi < constraint_dv->length;
                   constraint_dvi++)
                {
                  ci = constraint_dv->entry[constraint_dvi].di;
                  coefficients[ci][f_sharp[di][ci]] +=
                    double_class_probs[ci] * dv->entry[dvi].weight
                    / (double) total_num_docs;
                }
            }

          /* Now update the lambdas */
          for (dvi = 0; dvi < constraint_dv->length; dvi++)
            {
              bow_cdoc *cdoc;

              /* Set the zeroth coefficient to -constraint */
              ci = constraint_dv->entry[dvi].di;
              cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);

              /* Skip classes for which there is no training data */
              if (!cdoc->word_count)
                continue;

              coefficients[ci][0] -= constraint_dv->entry[dvi].weight;

              /* Compile down the class coefficients into newton_poly */
              newton_poly->length = 0;
              for (fi = 0; fi < max_f_sharp; fi++)
                {
                  if (coefficients[ci][fi] != 0)
                    {
                      newton_poly->entry[newton_poly->length].coeff =
                        coefficients[ci][fi];
                      newton_poly->entry[newton_poly->length].power = fi;
                      newton_poly->length++;
                    }
                }
              assert (newton_poly->length > 1);

              if (maxent_gaussian_prior)
                {
                  newton_poly->entry[0].coeff +=
                    lambda_dv->entry[dvi].weight / maxent_prior_variance;
                  /* The prior term (power -1) is stored one entry past
                     `length', which is not incremented here. */
                  newton_poly->entry[newton_poly->length].power = -1;
                  newton_poly->entry[newton_poly->length].coeff =
                    1.0 / maxent_prior_variance;
                }

              /* Solve for beta using Newton's method on the
                 coefficients (see the sketch after this listing) */
              beta = maxent_newton (newton_poly);

              /* Update the right lambda */
              assert (lambda_dv->entry[dvi].di == ci);
              lambda_dv->entry[dvi].weight += log (beta);
              /* NaN check: a NaN never compares equal to itself */
              assert (lambda_dv->entry[dvi].weight
                      == lambda_dv->entry[dvi].weight);

              /* Clear out the coefficients used */
              for (fi = 0; fi < newton_poly->length; fi++)
                coefficients[ci][newton_poly->entry[fi].power] = 0.0;
            }
        }
      bow_verbosify (bow_progress, "\b\b\b\b\b\b\b%7d\n", wi);

      /* Calculate the new accuracy or log prob for the halting check */
      if (maxent_logprob_docs)
        {
          old_log_prob = new_log_prob;
          new_log_prob = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_logprob_docs, 2);
          bow_verbosify (bow_progress, "Halting Log Prob: %f\n",
                         new_log_prob);
        }
      else if (maxent_halt_accuracy_docs)
        {
          old_accuracy = new_accuracy;
          new_accuracy = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_halt_accuracy_docs,
                                                    1);
          bow_verbosify (bow_progress, "Halting Accuracy: %f\n",
                         new_accuracy);
        }
    }

  for (di = 0; di < doc_barrel->cdocs->length; di++)
    bow_free (f_sharp[di]);
  bow_wi2dvf_free (constraint_wi2dvf);
  bow_free (f_sharp);
  bow_free (newton_poly);
  for (ci = 0; ci < max_ci; ci++)
    bow_free (coefficients[ci]);

  bow_maxent_model_building = 0;
  return vpc_barrel;
}

int
bow_maxent_score (bow_barrel *barrel, bow_wv *query_wv,
                  bow_score *bscores, int bscores_len, int loo_class)
{
  double *scores;      /* will become prob(class), indexed over CI */
  int ci;              /* a "class index" (document index) */
  int wvi;             /* an index into the entries of QUERY_WV */
  int dvi;             /* an index into a "document vector" */
  double rescaler;     /* rescale SCORES by this after each word */
  int num_scores;      /* number of entries placed in BSCORES */
  int wi;              /* word index */
  int max_wi;
  int max_ci;

  assert (bow_event_model == bow_event_word
          || bow_event_model == bow_event_document_then_word);
  assert (loo_class == -1);
  assert (bscores_len <= bow_barrel_num_classes (barrel));
  assert (!bow_maxent_model_building
          || bow_barrel_num_classes (barrel) == bscores_len);

  /* Allocate space to store scores for *all* classes (documents) */
  scores = alloca (bow_barrel_num_classes (barrel) * sizeof (double));
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  max_ci = bow_barrel_num_classes (barrel);

  /* Remove words not in the class barrel */
  bow_wv_prune_words_not_in_wi2dvf (query_wv, barrel->wi2dvf);

  /* Yipes, now we are doing this twice, but we have to make sure that
     it is re-done after any pruning of words in the QUERY_WV */
  bow_wv_set_weights (query_wv, barrel);
  bow_wv_normalize_weights (query_wv, barrel);

#if 0
  /* WhizBang: print the WV, just for debugging */
  bow_wv_fprintf (stderr, query_wv);
  fflush (stderr);
#endif

  /* Initialize the SCORES to 0 */
  for (ci = 0; ci < bow_barrel_num_classes (barrel); ci++)
    scores[ci] = 0;

  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      bow_dv *dv;      /* the "document vector" for the word WI */

      wi = query_wv->entry[wvi].wi;
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it.  Is this
         really the right thing to do for maximum entropy? */
      if (!dv)
        continue;

      rescaler = DBL_MAX;

      if (maxent_scoring_hack)
        {
          /* Loop over all classes, using the lambda, or the naive
             Bayes probability if there is no lambda for this
             word/class pair. */
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              /* Advance DVI to this class's entry, staying in bounds */
              while (dvi < dv->length && dv->entry[dvi].di < ci)
                dvi++;
              if (dvi < dv->length && dv->entry[dvi].di == ci)
                scores[ci] +=
                  dv->entry[dvi].weight * query_wv->entry[wvi].weight;
              else
                {
                  bow_cdoc *cdoc =
                    bow_array_entry_at_index (barrel->cdocs, ci);
                  if (cdoc->word_count && cdoc->normalizer)
                    scores[ci] +=
                      (1.0 / (double) (cdoc->word_count + cdoc->normalizer))
                      * query_wv->entry[wvi].weight;
                  else
                    scores[ci] -= 10;
                }
              /* Keep track of the minimum score updated for this word */
              if (rescaler > scores[ci])
                rescaler = scores[ci];
            }
        }
      else
        {
          /* Loop over all features using this word, putting this
             word's (WI's) contribution into SCORES. */
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              double lambda;

              ci = dv->entry[dvi].di;
              lambda = dv->entry[dvi].weight;
              /* Update the score for this word/class combination */
              scores[ci] += lambda * query_wv->entry[wvi].weight;
              /* Keep track of the minimum score updated for this word */
              if (rescaler > scores[ci])
                rescaler = scores[ci];
            }
        }

      /* Loop over all classes, re-scaling SCORES so that they don't
         get so small that we lose floating-point resolution.  This
         scaling always keeps all SCORES positive. */
      if (rescaler < 0)
        {
          for (ci = 0; ci < barrel->cdocs->length; ci++)
            {
              /* Add to SCORES to bring them close to zero.  RESCALER
                 is expected to often be less than zero here. */
              /* xxx If this doesn't work, we could keep track of the
                 min and the max, and shift by their average. */
              scores[ci] += -rescaler;
              assert (scores[ci] > -DBL_MAX + 1.0e5
                      && scores[ci] < DBL_MAX - 1.0e5);
            }
        }
    }

  /* Rescale the SCORES one last time, this time making them all zero
     or negative, so that exp() will work well, especially around the
     higher-probability classes. */
  rescaler = -DBL_MAX;
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    if (scores[ci] > rescaler)
      rescaler = scores[ci];
  /* RESCALER is now the maximum of the SCORES */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    scores[ci] -= rescaler;

  /* Use exp() on the SCORES */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    scores[ci] = exp (scores[ci]);

  /* Now normalize to make them probabilities */
  {
    double scores_sum = 0;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      scores_sum += scores[ci];
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      {
        scores[ci] /= scores_sum;
        /* assert (scores[ci] > 0); */
      }
  }

  /* Return the SCORES by putting them (and the `class indices') into
     BSCORES in sorted order. */
  if (!bow_maxent_model_building)
    {
      num_scores = 0;
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          if (num_scores < bscores_len
              || bscores[num_scores - 1].weight < scores[ci])
            {
              /* We are going to put this score and CI into BSCORES
                 because either (1) there is empty space in BSCORES, or
                 (2) SCORES[CI] is larger than the smallest score
                 currently there. */
              int dsi;         /* an index into BSCORES */

              if (num_scores < bscores_len)
                num_scores++;
              dsi = num_scores - 1;
              /* Shift down all the entries that are smaller than
                 SCORES[CI] */
              for (; dsi > 0 && bscores[dsi - 1].weight < scores[ci]; dsi--)
                bscores[dsi] = bscores[dsi - 1];
              /* Insert the new score */
              bscores[dsi].weight = scores[ci];
              bscores[dsi].di = ci;
            }
        }
    }
  else
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          bscores[ci].weight = scores[ci];
          bscores[ci].di = ci;
        }
      num_scores = bscores_len;
    }

  return num_scores;
}

rainbow_method bow_method_maxent =
{
  "maxent",
  NULL,                         /* no weight-setting function */
  NULL,                         /* no weight-scaling function */
  NULL,                         /* bow_barrel_normalize_weights_by_summing */
  bow_maxent_new_vpc_with_weights,
  bow_barrel_set_vpc_priors_by_counting,
  bow_maxent_score,
  bow_wv_set_weights_by_event_model,
  NULL,                         /* no need for extra wv weight normalization */
  bow_barrel_free,              /* is this really the right one? */
  NULL                          /* no parameters for the method */
};

void _register_method_maxent () __attribute__ ((constructor));
void
_register_method_maxent ()
{
  bow_method_register_with_name ((bow_method *) &bow_method_maxent,
                                 "maxent",
                                 sizeof (rainbow_method),
                                 &maxent_argp_child);
  bow_argp_add_child (&maxent_argp_child);
}
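
/* maxent_newton itself is not part of this excerpt.  For reference
   only, here is a minimal sketch of a Newton's-method root finder
   over the sparse (coeff, power) polynomial built above: it solves
   sum_j coeff_j * beta^power_j = 0 for beta > 0, starting from
   beta = 1 (i.e. delta = 0, since lambda is updated by log(beta)).
   The names newton_solve and coef_t are illustrative, not the
   library's; the real routine must additionally handle the
   power == -1 term appended under the Gaussian prior, bracketing,
   and convergence failures. */

#include <math.h>

typedef struct
{
  double coeff;
  int power;
} coef_t;                      /* hypothetical stand-in for maxent_coefficient */

static double
newton_solve (const coef_t *entry, int length)
{
  double beta = 1.0;           /* beta = exp(delta); start at delta = 0 */
  int iter, j;

  for (iter = 0; iter < 100; iter++)
    {
      double g = 0.0;          /* polynomial value at beta */
      double dg = 0.0;         /* its derivative at beta */

      for (j = 0; j < length; j++)
        {
          g += entry[j].coeff * pow (beta, entry[j].power);
          dg += entry[j].coeff * entry[j].power
                * pow (beta, entry[j].power - 1);
        }
      if (fabs (g) < 1e-10 || dg == 0.0)
        break;
      beta -= g / dg;          /* Newton step; assumes beta stays positive */
    }
  return beta;
}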
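
/* The tail of bow_maxent_score above is a max-shifted softmax:
   subtract the maximum score before exp() so the exponentials cannot
   overflow, then normalize to probabilities.  A self-contained sketch
   of just that normalization (the name softmax_in_place is
   illustrative): */

static void
softmax_in_place (double *scores, int n)
{
  double max = scores[0];
  double sum = 0.0;
  int i;

  for (i = 1; i < n; i++)      /* find the maximum score */
    if (scores[i] > max)
      max = scores[i];
  for (i = 0; i < n; i++)
    {
      scores[i] = exp (scores[i] - max);  /* now every argument is <= 0 */
      sum += scores[i];
    }
  for (i = 0; i < n; i++)      /* normalize to probabilities */
    scores[i] /= sum;
}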