📄 maxent.c
                                          query_wv, hits, max_ci, -1);
          assert (actual_num_hits == max_ci);
          for (ci = 0; ci < max_ci; ci++)
            total_count_per_ci[ci] += hits[ci].weight;

          /* Now loop over the words in the document and all the classes,
             adding the contribution to E[f_{w,c}]. */
          for (wvi = 0; wvi < query_wv->num_entries; wvi++)
            {
              wi = query_wv->entry[wvi].wi;
              for (ci = 0; ci < bow_barrel_num_classes (vpc_barrel); ci++)
                bow_wi2dvf_add_wi_di_count_weight
                  (&exp_wi2dvf, wi, ci, 1,
                   hits[ci].weight * query_wv->entry[wvi].weight);
            }
        }

      /* Now update the lambdas.  Ignore zero constraints? */
      for (wi = 0; wi < max_wi; wi++)
        {
          bow_dv *vpc_dv;
          bow_dv *constraint_dv;
          bow_dv *exp_dv;
          int exp_dvi = 0;

          vpc_dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
          constraint_dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);
          exp_dv = bow_wi2dvf_dv (exp_wi2dvf, wi);

          /* The exp_dv can be NULL if we're using only some of the
             documents for the iteration step.  If there are no iteration
             docs that have this word, then we don't need to worry about
             its weight; leave it at zero. */
          if (!constraint_dv || !exp_dv)
            continue;

          /* The dvi goes over the constraint and the vpc; the constraint
             and vpc wi2dvf should have exactly corresponding entries.
             The exp wi2dvf can have a superset of the entries. */
          for (dvi = 0; dvi < vpc_dv->length; dvi++)
            {
              ci = vpc_dv->entry[dvi].di;

              /* Get the corresponding exp_dvi. */
              while (exp_dvi < exp_dv->length
                     && ci > exp_dv->entry[exp_dvi].di)
                exp_dvi++;
              assert (exp_dvi < exp_dv->length);
              assert (ci == constraint_dv->entry[dvi].di
                      && ci == exp_dv->entry[exp_dvi].di);

              /* Need to normalize this delta with M? */
#if 1
              if (exp_dv->entry[exp_dvi].weight == 0)
                assert (constraint_dv->entry[dvi].weight == 0);
              else
#endif
                {
                  double delta = 0;

                  if (maxent_gaussian_prior)
                    {
                      double variance = maxent_prior_variance;

                      if (maxent_prior_vary_by_count == 1)
                        variance = maxent_prior_variance
                          * log (1 + constraint_dv->entry[dvi].count);
                      else if (maxent_prior_vary_by_count == 2)
                        variance = maxent_prior_variance
                          * constraint_dv->entry[dvi].count;

                      newton_poly->entry[0].coeff =
                        -constraint_dv->entry[dvi].weight
                        + vpc_dv->entry[dvi].weight / variance;
                      newton_poly->entry[1].coeff =
                        exp_dv->entry[exp_dvi].weight / (double) num_tested;
                      newton_poly->entry[2].coeff = 1.0 / variance;
                      delta = maxent_newton (newton_poly);
                      delta = log (delta);
                    }
                  else
                    {
                      if (exp_dv->entry[exp_dvi].weight != 0)
                        delta = log (((double) num_tested)
                                     * constraint_dv->entry[dvi].weight
                                     / (exp_dv->entry[exp_dvi].weight))
                          / (double) bow_event_document_then_word_document_length;
                      else
                        delta = 0;

                      /* Check that delta is not NaN. */
                      assert (delta == delta);
                      assert (constraint_dv->entry[dvi].weight);
                    }

                  bow_wi2dvf_set_wi_di_count_weight
                    (&(vpc_barrel->wi2dvf), wi, ci,
                     vpc_dv->entry[dvi].count,
                     (vpc_dv->entry[dvi].weight + delta));
                }
            }
        }

      if (maxent_logprob_docs)
        {
          old_log_prob = new_log_prob;
          new_log_prob = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_logprob_docs, 2);
          bow_verbosify (bow_progress, "Halting Log Prob: %f\n",
                         new_log_prob);
        }
      else if (maxent_halt_accuracy_docs)
        {
          old_accuracy = new_accuracy;
          new_accuracy = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_halt_accuracy_docs,
                                                    1);
          bow_verbosify (bow_progress, "Halting Accuracy: %f\n",
                         new_accuracy);
        }

      bow_wi2dvf_free (exp_wi2dvf);
    }

  bow_free (newton_poly);
  bow_wi2dvf_free (constraint_wi2dvf);
  bow_maxent_model_building = 0;

#if 0
  if (maxent_print_lambdas)
    {
      bow_verbosify (bow_progress, "foo");
      for (ci = 0; ci < max_ci; ci++)
        bow_verbosify (bow_progress, " %s",
                       bow_barrel_classname_at_index (doc_barrel, ci));
      bow_verbosify (bow_progress, "\n");
      for (wi = 0; wi < max_wi; wi++)
        {
          bow_verbosify (bow_progress, "%s", bow_int2word (wi));
          dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              while ((ci > dv->entry[dvi].di) && (dvi < dv->length))
                dvi++;
              if ((ci == dv->entry[dvi].di) && (dvi < dv->length))
                bow_verbosify (bow_progress, " %f", dv->entry[dvi].weight);
              else
                bow_verbosify (bow_progress, " 0");
            }
          bow_verbosify (bow_progress, "\n");
        }
    }
#endif

  return (vpc_barrel);
}
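/* A minimal sketch of the iterative-scaling update applied above, for
   illustration only; `empirical', `expected_sum', `num_tested', and `m'
   here are hypothetical stand-ins, not libbow identifiers.  Each lambda
   moves by (1/M) * log (empirical / expected), where the expected count
   is first rescaled from a sum over the `num_tested' documents to a
   per-document average, and M bounds the total feature count of any one
   document (the role played by
   bow_event_document_then_word_document_length above). */
static double
maxent_sketch_gis_delta (double empirical, double expected_sum,
                         int num_tested, double m)
{
  /* Mirror the zero-expectation case above: leave the lambda alone. */
  if (expected_sum == 0)
    return 0;
  return log (empirical / (expected_sum / (double) num_tested)) / m;
}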
bow_barrel *
bow_maxent_new_vpc_with_weights (bow_barrel *doc_barrel)
{
  bow_barrel *vpc_barrel;       /* the vector-per-class barrel */
  int wi;                       /* word index */
  int max_wi;                   /* max word index */
  int dvi;                      /* document vector index */
  int ci;                       /* class index */
  bow_dv *dv;                   /* document vector */
  int di;                       /* document index */
  bow_dv_heap *test_heap = NULL; /* we'll extract test WV's from here */
  bow_wv *query_wv;
  bow_score *hits;
  int actual_num_hits;
  bow_cdoc *doc_cdoc;
  bow_cdoc *cdoc;
  bow_wi2dvf *constraint_wi2dvf;
  int max_ci;
  int rounds = 0;
  int total_num_docs = 0;
  int **f_sharp;
  int max_f_sharp = 0;
  double *coefficients[200];
  bow_dv *doc_dv;
  bow_dv *constraint_dv;
  bow_dv *lambda_dv;
  int constraint_dvi;
  int doc_dvi;
  int fi;
  maxent_polynomial *newton_poly;
  double log_prob_model;
  double beta;
  double num_words_per_ci[200];
  int num_unique_words_per_ci[200];
  float old_log_prob = -FLT_MAX;
  float new_log_prob = -FLT_MAX / 2;
  float old_accuracy = -1;
  float new_accuracy = 0;

  if (bow_event_model == bow_event_document_then_word)
    return (bow_maxent_new_vpc_with_weights_doc_then_word (doc_barrel));

  bow_maxent_model_building = 1;

  /* Some sanity checks first. */
  assert (200 > bow_barrel_num_classes (doc_barrel));
  assert (doc_barrel->classnames);
  assert (bow_event_model == bow_event_word);
  assert (!maxent_words_per_class || !maxent_scoring_hack);
  assert (!(maxent_smooth_counts && maxent_gaussian_prior));
  assert (!maxent_words_per_class || !maxent_logprob_constraints);
  assert (!maxent_logprob_constraints);
  assert (!maxent_prior_vary_by_count);
  assert (!maxent_constraint_use_unlabeled);

  max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ());
  max_ci = bow_barrel_num_classes (doc_barrel);

  f_sharp = bow_malloc (sizeof (int *) * doc_barrel->cdocs->length);
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    f_sharp[di] = bow_malloc (sizeof (int) * max_ci);

  /* Initialize f_sharp. */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    for (ci = 0; ci < max_ci; ci++)
      f_sharp[di][ci] = 0;

  /* If we're doing log counts, set the document weights appropriately.
     Otherwise, set the weights to the counts for each document. */
  if (maxent_logprob_constraints)
    {
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          for (dvi = 0; dvi < dv->length; dvi++)
            dv->entry[dvi].weight = log (dv->entry[dvi].count + 1);
        }
    }
  else
    {
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          for (dvi = 0; dvi < dv->length; dvi++)
            dv->entry[dvi].weight = (float) dv->entry[dvi].count;
        }
    }
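  /* A minimal sketch (disabled, illustrative only) of the two weighting
     schemes chosen just above; `my_count' is a hypothetical stand-in for
     dv->entry[dvi].count.  With log-count constraints a word occurring c
     times in a document contributes log (1 + c) rather than c, damping
     heavy within-document repeats. */
#if 0
  {
    int my_count = 3;                        /* hypothetical word count */
    double raw_weight = (double) my_count;   /* 3.0 */
    double log_weight = log (my_count + 1);  /* log 4, about 1.386 */
  }
#endif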
  /* Get a barrel where the counts are set to word counts and the weights
     are set to normalized or unnormalized counts as appropriate for the
     event model. */
  vpc_barrel = bow_barrel_new_vpc (doc_barrel);

  /* If doing occurrence-count pruning of features, do that now. */
  if (maxent_prune_features_by_count)
    maxent_prune_features_by_occurrence_count (vpc_barrel,
                                               maxent_prune_features_by_count);

  /* Set the word count and normalizer of each class cdoc correctly.  Use
     the weight here, b/c maybe doing logprob_constraints.  The word
     counts and normalizer are used by mutual information feature
     pruning. */
  for (ci = 0; ci < max_ci; ci++)
    {
      num_words_per_ci[ci] = 0;
      num_unique_words_per_ci[ci] = 0;
    }
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
      if (dv == NULL)
        continue;
      for (dvi = 0; dvi < dv->length; dvi++)
        {
          num_words_per_ci[dv->entry[dvi].di] += dv->entry[dvi].weight;
          num_unique_words_per_ci[dv->entry[dvi].di]++;
        }
    }
  for (ci = 0; ci < vpc_barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      cdoc->word_count = (int) rint (num_words_per_ci[ci]);
      cdoc->normalizer = num_unique_words_per_ci[ci];
    }

  /* If doing feature selection by mutual information, do that now.
     Ensure that cdoc->word_count is set correctly beforehand.  It should
     be OK to do both kinds of feature selection pruning. */
  if (maxent_words_per_class > 0)
    maxent_prune_vocab_by_mutual_information (vpc_barrel,
                                              maxent_words_per_class);

  /* Initialize cdoc->class_probs for all the docs, and initialize
     total_num_docs to the number of training docs. */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
      double *double_class_probs;

      if (cdoc->type == bow_doc_train)
        total_num_docs++;
      if (!cdoc->class_probs)
        cdoc->class_probs = (float *) bow_malloc (sizeof (double) * max_ci);
      double_class_probs = (double *) cdoc->class_probs;

      /* Initialize the class_probs to all zeros. */
      for (ci = 0; ci < max_ci; ci++)
        double_class_probs[ci] = 0.0;
    }

  /* Set the constraint wi2dvf to be the (vpc weight / number of
     documents).  Re-initialize the vpc weights to 0 (initialize the
     lambdas to be zero). */
  constraint_wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size);
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
      if (!dv)
        continue;
      if (maxent_smooth_counts)
        {
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              while (dv->entry[dvi].di < ci && dvi < dv->length)
                dvi++;
              /* Set constraint to the smoothed empirical average. */
              if (dvi < dv->length && dv->entry[dvi].di == ci)
                bow_wi2dvf_set_wi_di_count_weight
                  (&constraint_wi2dvf, wi, ci, dv->entry[dvi].count + 1,
                   (dv->entry[dvi].weight + 1.0) / (double) total_num_docs);
              else
                bow_wi2dvf_set_wi_di_count_weight
                  (&constraint_wi2dvf, wi, ci, 1,
                   1.0 / (double) total_num_docs);
              /* Initialize the lambda to 0. */
              bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                 wi, ci, 1, 0);
            }
        }
      else if (maxent_gaussian_prior)
        {
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              while (dv->entry[dvi].di < ci && dvi < dv->length)
                dvi++;
              /* Set constraint to the empirical average. */
              if (dvi < dv->length && dv->entry[dvi].di == ci)
                {
                  bow_wi2dvf_set_wi_di_count_weight
                    (&constraint_wi2dvf, wi, ci, dv->entry[dvi].count,
                     dv->entry[dvi].weight / (double) total_num_docs);
                  /* Initialize the lambda to 0. */
                  bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                     wi, ci, 1, 0);
                }
              else if (maxent_gaussian_prior_zero_constraints)
                {
                  bow_wi2dvf_set_wi_di_count_weight (&constraint_wi2dvf,
                                                     wi, ci, 1, 0);
                  /* Initialize the lambda to 0. */
                  bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                     wi, ci, 1, 0);
                }
            }
        }
      else
        {
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              ci = dv->entry[dvi].di;
              assert (dv->entry[dvi].weight > 0);
              /* Set constraint to the empirical average. */
              bow_wi2dvf_set_wi_di_count_weight
                (&constraint_wi2dvf, wi, ci, dv->entry[dvi].count,
                 dv->entry[dvi].weight / (double) total_num_docs);
              /* Initialize the lambda to 0. */
              bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                 wi, ci,
                                                 dv->entry[dvi].count, 0);
            }
        }
    }
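  /* A minimal sketch (disabled, illustrative only) of the constraint
     value set above; `class_word_weight' is a hypothetical stand-in for
     the per-class weight sum dv->entry[dvi].weight.  The constraint is
     the empirical per-document expectation of the feature, and the
     add-one smoothed form keeps a nonzero target for unseen word/class
     pairs. */
#if 0
  {
    double class_word_weight = 5.0;     /* hypothetical weight sum */
    double plain = class_word_weight / (double) total_num_docs;
    double smoothed = (class_word_weight + 1.0) / (double) total_num_docs;
  }
#endif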
  /* Set f_sharp of each document/class combination to be the sum of all
     the feature weights for that class for that doc.  Set max_f_sharp to
     be the maximum of all the f_sharp values.  Note that we're summing
     document word counts here, and not document word weights.  We'll
     have to do something more sneaky for logprob constraints when we
     implement it.  For now, though, this should be OK. */
  /* Walk the document wi2dvf with the constraint wi2dvf and increment. */
  for (wi = 0; wi < max_wi; wi++)
    {
      doc_dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
      constraint_dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);
      if (!constraint_dv || !doc_dv)
        continue;
      for (doc_dvi = 0; doc_dvi < doc_dv->length; doc_dvi++)
        for (constraint_dvi = 0; constraint_dvi < constraint_dv->length;
             constraint_dvi++)
          f_sharp[doc_dv->entry[doc_dvi].di][constraint_dv->entry[constraint_dvi].di] +=