📄 maxent.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 4 页
字号:
	bow_wi2dvf_hide_wi (barrel->wi2dvf, wi);      else	dv->length = new_dvi;    }  for (ci = 0; ci < max_ci; ci++)    bow_free (mis[ci]);  bow_free (mis);  return;}/* Newton's method.  max_fi is one more than the index of the largest   coefficient given */doublemaxent_newton (maxent_polynomial *poly){  double root;  double fun_val = 100;  double deriv_val;  double low = 0;  double high = 1000000;  double dxold = 1000000;  double dx = 1000000;  static double epsilon=1.0E-8;  int fi;  int rounds = 0;  /* initial guess of root */  root = 1.0001;  while (fabs(fun_val) > epsilon)    {           /* calculate function at new root */      fun_val = 0.0;      for (fi = 0; fi < poly->length; fi++)	fun_val += poly->entry[fi].coeff * pow (root, poly->entry[fi].power);      if (maxent_gaussian_prior)	fun_val += poly->entry[poly->length].coeff * log (root);      /* calculate derivative at root */      deriv_val = 0.0;      for (fi = 1; fi < poly->length; fi++)	deriv_val += poly->entry[fi].power * poly->entry[fi].coeff * 	  pow (root, poly->entry[fi].power - 1);      if (maxent_gaussian_prior)	deriv_val += poly->entry[poly->length].coeff / root;      assert (fun_val == fun_val &&	      deriv_val == deriv_val);      if (fun_val < 0)	low = root;      else	high = root;            dxold = dx;      /* if fun_val is infinity, bisect the region.	 otherwise, use newton.  */      if (fun_val == HUGE_VAL ||	  2 * fabs(fun_val) > fabs(dxold * deriv_val))	{	  dx = (high - low) / 2.0;	  root = low + dx;	}      else	{	  dx = fun_val / deriv_val;	  root -= dx;	  /* if newton thinks to go outside the region, bisect the	     region */	  if (root < low || root > high)	    {	      dx = (high - low) / 2.0;	      root = low + dx;	    }	}      rounds++;      if (rounds > 100)	bow_error ("Newton's method did not converge.\n");    }  return (root);}/* Calculate the accuracy or the model prob of the barrel on a set of   docs.  accuracy_or_logprob 1 for accuracy, else for logprob */floatmaxent_calculate_accuracy (bow_barrel *doc_barrel, bow_barrel *class_barrel, 			   int (*test_docs)(bow_cdoc *), int accuracy_or_logprob){  bow_dv_heap *test_heap;	/* we'll extract test WV's from here */  bow_wv *query_wv;  int di;			/* a document index */  bow_score *hits;  int num_hits_to_retrieve = bow_barrel_num_classes (class_barrel);  int actual_num_hits;  bow_cdoc *doc_cdoc;  int num_tested = 0;  int num_correct = 0;  int old_model_building = 0;  double log_prob = 0;    if (accuracy_or_logprob == 1)    {      old_model_building = bow_maxent_model_building;      bow_maxent_model_building = 0;    }  /* Create the heap from which we'll get WV's. Initialize QUERY_WV so     BOW_TEST_NEXT_WV() knows not to try to free. */  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);  test_heap = bow_test_new_heap (doc_barrel);  query_wv = NULL;    /* Loop once for each test document. */  while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, 				 test_docs))	 != -1)    {      doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 					   di);      bow_wv_set_weights (query_wv, class_barrel);      bow_wv_normalize_weights (query_wv, class_barrel);      actual_num_hits = 	bow_barrel_score (class_barrel, 			  query_wv, hits,			  num_hits_to_retrieve, -1);      assert (actual_num_hits == num_hits_to_retrieve);      if (doc_cdoc->class == hits[0].di)	num_correct++;      log_prob += log(hits[doc_cdoc->class].weight);      num_tested++;    }  if (accuracy_or_logprob == 1)    {      bow_maxent_model_building = old_model_building;      return (((float) num_correct) / ((float) num_tested));    }  else    return (log_prob);}bow_barrel *bow_maxent_new_vpc_with_weights_doc_then_word (bow_barrel *doc_barrel){  bow_barrel *vpc_barrel;   /* the vector-per-class barrel */  int wi;                   /* word index */  int wvi;  int max_wi;               /* max word index */  int dvi;                  /* document vector index */  int ci;                   /* class index */  bow_dv *dv;               /* document vector */  int di;                   /* document index */  bow_dv_heap *test_heap=NULL;	/* we'll extract test WV's from here */  bow_wv *query_wv;  bow_score *hits;  int actual_num_hits;  bow_cdoc *doc_cdoc;  bow_wi2dvf *constraint_wi2dvf;  int max_ci;  int rounds = 0;  double total_count_per_ci[200];  int total_num_docs = 0;  float old_log_prob = -FLT_MAX;  float new_log_prob = -FLT_MAX / 2;  float old_accuracy = -1;  float new_accuracy = 0;  maxent_polynomial *newton_poly;  int num_unlabeled = 0;  int *unlabeled_dis = NULL;      bow_maxent_model_building = 1;  /* some sanity checks first */  assert (200 > bow_barrel_num_classes(doc_barrel));  assert (doc_barrel->classnames);  assert (bow_event_model == bow_event_document_then_word);  assert (!maxent_scoring_hack);  assert (!maxent_logprob_constraints);  assert (!maxent_words_per_class);  assert (!maxent_prune_features_by_count);  max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ());  max_ci = bow_barrel_num_classes (doc_barrel);  newton_poly = bow_malloc (sizeof (maxent_polynomial) +			    sizeof (maxent_coefficient) * 3);  newton_poly->size = 3;  newton_poly->length = 2;  newton_poly->entry[0].power = 0;  newton_poly->entry[1].power = bow_event_document_then_word_document_length;  newton_poly->entry[2].power = -1;  /* if we're using unlabeled data to set the constraints, then we     need to temporarily convert these into training documents. */  if (maxent_constraint_use_unlabeled)     {      unlabeled_dis = bow_malloc (sizeof (int) * doc_barrel->cdocs->length);      for (di = 0; di < doc_barrel->cdocs->length; di++) 	{	  bow_cdoc *cdoc = bow_array_entry_at_index(doc_barrel->cdocs, di);	  if (cdoc->type == bow_doc_unlabeled)	    {	      unlabeled_dis[num_unlabeled] = di;	      num_unlabeled++;	      cdoc->type = bow_doc_train;	    }	}    }  /* get a barrel where the weights and counts are set to word counts     based on the labeled data only */  vpc_barrel = bow_barrel_new_vpc (doc_barrel);  /* switch back the unlabeled documents to have their original tag */  if (maxent_constraint_use_unlabeled)    {      int ui;      for (ui = 0; ui < num_unlabeled; ui++)	{	  bow_cdoc *cdoc = bow_array_entry_at_index(doc_barrel->cdocs, 						    unlabeled_dis[ui]);	  cdoc->type = bow_doc_unlabeled;	}      bow_free (unlabeled_dis);    }  /* re-initialize the weights to 0.     Set the constraint wi2dvf to the counts.  */  for (ci = 0; ci < max_ci; ci++)    {      bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);      total_num_docs += cdoc->word_count;    }    /* Count how many training documents there are.  Exclude documents     that have no features, as we need to ignore them for     doc_then_word */  query_wv = NULL;  test_heap = bow_test_new_heap (doc_barrel);    /* Iterate over each document. */  while (-1 != (di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, 				       bow_cdoc_is_train)))    {      if (query_wv->num_entries == 0)	total_num_docs--;    }    constraint_wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size);  for (wi = 0; wi < max_wi; wi++)     {      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);            if (!dv)	continue;      if (maxent_smooth_counts)	{	  dvi = 0;	  for (ci = 0; ci < max_ci; ci++)	    {	      	      while (dv->entry[dvi].di < ci &&		     dvi < dv->length)		dvi++;	      /* set contraint to smoothed empirical average */	      if (dvi < dv->length && dv->entry[dvi].di == ci)		bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 						  dv->entry[dvi].count + 1,						  (dv->entry[dvi].weight + 1.0) / 						  (double) total_num_docs);	      else		bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 						  1,						  1.0 / (double) total_num_docs);			      /* initialize the lambda to 0 */	      bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),						 wi, ci,						 1,						 0);	    }	}      else if (maxent_gaussian_prior)	{	  dvi = 0;	  for (ci = 0; ci < max_ci; ci++)	    {	      	      while (dv->entry[dvi].di < ci &&		     dvi < dv->length)		dvi++;	      	      /* set contraint to smoothed empirical average */	      if (dvi < dv->length && dv->entry[dvi].di == ci)		{		  bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 						    dv->entry[dvi].count,						    dv->entry[dvi].weight / 						    (double) total_num_docs);		  /* initialize the lambda to 0 */		  bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),						     wi, ci,						     1,						     0);		}	      else if (maxent_gaussian_prior_zero_constraints)		{		  bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 						    1,						    0);		  /* initialize the lambda to 0 */		  bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),						     wi, ci,						     1,						     0);		}	      	    }	}      else	{	  for (dvi = 0; dvi < dv->length; dvi++)	    {	      bow_cdoc *cdoc;	      	      ci = dv->entry[dvi].di;	      	      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);	      	      assert (dv->entry[dvi].weight > 0);	      	      /* */	      bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 						dv->entry[dvi].count,						dv->entry[dvi].weight / 						(double) total_num_docs);	      	      bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),						 wi, ci,						 dv->entry[dvi].count,						 0);	    }	}    }#if 0  if (maxent_print_constraints)    {      bow_verbosify (bow_progress, "foo");      for (ci = 0; ci < max_ci; ci++)	bow_verbosify (bow_progress, " %s", bow_barrel_classname_at_index (doc_barrel, ci));      bow_verbosify (bow_progress, "\n");          for (wi = 0; wi < max_wi; wi++)	{	  bow_verbosify (bow_progress, "%s", bow_int2word (wi));	  dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);	  dvi = 0;		  for (ci = 0; ci < max_ci; ci++)	    {	      while ((ci > dv->entry[dvi].di) && (dvi < dv->length))		dvi++;	      if ((ci == dv->entry[dvi].di) && (dvi < dv->length))		bow_verbosify (bow_progress, " %f", dv->entry[dvi].weight);	      else		bow_verbosify (bow_progress, " 0");	    }	  bow_verbosify (bow_progress, "\n");	}    }#endif    /* Lets start some maximum entropy iteration */  while (maxent_logprob_docs ? 	 new_log_prob > old_log_prob : 	 (maxent_halt_accuracy_docs ? 	  new_accuracy > old_accuracy : 	  rounds < maxent_num_iterations))     {      bow_wi2dvf *exp_wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size);      int num_tested = 0;      for (ci = 0; ci < max_ci; ci++)	total_count_per_ci[ci] = 0.0;      rounds++;      /* classify all the documents, and put their contribution into the 	 different lambdas */      query_wv = NULL;      hits = alloca (sizeof (bow_score) * max_ci);      num_tested = 0;      test_heap = bow_test_new_heap (doc_barrel);	         /* Calculate accuracy of the validation set for halting check */      if (maxent_accuracy_docs)	bow_verbosify (bow_progress, "%4d Correct: %f\n", rounds,		       maxent_calculate_accuracy(doc_barrel, vpc_barrel, maxent_accuracy_docs, 1));	       /* Loop once for each training document, scoring it and recording its	 contribution to all the E[f_{w,c}] */      while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, 				     maxent_iteration_docs))	     != -1)	{	  doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, 					       di);	  bow_wv_set_weights (query_wv, vpc_barrel);	  bow_wv_normalize_weights (query_wv, vpc_barrel);	  // skip documents with no words	  if (query_wv->num_entries == 0)	    continue;	  	  num_tested++;	  actual_num_hits = 	    bow_barrel_score (vpc_barrel,
💿 文件大小 522 K
👤 上传用户 yuanata
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#mitchell #tom #机器学习 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -