
📄 hem.c

📁 Code from the book by machine learning author Tom Mitchell
💻 C
📖 Page 1 of 4
  double log_prob_of_data = 0;
  int num_data_words = 0;       /* the number of word occurrences */

  num_leaves = bow_treenode_leaf_count (crossbow_root);
  for (di = 0; di < crossbow_docs->length; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (! (*use_doc_p)((bow_doc*)doc))
        continue;
      wv = crossbow_wv_at_di (di);
      max_leaf_pp = -FLT_MAX;
      for (iterator = crossbow_root, li = 0;
           (leaf = bow_treenode_iterate_leaves (&iterator));
           li++)
        {
          if (crossbow_hem_shrinkage)
            leaf_data_log_prob = bow_treenode_log_prob_of_wv (leaf, wv);
          else
            leaf_data_log_prob = bow_treenode_log_local_prob_of_wv (leaf, wv);
          leaf_pp = log (leaf->prior) + leaf_data_log_prob;
          assert (leaf_pp == leaf_pp);
#if 1
          /* Test for -Inf, and if so, immediately return Inf */
          if (leaf_pp == -HUGE_VAL)
            return HUGE_VAL;
#endif
          if (leaf_pp > max_leaf_pp)
            max_leaf_pp = leaf_pp;
        }
      assert (max_leaf_pp != -FLT_MAX);
      log_prob_of_data += max_leaf_pp;
      num_data_words += bow_wv_word_count (wv);
    }

  /* Return the perplexity */
  if (num_data_words)
    return exp (-log_prob_of_data / num_data_words);
  return 0;
}

/* Return the perplexity (given knowledge of the class label,
   P(D,C|theta)) of the data in documents for which the function
   USE_DOC_P returns non-zero. */
double
crossbow_hem_labeled_perplexity (int (*use_doc_p)(bow_doc*))
{
  int di;
  crossbow_doc *doc;
  bow_wv *wv;
  treenode *leaf;
  int num_leaves;
  double leaf_data_log_prob;
  double log_prob_of_data = 0;
  int num_data_words = 0;       /* the number of word occurrences */

  num_leaves = bow_treenode_leaf_count (crossbow_root);
  for (di = 0; di < crossbow_docs->length; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (! (*use_doc_p)((bow_doc*)doc))
        continue;
      wv = crossbow_wv_at_di (di);
      leaf = bow_treenode_descendant_matching_name (crossbow_root,
                                                    doc->filename);
      if (crossbow_hem_shrinkage)
        leaf_data_log_prob = bow_treenode_log_prob_of_wv (leaf, wv);
      else
        leaf_data_log_prob = bow_treenode_log_local_prob_of_wv (leaf, wv);
      /* Test for -Inf, and if so, immediately return Inf */
      if (leaf_data_log_prob == -HUGE_VAL)
        return HUGE_VAL;
      log_prob_of_data += (log (leaf->prior) + leaf_data_log_prob);
      assert (log_prob_of_data == log_prob_of_data);
      num_data_words += bow_wv_word_count (wv);
    }

  /* Return the perplexity */
  if (num_data_words)
    return exp (-log_prob_of_data / num_data_words);
  return 0;
}

/* Classify all unlabeled documents and convert the most confidently
   classified to labeled */
void
crossbow_hem_label_most_confident ()
{
  int di, li;
  crossbow_doc *doc;
  //  bow_wv *wv;
  bow_wa *wa;
  int word_count;
  double score;
  int leaf_count = bow_treenode_leaf_count (crossbow_root);
  bow_wa **high_scores_per_class;
  static int unlabeled_count = -1;
  static int num_to_label = 999;
  treenode *iterator, *leaf;

  assert (crossbow_hem_incremental_labeling);

  /* Calculate num_to_label if we are to label all examples in 20
     iterations. */
  if (unlabeled_count == -1)
    {
      unlabeled_count = 0;
      for (di = 0; di < crossbow_docs->length; di++)
        {
          doc = bow_array_entry_at_index (crossbow_docs, di);
          if (doc->tag == bow_doc_unlabeled)
            unlabeled_count++;
        }
      num_to_label = unlabeled_count / 20;
    }

  high_scores_per_class = alloca (leaf_count * sizeof (void*));
  for (li = 0; li < leaf_count; li++)
    high_scores_per_class[li] = bow_wa_new (0);

  for (di = 0; di < crossbow_docs->length; di++)
    {
      bow_wv *wv;
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (doc->tag != bow_doc_unlabeled)
        continue;
      wv = crossbow_wv_at_di (doc->di);
      assert (wv);
      word_count = bow_wv_word_count (wv);
      wa = crossbow_classify_doc_new_wa (wv);
      bow_wa_sort (wa);
      score = wa->entry[0].weight;
      score /= ((word_count + 1) / MIN (9, word_count));
      bow_wa_append (high_scores_per_class[wa->entry[0].wi], di, score);
      bow_wa_free (wa);
    }

  for (iterator = crossbow_root, li = 0;
       (leaf = bow_treenode_iterate_leaves (&iterator));
       li++)
    {
      int i, num_to_label_this_class = MAX (1, num_to_label * leaf->prior);
      if (high_scores_per_class[li]->length == 0)
        continue;
      bow_wa_sort (high_scores_per_class[li]);
      if (num_to_label_this_class > high_scores_per_class[li]->length)
        {
          bow_verbosify (bow_quiet,
                         "Not enough unlabeled documents classified as %s\n",
                         leaf->name);
          num_to_label_this_class = high_scores_per_class[li]->length;
        }
      for (i = 0; i < num_to_label_this_class; i++)
        {
          char *newname = bow_malloc (128);
          doc =
            bow_array_entry_at_index (crossbow_docs,
                                      high_scores_per_class[li]->entry[i].wi);
          assert (doc->tag == bow_doc_unlabeled);
          doc->tag = bow_doc_train;
          doc->ci = li;
          /* xxx Yuck!  WhizBang-specific */
          sprintf (newname, "./data%s%s", leaf->name,
                   strrchr (doc->filename, '/') + 1);
          /* xxx Memory leak here.  Free the doc->name first. */
          doc->filename = newname;
          bow_verbosify (bow_progress, "Labeling class %10s %35s %g\n",
                         leaf->name, doc->filename,
                         high_scores_per_class[li]->entry[i].weight);
        }
    }

  for (li = 0; li < leaf_count; li++)
    bow_wa_free (high_scores_per_class[li]);
}

#if MN
#include "mn.c"
#endif

/* Run one EM iteration and return the perplexity */
double
crossbow_hem_em_one_iteration ()
{
  int di;
  crossbow_doc *doc;
  bow_wv *wv;
  treenode *iterator, *leaf, *ancestor;
  int li;                       /* a leaf index */
  int wvi;
  int num_leaves;
  double *leaf_membership;
  double *leaf_data_prob;
  double pp, log_prob_of_data = 0;
  int num_data_words = 0;       /* the number of word occurrences */
  double *ancestor_membership;
  double ancestor_membership_total;
  double total_deposit_prob;
  int found_deterministic_leaf;
  int docs_added_count = 0;

#if MN
  return crossbow_hem_em_one_mn_iteration ();
#endif

  num_leaves = bow_treenode_leaf_count (crossbow_root);
  leaf_membership = alloca (num_leaves * sizeof (double));
  leaf_data_prob = alloca (num_leaves * sizeof (double));
  /* xxx Here NUM_LEAVES+10 should be MAX_DEPTH */
  ancestor_membership = alloca ((num_leaves + 10) * sizeof (double));

  for (di = 0; di < crossbow_docs->length; di++)
    {
      total_deposit_prob = 0;
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (crossbow_hem_incremental_labeling)
        {
          if (crossbow_hem_lambdas_from_validation)
            {
              if (doc->tag != bow_doc_train
                  && doc->tag != bow_doc_validation)
                continue;
            }
          else
            {
              if (doc->tag != bow_doc_train)
                continue;
            }
        }
      else if (crossbow_hem_lambdas_from_validation)
        {
          if (doc->tag != bow_doc_train
              && doc->tag != bow_doc_unlabeled
              && doc->tag != bow_doc_validation)
            continue;
        }
      else
        {
          if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled)
            continue;
        }
      /* Temporary fix */
      if (strstr (doc->filename, ".include")
          || strstr (doc->filename, ".exclude"))
        continue;

      /* E-step estimating leaf membership probability for one
         document, with annealing temperature. */
      wv = crossbow_wv_at_di (di);
      found_deterministic_leaf = 0;
      for (iterator = crossbow_root, li = 0;
           (leaf = bow_treenode_iterate_leaves (&iterator));
           li++)
        {
          if (crossbow_hem_shrinkage)
            {
              if (crossbow_hem_loo)
                leaf_data_prob[li] =
                  bow_treenode_log_prob_of_wv_loo (leaf, wv, di);
              else
                leaf_data_prob[li] = bow_treenode_log_prob_of_wv (leaf, wv);
            }
          else
            {
              if (crossbow_hem_loo)
                leaf_data_prob[li] =
                  bow_treenode_log_local_prob_of_wv_loo (leaf, wv, di);
              else
                leaf_data_prob[li] =
                  bow_treenode_log_local_prob_of_wv (leaf, wv);
            }
          assert (leaf_data_prob[li] > -HUGE_VAL);
          if (crossbow_hem_deterministic_horizontal
              && (doc->tag == bow_doc_train
                  || doc->tag == bow_doc_validation))
            {
              if (strstr (doc->filename, leaf->name))
                {
                  assert (!found_deterministic_leaf);
                  leaf_membership[li] = 1.0;
                  found_deterministic_leaf = 1;
                }
              else
                /* The validation document was formerly an unlabeled
                   document.  Set the membership to zero for now; we
                   will set it to the results of the E-step below when
                   we call crossbow_convert_log_probs_to_probs */
                leaf_membership[li] = 0.0;
              continue;
            }
          else if (crossbow_hem_restricted_horizontal
                   && (doc->tag == bow_doc_train
                       || doc->tag == bow_doc_validation))
            {
              treenode *label_node =
                bow_treenode_descendant_matching_name (crossbow_root,
                                                       doc->filename);
              if (strstr (leaf->name, label_node->name))
                leaf_membership[li] = (log (leaf->prior)
                                       + (leaf_data_prob[li]
                                          / crossbow_hem_temperature));
              else
                /* Set it to probability zero, which, in log space, is -Inf */
                leaf_membership[li] = -HUGE_VAL;
            }
          else
            {
              leaf_membership[li] = (log (leaf->prior)
                                     + (leaf_data_prob[li]
                                        / crossbow_hem_temperature));
            }
        }
      if (!crossbow_hem_deterministic_horizontal
          || doc->tag == bow_doc_unlabeled
          || !found_deterministic_leaf)
        /* Last condition above is for unlabeled docs that were changed
           to validation docs */
        crossbow_convert_log_probs_to_probs (leaf_membership, num_leaves);
      else
        /* No longer meaningful!? */
        assert (found_deterministic_leaf);

      /* For perplexity calculation */
      for (iterator = crossbow_root, li = 0;
           (leaf = bow_treenode_iterate_leaves (&iterator));
           li++)
        {
          /* xxx Should this be with bow_treenode_complete_log_prob_of_wv()? */
          if (leaf_membership[li])
            log_prob_of_data += (leaf_membership[li] * leaf_data_prob[li]);
          assert (log_prob_of_data == log_prob_of_data);
        }
      num_data_words += bow_wv_word_count (wv);
      docs_added_count++;

      /* E-step estimating ancestor membership probability for words
         in one document, and M-step for one document */
      for (iterator = crossbow_root, li = 0;
           (leaf = bow_treenode_iterate_leaves (&iterator));
           li++)
        {
          if (leaf_membership[li] == 0)
            continue;
          if (strstr (leaf->name, "/Misc/"))
            continue;
          for (wvi = 0; wvi < wv->num_entries; wvi++)
            {
              if (crossbow_hem_shrinkage)
                {
                  int ai;
                  double word_deposit, lambda_deposit;
                  /* Calculate normalized ancestor membership probs */
                  ancestor_membership_total = 0;
                  for (ancestor = leaf, ai = 0; ancestor;
                       ancestor = ancestor->parent, ai++)
                    {
                      if (crossbow_hem_loo)
                        ancestor_membership[ai] =
                          leaf->lambdas[ai]
                          * bow_treenode_pr_wi_loo_local (ancestor,
                                                          wv->entry[wvi].wi,
                                                          di, wvi);
                      else
                        ancestor_membership[ai] = leaf->lambdas[ai]
                          * ancestor->words[wv->entry[wvi].wi];
                      assert (ancestor_membership[ai] >= 0);
                      ancestor_membership_total += ancestor_membership[ai];
                    }
                  ancestor_membership[ai] =
                    leaf->lambdas[ai] * 1.0 / leaf->words_capacity;
                  ancestor_membership_total += ancestor_membership[ai];
                  assert (ancestor_membership_total);
                  for (ai = 0; ai < leaf->depth + 2; ai++)
                    {
                      assert (ancestor_membership[ai] >= 0);
                      ancestor_membership[ai] /= ancestor_membership_total;
                    }
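Note on the perplexity value: the two perplexity routines at the top of this page both finish with exp (-log_prob_of_data / num_data_words). As a sketch of the math only (not part of hem.c), that return value is the standard per-word perplexity

\[
\mathrm{PP} \;=\; \exp\!\Big(-\tfrac{1}{N}\,\textstyle\sum_{d}\log P(d\mid\theta)\Big),
\]

where the sum over documents d accepted by USE_DOC_P is log_prob_of_data (each term being log (leaf->prior) plus the leaf's log word-vector probability) and N is num_data_words, the total count of word occurrences.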

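Note on the E-step normalization: crossbow_hem_em_one_iteration fills leaf_membership with unnormalized log posteriors and then calls crossbow_convert_log_probs_to_probs, whose definition is not on this page. Purely as an illustrative sketch (the real crossbow implementation may differ, and the function name example_log_probs_to_probs below is hypothetical), the usual way to turn log-probabilities into a normalized distribution without underflow is the log-sum-exp trick:

#include <math.h>

/* Illustrative sketch only (not from hem.c): normalize an array of
   log-probabilities in place so its entries become probabilities
   summing to 1.  Subtracting the maximum before exponentiating keeps
   exp() from underflowing to zero for every entry; an entry of
   -HUGE_VAL (log of zero) maps to probability zero. */
static void
example_log_probs_to_probs (double *lp, int n)
{
  int i;
  double max = -HUGE_VAL, sum = 0;

  for (i = 0; i < n; i++)
    if (lp[i] > max)
      max = lp[i];
  for (i = 0; i < n; i++)
    {
      lp[i] = exp (lp[i] - max);   /* exponent is <= 0, so no overflow */
      sum += lp[i];
    }
  for (i = 0; i < n; i++)
    lp[i] /= sum;
}

Under this reading, the later leaf_membership[li] == 0 checks in the M-step simply skip leaves whose posterior collapsed to zero, such as the -HUGE_VAL entries set in the restricted-horizontal branch.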