📄 treenode.c
字号:
/* NOTE(review): the opening brace below continues a function whose
   signature precedes this chunk -- presumably the routine that turns the
   accumulated NEW_LAMBDAS counts into normalized LAMBDAS shrinkage
   weights, with ALPHA a Dirichlet smoothing parameter; confirm against
   the full file.  The lambda arrays hold tn->depth + 2 weights: one for
   each node on the path from TN up to the root (tn->depth + 1 of them)
   plus one for the uniform distribution. */
{
  int ai;
  double total_lambdas_count = 0.0;
#if MISC_STAYS_FLAT
  /* "/Misc/" nodes are pinned to uniform shrinkage weights. */
  if (strstr (tn->name, "/Misc/"))
    {
      bow_treenode_set_lambdas_uniform (tn);
      for (ai = 0; ai < tn->depth + 2; ai++)
        tn->new_lambdas[ai] = 0;
    }
#endif
  /* Calculate the normalizing constant */
  for (ai = 0; ai < tn->depth + 2; ai++)
    total_lambdas_count += tn->new_lambdas[ai];
  total_lambdas_count += alpha * (tn->depth + 2);
  //assert (total_lambdas_count);
  if (total_lambdas_count == 0)
    {
      /* No evidence at all; fall back to a uniform lambda distribution. */
      alpha = 1.0 / (tn->depth + 2);
      total_lambdas_count = 1.0;
    }
  for (ai = 0; ai < tn->depth + 2; ai++)
    {
      assert (tn->new_lambdas[ai] >= 0);
#if 1 || !USE_ACCELERATED_EM
      /* Plain M-step: Dirichlet-smoothed, normalized count. */
      tn->lambdas[ai] = (alpha + tn->new_lambdas[ai]) / total_lambdas_count;
#else
      /* Over-relaxed (accelerated) EM update, clamped at zero. */
      tn->lambdas[ai] = (((1.0 - EM_ACCELERATION) * tn->lambdas[ai])
                         + (EM_ACCELERATION * (alpha + tn->new_lambdas[ai])
                            / total_lambdas_count));
      if (tn->lambdas[ai] < 0)
        tn->lambdas[ai] = 0;
#endif
      assert (tn->lambdas[ai] >= 0);
      assert (tn->lambdas[ai] <= 1);
      /* Zero the accumulator for the next EM iteration. */
      tn->new_lambdas[ai] = 0;
    }
}

/* Set the CLASSES distribution to uniform, allocating space for it
   if necessary */
void
bow_treenode_set_classes_uniform (treenode *tn, int classes_capacity)
{
  int ci;
  if (tn->classes_capacity == 0)
    {
      /* First call on this node: allocate both the current and the
         accumulator arrays. */
      tn->classes_capacity = classes_capacity;
      tn->classes = bow_malloc (classes_capacity * sizeof (double));
      tn->new_classes = bow_malloc (classes_capacity * sizeof (double));
    }
  /* A previously sized node must be re-initialized at the same size. */
  assert (classes_capacity == tn->classes_capacity);
  for (ci = 0; ci < classes_capacity; ci++)
    {
      tn->classes[ci] = 1.0 / classes_capacity;
      tn->new_classes[ci] = 0.0;
    }
}

/* Normalize the NEW_CLASSES distribution, move it into the CLASSES
   array and zero the NEW_CLASSES array.  ALPHA is the parameter for
   the Dirichlet prior. */
void
bow_treenode_set_classes_from_new_classes (treenode *tn, double alpha)
{
  int ci;
  double total_classes_count = 0;
  assert (tn->classes_capacity > 0);
  for (ci = 0; ci < tn->classes_capacity; ci++)
    total_classes_count += tn->new_classes[ci];
  total_classes_count += alpha * tn->classes_capacity;
  if (total_classes_count == 0)
    {
      /* No evidence; fall back to a uniform class distribution. */
      alpha = 1.0 / tn->classes_capacity;
      total_classes_count = 1.0;
    }
  for (ci = 0; ci < tn->classes_capacity; ci++)
    {
      tn->classes[ci] = (alpha + tn->new_classes[ci]) / total_classes_count;
      tn->new_classes[ci] = 0;
    }
}

/* Return the log-probability of node TN's WORD distribution having
   produced the word vector WV.  Uses only TN's local word multinomial
   (no shrinkage); a zero-probability word yields log(0) = -inf. */
double
bow_treenode_log_local_prob_of_wv (treenode *tn, bow_wv *wv)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += wv->entry[wvi].count * log (tn->words[wv->entry[wvi].wi]);
  return log_prob;
}

/* Return the expected complete log likelihood of node TN's word
   distribution having produced the word vector WV. */
double
bow_treenode_complete_log_prob_of_wv (treenode *tn, bow_wv *wv)
{
  int wvi, ai;
  double log_prob = 0;
  treenode *ancestor;
  double *ancestor_membership;
  double ancestor_membership_normalizer;
  /* One membership weight per ancestor (including TN) plus one for the
     uniform component; stack-allocated, reclaimed on return. */
  ancestor_membership = alloca ((tn->depth + 2) * sizeof (double));
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      /* Posterior over which ancestor (or the uniform component)
         generated this word occurrence. */
      ancestor_membership_normalizer = 0;
      for (ancestor = tn, ai = 0; ancestor; ancestor = ancestor->parent, ai++)
        {
          ancestor_membership[ai] = (tn->lambdas[ai]
                                     * ancestor->words[wv->entry[wvi].wi]);
          ancestor_membership_normalizer += ancestor_membership[ai];
        }
      /* After the loop, AI indexes the uniform-distribution slot. */
      ancestor_membership[ai] = tn->lambdas[ai] * 1.0 / tn->words_capacity;
      ancestor_membership_normalizer += ancestor_membership[ai];
      /* Accumulate the expected log-likelihood under that posterior. */
      for (ancestor = tn, ai = 0; ancestor; ancestor = ancestor->parent, ai++)
        {
          log_prob += (wv->entry[wvi].count
                       * (ancestor_membership[ai]
                          / ancestor_membership_normalizer)
                       * log (ancestor->words[wv->entry[wvi].wi]));
        }
      log_prob += (wv->entry[wvi].count
                   * (ancestor_membership[ai]
                      / ancestor_membership_normalizer)
                   * log (1.0 / tn->words_capacity));
    }
  /* NaN check: NaN is the only value that compares unequal to itself. */
  assert (log_prob == log_prob);
  return log_prob;
}

/* Return the probability of word WI in LEAF, using the hierarchical
   mixture */
double
bow_treenode_pr_wi (treenode *node, int wi)
{
  int i;
  treenode *ancestor;
  double ret = 0;
  if (node->children_count == 0)
    {
      /* NODE is a leaf.  Return the vertical mixture using shrinkage */
      for (ancestor = node, i = 0; ancestor; ancestor = ancestor->parent, i++)
        ret += node->lambdas[i] * ancestor->words[wi];
      /* Add in the uniform distribution */
      ret += node->lambdas[i] / node->words_capacity;
    }
  else
    {
      /* NODE is an interior node of the tree.  Return a weighted
         average of the leaves under NODE. */
      double prior_sum = 0;
      treenode *iterator, *leaf;
      for (iterator = node;
           (leaf = bow_treenode_iterate_leaves_under_node (&iterator, node)); )
        {
          prior_sum += leaf->prior;
          ret += leaf->prior * bow_treenode_pr_wi (leaf, wi);
        }
      ret /= prior_sum;
    }
  return ret;
}

/* Return the probability of word WI in node TN, but with the
   probability mass of document LOO_DI removed. */
double
bow_treenode_pr_wi_loo_local (treenode *tn, int wi, int loo_di, int loo_wvi)
{
  double ret;
  double denominator;
  /* If there is no LOO information, return the non-LOO estimate. */
  if (!(tn->di_loo) || !(tn->di_wvi_loo) || !(tn->di_wvi_loo[loo_di]))
    ret = tn->words[wi];
  else
    {
#if 0
      /* Diagnostic code retained for chasing round-off problems. */
      double foo1 = ((tn->words[wi] * tn->new_words_normalizer)
                     - tn->di_wvi_loo[loo_di][loo_wvi]);
      double foo2 = (tn->new_words_normalizer - tn->di_loo[loo_di]);
      if (foo1 < -1e-14)
        bow_error ("Foo1 %g orig %.18f minus %.18f", foo1,
                   (tn->words[wi] * tn->new_words_normalizer),
                   tn->di_wvi_loo[loo_di][loo_wvi]);
      assert (foo2 >= -1e-14);
#endif
      /* Remove the held-out document's total mass from the normalizer... */
      denominator = (tn->new_words_normalizer - tn->di_loo[loo_di]);
      assert (denominator >= 0);
      /* Make sure it is non-negative, but account for round-off error */
      assert ((tn->words[wi] * tn->new_words_normalizer)
              - tn->di_wvi_loo[loo_di][loo_wvi] >= -1e-7);
      if (denominator)
        /* ...and this word's per-document count from the numerator. */
        ret = (((tn->words[wi] * tn->new_words_normalizer)
                - tn->di_wvi_loo[loo_di][loo_wvi])
               / denominator);
      else
        /* Without this document, there is no training data for this
           class.  Return a uniform distribution. */
        ret = 1.0 / tn->words_capacity;
      if (ret < 0)
        {
          /* Account for roundoff error */
          assert (ret > -1e-14);
          ret = 0;
        }
    }
  assert (ret >= 0);
  assert (ret <= 1);
  assert (tn);			/* to keep tn available in debugger */
  assert (loo_di >= 0 && loo_wvi >= 0);
  return ret;
}

/* Return the probability of word WI in LEAF, using the hierarchical
   mixture, but with the probability mass of document LOO_DI removed. */
double
bow_treenode_pr_wi_loo (treenode *tn, int wi, int loo_di, int loo_wvi)
{
  int i;
  treenode *ancestor;
  double ret = 0;
  if (tn->children_count == 0)
    {
      /* TN is a leaf.  Return the vertical mixture using shrinkage */
      for (ancestor = tn, i = 0; ancestor; ancestor = ancestor->parent, i++)
        ret += (tn->lambdas[i]
                * bow_treenode_pr_wi_loo_local (ancestor, wi,
                                                loo_di, loo_wvi));
      /* Add in the uniform distribution */
      ret += tn->lambdas[i] / tn->words_capacity;
    }
  else
    {
      /* TN is an interior node of the tree.  Return a weighted average
         of the leaves under TN. */
      double prior_sum = 0;
      treenode *iterator, *leaf;
      for (iterator = tn;
           (leaf = bow_treenode_iterate_leaves_under_node (&iterator, tn)); )
        {
          prior_sum += leaf->prior;
          ret += leaf->prior * bow_treenode_pr_wi_loo (leaf, wi,
                                                       loo_di, loo_wvi);
        }
      ret /= prior_sum;
    }
  assert (ret > 0);
  assert (ret < 1);
  return ret;
}

/* Return the prior probability of TN being in a path selected for
   generating a document. */
double
bow_treenode_prior (treenode *tn)
{
  if (tn->children_count != 0)
    {
      /* TN is an interior node of the tree; sum the priors of the
         leaves under it. */
      treenode *iterator, *leaf;
      double ret = 0;
      for (iterator = tn;
           (leaf = bow_treenode_iterate_leaves_under_node (&iterator, tn)); )
        {
          ret += leaf->prior;
        }
      return ret;
    }
  /* TN is a leaf; simply return its prior. */
  return tn->prior;
}

/* Return the log-probability of node TN's WORD distribution having
   produced the word vector WV. */
double
bow_treenode_log_prob_of_wv (treenode *tn, bow_wv *wv)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += (wv->entry[wvi].count
                 * log (bow_treenode_pr_wi (tn, wv->entry[wvi].wi)));
  return log_prob;
}

/* Same as above, but return a probability instead of a
   log-probability */
double
bow_treenode_prob_of_wv (treenode *tn, bow_wv *wv)
{
  return exp (bow_treenode_log_prob_of_wv (tn, wv));
}

/* Return the log-probability of node TN's WORD distribution having
   produced the word vector WV, but with document DI removed from TN's
   WORD distribution. */
double
bow_treenode_log_prob_of_wv_loo (treenode *tn, bow_wv *wv, int loo_di)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += (wv->entry[wvi].count
                 * log (bow_treenode_pr_wi_loo (tn, wv->entry[wvi].wi,
                                                loo_di, wvi)));
  assert (log_prob < 0);
  return log_prob;
}

/* Return the local log-probability of node TN's WORD distribution
   having produced the word vector WV, but with document DI removed
   from TN's WORD distribution. */
double
bow_treenode_log_local_prob_of_wv_loo (treenode *tn, bow_wv *wv, int loo_di)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += (wv->entry[wvi].count
                 * log (bow_treenode_pr_wi_loo_local (tn, wv->entry[wvi].wi,
                                                      loo_di, wvi)));
  assert (log_prob < 0);
  return log_prob;
}

/* Return the number of leaves under (and including) TN */
int
bow_treenode_leaf_count (treenode *tn)
{
  if (tn->children_count == 0)
    return 1;
  else
    {
      int ci, lc = 0;
      for (ci = 0; ci < tn->children_count; ci++)
        lc += bow_treenode_leaf_count (tn->children[ci]);
      return lc;
    }
}

/* Return the number of tree nodes under (and including) TN */
int
bow_treenode_node_count (treenode *tn)
{
  if (tn->children_count == 0)
    return 1;
  else
    {
      int ci, lc = 0;
      for (ci = 0; ci < tn->children_count; ci++)
        lc += bow_treenode_node_count (tn->children[ci]);
      /* Plus one for TN itself. */
      return lc + 1;
    }
}

/* Return an array of words with their associated likelihood ratios,
   calculated relative to its siblings. */
bow_wa *
bow_treenode_word_likelihood_ratios (treenode *tn)
{
  int wi, ci;
  bow_wa *wa;
  double pr_wi_given_tn;
  double pr_wi_given_not_tn;
  double lr;
  /* The root has no siblings to compare against. */
  if (tn->parent == NULL)
    return NULL;
  wa = bow_wa_new (tn->words_capacity+2);
  for (wi = 0; wi < tn->words_capacity; wi++)
    {
      pr_wi_given_tn = bow_treenode_pr_wi (tn, wi);
      /* Average this word's probability over TN's siblings. */
      pr_wi_given_not_tn = 0;
      for (ci = 0; ci < tn->parent->children_count; ci++)
        {
          if (ci != tn->ci_in_parent)
            pr_wi_given_not_tn +=
              (bow_treenode_pr_wi (tn->parent->children[ci], wi)
               / (tn->parent->children_count - 1));
        }
      /* Sentinel scores when either side is zero; otherwise a
         probability-weighted log-likelihood ratio. */
      if (pr_wi_given_tn == 0)
        lr = -1;
      else if (pr_wi_given_not_tn == 0)
        lr = 1;
      else
        lr = (pr_wi_given_tn * log (pr_wi_given_tn / pr_wi_given_not_tn));
      //assert (lr < 1);
      bow_wa_append (wa, wi, lr);
    }
  return wa;
}

/* Return an array of words with their associated likelihood ratios,
   calculated relative to all the leaves. */
bow_wa *
bow_treenode_word_leaf_likelihood_ratios (treenode *tn)
{
  int wi, leaf_count;
  bow_wa *wa;
  double pr_wi_given_tn;
  double pr_wi_given_not_tn;
  double lr;
  treenode *root, *iterator, *leaf;
  /* Only defined for leaves. */
  if (tn->children_count != 0)
    return NULL;
  /* Walk up to the root so all leaves of the tree can be enumerated. */
  root = tn;
  while (root->parent)
    root = root->parent;
  leaf_count = bow_treenode_leaf_count (root);
  wa = bow_wa_new (tn->words_capacity+2);
  for (wi = 0; wi < tn->words_capacity; wi++)
    {
      /* NOTE(review): the remainder of this function body is truncated
         in this chunk of the file. */
      //pr_wi_given_tn = tn->words[wi];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -