📄 treenode.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 3 4
      pr_wi_given_tn = bow_treenode_pr_wi (tn, wi);      pr_wi_given_not_tn = 0;      for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); )	{	  if (leaf != tn)	    pr_wi_given_not_tn += (bow_treenode_pr_wi (leaf, wi)				   / (leaf_count-1));	  //pr_wi_given_not_tn += leaf->words[wi] / (leaf_count - 1);	}      if (pr_wi_given_tn == 0)	lr = -1;      else if (pr_wi_given_not_tn == 0)	lr = 1;      else	lr = (pr_wi_given_tn 	      * log (pr_wi_given_tn / pr_wi_given_not_tn));      //assert (lr < 1);      bow_wa_append (wa, wi, lr);    }  return wa;}/* Return an array of words with their associated odds ratios,   calculated relative to all the leaves. */bow_wa *bow_treenode_word_leaf_odds_ratios (treenode *tn){  int wi, leaf_count;  bow_wa *wa;  double pr_wi_given_tn;  double pr_wi_given_not_tn;  double lr;  treenode *root, *iterator, *leaf;  if (tn->children_count != 0)    return NULL;  root = tn;  while (root->parent)    root = root->parent;  leaf_count = bow_treenode_leaf_count (root);  wa = bow_wa_new (tn->words_capacity+2);  for (wi = 0; wi < tn->words_capacity; wi++)    {      pr_wi_given_tn = tn->words[wi];      pr_wi_given_not_tn = 0;      for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); )	{	  if (leaf != tn)	    pr_wi_given_not_tn += leaf->words[wi] / (leaf_count - 1);	}      lr = (/* pr_wi_given_tn * */	    log ((pr_wi_given_tn * (1 - pr_wi_given_not_tn))		 / (pr_wi_given_not_tn * (1 - pr_wi_given_tn))));      bow_wa_append (wa, wi, lr);    }  return wa;}/* Return an array of words with their associated likelihood ratios,   calculated relative to all the leaves. 
*/bow_wa *bow_treenode_word_leaf_mean_ratios (treenode *tn){  int wi, leaf_count;  bow_wa *wa;  double pr_wi_given_tn;  double pr_wi;  double lr;  treenode *root, *iterator, *leaf;  if (tn->children_count != 0)    return NULL;  root = tn;  while (root->parent)    root = root->parent;  leaf_count = bow_treenode_leaf_count (root);  wa = bow_wa_new (tn->words_capacity+2);  for (wi = 0; wi < tn->words_capacity; wi++)    {      pr_wi_given_tn = tn->words[wi];      pr_wi = 0;      for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); )	{	  pr_wi += leaf->words[wi] / leaf_count;	}      assert (pr_wi > 0);      lr = pr_wi_given_tn / pr_wi;      bow_wa_append (wa, wi, lr);    }  return wa;}/* Print the NUM_TO_PRINT words with highest likelihood ratios,   calculated relative to its siblings. */voidbow_treenode_word_likelihood_ratios_print (treenode *tn, int num_to_print){  bow_wa *wa;  wa = bow_treenode_word_likelihood_ratios (tn);  if (wa)    {      bow_wa_sort (wa);      bow_wa_fprintf (wa, stdout, num_to_print);      bow_wa_free (wa);    }}/* Print the NUM_TO_PRINT words with highest likelihood ratios,   calculated relative to all the leaves. */voidbow_treenode_word_leaf_likelihood_ratios_print (treenode *tn, int num_to_print){  bow_wa *wa;  wa = bow_treenode_word_leaf_likelihood_ratios (tn);  if (wa)    {      bow_wa_sort (wa);      bow_wa_fprintf (wa, stdout, num_to_print);      bow_wa_free (wa);    }}/* Print the NUM_TO_PRINT words with highest odds ratios,   calculated relative to all the leaves. */voidbow_treenode_word_leaf_odds_ratios_print (treenode *tn, int num_to_print){  bow_wa *wa;  wa = bow_treenode_word_leaf_odds_ratios (tn);  if (wa)    {      bow_wa_sort (wa);      bow_wa_fprintf (wa, stdout, num_to_print);      bow_wa_free (wa);    }}/* Same as above, for all nodes in the tree. 
*/voidbow_treenode_word_likelihood_ratios_print_all (treenode *tn, int num_to_print){  int ci;  printf ("%s\nprior=%g\n", tn->name, tn->prior);  bow_treenode_word_likelihood_ratios_print (tn, num_to_print);  for (ci = 0; ci < tn->children_count; ci++)    bow_treenode_word_likelihood_ratios_print_all (tn->children[ci], 						   num_to_print);}/* Return a bow_wa array of words with their associated probabilities */bow_wa *bow_treenode_word_probs (treenode *tn){  int wi;  bow_wa *wa;  wa = bow_wa_new (tn->words_capacity+2);  for (wi = 0; wi < tn->words_capacity; wi++)    bow_wa_append (wa, wi, tn->words[wi]);  return wa;}/* Print the NUM_TO_PRINT words with highest probability */voidbow_treenode_word_probs_print (treenode *tn, int num_to_print){  bow_wa *wa;  wa = bow_treenode_word_probs (tn);  if (wa)    {      bow_wa_sort (wa);      bow_wa_fprintf (wa, stdout, num_to_print);      bow_wa_free (wa);    }}/* Same as above, for all nodes in the tree. */voidbow_treenode_word_probs_print_all (treenode *tn, int num_to_print){  int ci;  printf ("%s\n", tn->name);  if (tn->children_count == 0)    printf ("  prior=%g\n", tn->prior);  bow_treenode_word_probs_print (tn, num_to_print);  for (ci = 0; ci < tn->children_count; ci++)    bow_treenode_word_probs_print_all (tn->children[ci], num_to_print);}/* Print most probable words in one line, and only if parent's   WKL is high enough */voidbow_treenode_keywords_print (treenode *tn, FILE *fp){  bow_wa *wa;  int wai;  //double kldiv;  if (tn->parent == NULL)    return;  //if (bow_treenode_children_weighted_kl_div (tn->parent) < 500) return;#if 0  if ((kldiv = bow_treenode_pair_kl_div (tn, tn->parent)) < 0.5)    {      fprintf (fp, "alias %s %s\n", 	       tn->name, tn->parent->name);      bow_verbosify (bow_progress, "%s kldiv versus parent %g SKIP\n",		     tn->name, kldiv);      return;    }  else    {      bow_verbosify (bow_progress, "%s kldiv versus parent %g\n",		     tn->name, kldiv);    }  for (ci = 0; ci < tn->ci_in_parent; 
ci++)    {      if (((kldiv = bow_treenode_pair_kl_div	    (tn, tn->parent->children[ci]))	   < 0.5))	{	  fprintf (fp, "alias %s %s\n", 		   tn->name, tn->parent->children[ci]->name);	  bow_verbosify (bow_progress, "%s %s kldiv versus sibling %g SKIP\n",			 tn->name, tn->parent->children[ci]->name, kldiv);	  return;	}      else	{	  bow_verbosify (bow_progress, "%s %s kldiv versus sibling %g\n",			 tn->name, tn->parent->children[ci]->name, kldiv);	}    }#endif  wa = bow_treenode_word_probs (tn);  if (wa)    {      fprintf (fp, "%s %g ", tn->name, 	       bow_treenode_pair_kl_div (tn, tn->parent));      bow_wa_sort (wa);      for (wai = 0; wai < 10; wai++)	fprintf (fp, "%s ", bow_int2word (wa->entry[wai].wi));      fprintf (fp, "\n");      bow_wa_free (wa);    }}/* Same as above, but for TN and all treenodes under TN */voidbow_treenode_keywords_print_all (treenode *tn, FILE *fp){  int ci;  bow_treenode_keywords_print (tn, fp);  for (ci = 0; ci < tn->children_count; ci++)    bow_treenode_keywords_print_all (tn->children[ci], fp);}/* Print the (normalized) probability of word WI in each of the nodes   of the tree rooted at ROOT. */voidbow_treenode_normalized_word_prob_all_print (treenode *root, int wi){  int leaf_count;  double *nodes, nodes_total;  int ni;  treenode *iterator, *node;  leaf_count = bow_treenode_leaf_count (root);  nodes = alloca (sizeof (double) * leaf_count);  nodes_total = 0;  for (iterator = root, ni = 0;       (node=bow_treenode_iterate_all (&iterator));        ni++)    {      nodes[ni] = node->words[wi];      nodes_total += nodes[ni];    }  for (iterator = root, ni = 0;       (node=bow_treenode_iterate_all (&iterator));        ni++)    printf ("%10f %s\n", nodes[ni] / nodes_total, node->name);}/* Print the word distribution for each leaf to a separate file, each   file having prefix FILENAME_PREFIX.  Use vertical mixture if   SHRINKAGE is non-zero. 
*/voidbow_treenode_print_all_word_probabilities_all (const char *filename_prefix,					       int shrinkage){  int li, wi;  char *s;  treenode *iterator, *leaf;  char leafname[BOW_MAX_WORD_LENGTH];  char filename[BOW_MAX_WORD_LENGTH];  FILE *fp;  double pr_w;  bow_verbosify (bow_progress, "Starting word probability printing\n");  for (iterator = crossbow_root, li = 0;       (leaf = bow_treenode_iterate_leaves (&iterator));        li++)    {      strcpy (leafname, leaf->name);      /* Convert '/' to '-' */      for (s = leafname; *s; s++)	if (*s == '/')	  *s = '-';      sprintf (filename, "%s-%s", filename_prefix, leafname);      fp = bow_fopen (filename, "w");      for (wi = 0; wi < leaf->words_capacity; wi++)	{	  if (shrinkage)	    pr_w = bow_treenode_pr_wi (leaf, wi);	  else	    pr_w = leaf->words[wi];	  fprintf (fp, "%f %s\n", pr_w, bow_int2word (wi));	}      fclose (fp);    }}/* Return the "KL Divergence to the Mean" among the children of TN */doublebow_treenode_children_kl_div (treenode *tn){  double *mean;  double kldiv;  int wi, ci;  if (tn->children_count < 2)    return 0;  /* Calculate the mean distribution */  mean = bow_malloc (tn->words_capacity * sizeof (double));  for (wi = 0; wi < tn->words_capacity; wi++)    {      mean[wi] = 0;      for (ci = 0; ci < tn->children_count; ci++)	mean[wi] += tn->children[ci]->words[wi];      mean[wi] /= tn->children_count;    }  /* Calculate "KL Divergence to the Mean" for each child. */  kldiv = 0;  for (ci = 0; ci < tn->children_count; ci++)    {      for (wi = 0; wi < tn->words_capacity; wi++)	{	  /* Testing for tn->children[ci]->words[wi] is legitimate.	     
Testing for mean[wi] is a concession to round-off error */	  if (tn->children[ci]->words[wi] && mean[wi])	    kldiv += (tn->children[ci]->words[wi]		      * log (tn->children[ci]->words[wi] / mean[wi]));	  //assert (kldiv < 10);	}    }  bow_free (mean);  kldiv /= tn->children_count;  return kldiv;}/* Return the weighted "KL Divergence to the mean among the children   of TN" multiplied by the number of words of training data in the   children. */doublebow_treenode_children_weighted_kl_div (treenode *tn){  double weight = 0;  int ci;  for (ci = 0; ci < tn->children_count; ci++)    weight += tn->children[ci]->new_words_normalizer;  return weight * bow_treenode_children_kl_div (tn);}/* Return the "KL Divergence to the mean" between TN1 and TN2. */doublebow_treenode_pair_kl_div (treenode *tn1, treenode *tn2){  double *mean;  double kldiv;  int wi;  /* Calculate the mean distribution */  mean = bow_malloc (tn1->words_capacity * sizeof (double));  for (wi = 0; wi < tn1->words_capacity; wi++)    {      mean[wi] = 0;      mean[wi] += tn1->words[wi];      mean[wi] += tn2->words[wi];      mean[wi] /= 2;    }  /* Calculate "KL Divergence to the Mean" for each one. */  kldiv = 0;  for (wi = 0; wi < tn1->words_capacity; wi++)    {      /* Testing for tn->children[ci]->words[wi] is legitimate.	 Testing for mean[wi] is a concession to round-off error */      if (mean[wi])	{	  if (tn1->words[wi])	    kldiv += tn1->words[wi] * log (tn1->words[wi] / mean[wi]);	  if (tn2->words[wi])	    kldiv += tn2->words[wi] * log (tn2->words[wi] / mean[wi]);	}    }  bow_free (mean);  kldiv /= 2;  return kldiv;}/* Same as above, but multiply by the number of words in TN1 and TN2. 
*/
/* The weight is the sum of the two nodes' new_words_normalizer
   fields; the divergence itself comes from bow_treenode_pair_kl_div. */
double
bow_treenode_pair_weighted_kl_div (treenode *tn1, treenode *tn2)
{
  double weight = tn1->new_words_normalizer + tn2->new_words_normalizer;

  return weight * bow_treenode_pair_kl_div (tn1, tn2);
}

/* Return non-zero if any of TN's children are leaves, i.e. have no
   children of their own; zero otherwise (including when TN itself has
   no children). */
int
bow_treenode_is_leaf_parent (treenode *tn)
{
  int idx = 0;

  while (idx < tn->children_count)
    {
      treenode *child = tn->children[idx++];

      if (child->children_count == 0)
        return 1;
    }
  return 0;
}
上一页 1 2 3 4
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -