📄 hem.c
字号:
/* Initialize the lambdas */#if SHRINK_WITH_UNIFORM_ONLY /* Set the lambdas to use the uniform and the leaf, and nothing else */ for (iterator = crossbow_root; (tn = bow_treenode_iterate_leaves (&iterator));) { int li; for (li = 0; li < tn->depth + 2; li++) { if (li == 0 || li == tn->depth+1) tn->lambdas[li] = 0.5; else tn->lambdas[li] = 0; } }#elif 1 if (crossbow_hem_shrinkage) bow_treenode_set_lambdas_uniform_all (crossbow_root); else bow_treenode_set_lambdas_leaf_only_all (crossbow_root); #else /* Just for fun see what happens when we initialize more data in leaves */ for (iterator = crossbow_root; (tn = bow_treenode_iterate_leaves (&iterator));) { int li; for (li = 0; li < tn->depth + 2; li++) { if (li == 0) tn->lambdas[li] = 0.5; else tn->lambdas[li] = 0.5 / (tn->depth + 1); } }#endif //bow_treenode_word_probs_print_all (crossbow_root, 5); if (crossbow_hem_pseudo_labeled) bow_tag_change_tags (crossbow_docs, bow_doc_train, bow_doc_unlabeled); /* Run EM to convergence. */ old_pp = FLT_MAX; pp = -1; crossbow_hem_temperature = 1; /* Loop until convergence, i.e. perplexity doesn't change */ while (/* ABS (old_pp - pp) > 0.1 && */ iteration < crossbow_hem_max_num_iterations) { printf ("--------------------------------------------------" " Iteration %d\n", iteration); /* Output the percent correct, and various perplexities. 
*/ crossbow_classify_tagged_docs (bow_doc_test, 0, stdout); train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train); train_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_train); test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test); test_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_test); printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n" " test-unlabeled-pp=%f test-labeled-pp=%f\n", train_unlabeled_pp, train_labeled_pp, test_unlabeled_pp, test_labeled_pp);#if PRINT_WORD_DISTS sprintf (prefix, "word-dists/em%d-%d", iteration, bow_random_seed); bow_treenode_print_all_word_probabilities_all (prefix, 1);#endif for (iterator = crossbow_root; (tn = bow_treenode_iterate_all (&iterator));) { printf ("%s", tn->name); if (tn->children_count == 0) { int ai; printf ("\n lambdas=[ "); for (ai = 0; ai < tn->depth + 2; ai++) printf ("%5.3f ", tn->lambdas[ai]); printf ("]"); } printf ("\n"); if (1 || tn->children_count == 0) { printf ("prior=%g\n", tn->prior); //bow_treenode_word_likelihood_ratios_print (tn, 10); //printf ("\n"); if (crossbow_hem_vertical_word_movement) bow_treenode_word_probs_print (tn, 5); //bow_treenode_word_likelihood_ratios_print (tn, 5); //bow_treenode_word_leaf_likelihood_ratios_print (tn, 5); //bow_treenode_word_leaf_odds_ratios_print (tn, 10); } } old_pp = pp; pp = crossbow_hem_em_one_iteration (); if (iteration % 2 == 0 && crossbow_hem_incremental_labeling) crossbow_hem_label_most_confident (); iteration++; }}/* If we replace the loss function L= sum_i (tilde{p}_i - p_i)^2with LL = sum_i (tilde{p}_i - p_i)^2/ (p_i (1-p_i) )then we get a loss function which is still tractable but is moresensitive to errors for small probabilities.If I repeat the calucations I get that lambda should be: lambda = (t/n) / ( (t/n) + B)where B = sum_i (u_i -p_i)^2 /( p_i (1-p_i) )(the sum is over the vocabulary). Here, t is the vocabulary size. 
*/

/* When set, use the log-loss-weighted lambda formula from the comment
   above; otherwise use the plain squared-error formula.  */
#define LOG_LOSS 1

/* Compute a closed-form shrinkage weight LAMBDA for node TN (rather
   than fitting it by EM), per the derivation in the comment above, then
   recurse into TN's children.  For the root (or under
   SHRINK_WITH_UNIFORM_ONLY) the "parent" distribution shrunk toward is
   the uniform distribution; for interior nodes and leaves it is the
   parent's ancestor-mixture distribution.  Writes TN->lambdas[] and
   prints diagnostics to stdout.  */
void
crossbow_hem_fienberg_treenode (treenode *tn)
{
  double u;                     /* "parent" probability of word WI */
  double numerator;             /* running sum of p*(1-p) */
  double wi_err;                /* per-word error u - p */
  double sq_err;                /* running sum of (u - p)^2 */
  double n;                     /* sample size */
  double lambda;                /* weight given to the parent/uniform */
  treenode *ancestor, *node;
  int wi, i;
  double b;                     /* B = sum (u-p)^2 / (p*(1-p)) */
  double t;                     /* vocabulary size */

  /* Sample size = Total number of word occurrences. */
  n = tn->new_words_normalizer;
  t = tn->words_capacity;
  numerator = sq_err = b = 0;
#if SHRINK_WITH_UNIFORM_ONLY
  /* Interior nodes get all-zero lambdas; only leaves (and the root,
     below) mix with the uniform distribution.  */
  if (tn->children_count != 0)
    {
      for (i = 0; i < tn->depth + 2; i++)
	tn->lambdas[i] = 0;
      goto do_children;
    }
#endif
  if (SHRINK_WITH_UNIFORM_ONLY || tn->parent == NULL)
    {
      /* Calculating lambda for the root */
      for (wi = 0; wi < tn->words_capacity; wi++)
	{
	  /* Parent word distribution is the uniform distribution */
	  u = 1.0 / tn->words_capacity;
	  numerator += tn->words[wi] * (1.0 - tn->words[wi]);
	  wi_err = u - tn->words[wi];
	  sq_err += wi_err * wi_err;
	  /* NOTE(review): divides by p*(1-p); a word probability of
	     exactly 0 or 1 would divide by zero here — presumably
	     smoothing guarantees 0 < p < 1; confirm.  */
	  b += ((wi_err * wi_err)
		/ (tn->words[wi] * (1.0 - tn->words[wi])));
	}
      printf (" n = %d sum p*(1-p) = %f squared error = %f b = %f\n",
	      (int)n, numerator, sq_err, b);
#if LOG_LOSS
      lambda = (t/n) / ((t/n) + b);
#else
      lambda = (1.0/n) * (numerator / (sq_err + (1.0/n) * numerator));
#endif
#if SHRINK_WITH_UNIFORM_ONLY
      /* lambdas[0] is the local estimate, lambdas[depth+1] the uniform;
	 everything between (the ancestors) gets zero weight.  */
      tn->lambdas[0] = 1.0 - lambda;
      for (i = 1; i < tn->depth + 1; i++)
	tn->lambdas[i] = 0;
      tn->lambdas[tn->depth+1] = lambda;
#else
      /* Root: depth 0, so only two entries — local and uniform.  */
      tn->lambdas[1] = lambda;
      tn->lambdas[0] = 1.0 - lambda;
#endif
    }
  else
    {
      /* Calculating lambda for an interior node or leaf */
      for (wi = 0; wi < tn->words_capacity; wi++)
	{
	  /* Calculate parent word distribution as a mixture */
	  u = 0;
	  node = tn->parent;
	  for (ancestor = node, i = 0; ancestor;
	       ancestor = ancestor->parent, i++)
	    u += node->lambdas[i] * ancestor->words[wi];
	  /* Add in the uniform distribution */
	  u += node->lambdas[i] / node->words_capacity;
	  numerator += tn->words[wi] * (1.0 - tn->words[wi]);
	  wi_err = u - tn->words[wi];
	  sq_err += wi_err * wi_err;
	  /* NOTE(review): same potential divide-by-zero as above when
	     tn->words[wi] is exactly 0 or 1.  */
	  b += ((wi_err * wi_err)
		/ (tn->words[wi] * (1.0 - tn->words[wi])));
	  if (0 && wi % 1000 == 0)  /* disabled debugging trace */
	    printf ("n %f s %f\n", numerator, sq_err);
	}
      printf (" n = %d sum p*(1-p) = %f "
	      "squared error = %f b = %f\n",
	      (int)n, numerator, sq_err, b);
#if LOG_LOSS
      lambda = (t/n) / ((t/n) + b);
#else
      lambda = (1.0/n) * (numerator / (sq_err + (1.0/n) * numerator));
#endif
      /* Distribute the parent's share LAMBDA through the parent's own
	 mixture weights, shifted down one level.  */
      tn->lambdas[0] = 1.0 - lambda;
      for (i = 1; i < tn->depth + 2; i++)
	tn->lambdas[i] = lambda * tn->parent->lambdas[i-1];
    }
  bow_verbosify (bow_progress,
		 "%20s\n local_lambda=%f parent_lambda=%f\n",
		 tn->name, 1.0 - lambda, lambda);
#if SHRINK_WITH_UNIFORM_ONLY
 do_children:
#endif
  /* Recurse over the children, depth-first.  */
  for (i = 0; i < tn->children_count; i++)
    crossbow_hem_fienberg_treenode (tn->children[i]);
}

/* Train by placing the labeled data in the tree, setting each node's
   shrinkage lambdas with the closed-form estimator above, then printing
   classification accuracy and train/test perplexities.  Finally sweeps
   lambda over fixed constants 0.0..1.0 in 0.05 steps, re-evaluating at
   each, for comparison.  */
void
crossbow_hem_fienberg ()
{
  treenode *iterator, *tn;
  double test_labeled_pp, test_unlabeled_pp;
  double train_labeled_pp, train_unlabeled_pp;
#if PRINT_WORD_DISTS
  /* NOTE(review): PREFIX is also used inside the `#if 0' blocks below;
     enabling those without PRINT_WORD_DISTS would not compile.  */
  char prefix[BOW_MAX_WORD_LENGTH];
#endif
  double lambda;

#if 0
  /* Print the word distribution of all the data, then exit. */
  bow_set_all_docs_untagged (crossbow_docs);
  bow_set_doc_types_of_remaining (crossbow_docs, bow_doc_train);
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 0);
  bow_treenode_set_lambdas_uniform_all (crossbow_root);
  sprintf (prefix, "word-dists/all-mle");
  bow_treenode_print_all_word_probabilities_all (prefix, 0);
  sprintf (prefix, "word-dists/all-uniform");
  bow_treenode_print_all_word_probabilities_all (prefix, 1);
  exit (0);
#endif

#if 0
  /* Initialize the word distributions and LOO entries with the data
     and initialize lambdas to use local estimates only */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  bow_treenode_set_lambdas_leaf_only_all (crossbow_root);
  printf ("\n\nNo Shrinkage\n");
  crossbow_classify_tagged_docs (bow_doc_test, 0, 0, stdout);
#endif

  /* Place the labeled data in the tree and set each node's lambdas by
     the closed-form estimator.  */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  crossbow_hem_fienberg_treenode (crossbow_root);

  /* Print the tree */
  for (iterator = crossbow_root;
       (tn = bow_treenode_iterate_all (&iterator));)
    {
      int ai;
      printf ("%s", tn->name);
      printf (" prior=%g lambdas=[ ", tn->prior);
      for (ai = 0; ai < tn->depth + 2; ai++)
	printf ("%5.3f ", tn->lambdas[ai]);
      printf ("]\n");
    }

  printf ("\n\nFienberg\n");
#if PRINT_WORD_DISTS
  sprintf (prefix, "word-dists/fienberg-%d", bow_random_seed);
  bow_treenode_print_all_word_probabilities_all (prefix, 1);
  sprintf (prefix, "word-dists/map-%d", bow_random_seed);
  bow_treenode_print_all_word_probabilities_all (prefix, 0);
#endif
  /* Report accuracy and perplexity under the estimated lambdas.  */
  crossbow_classify_tagged_docs (bow_doc_test, 0, stdout);
  train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train);
  train_unlabeled_pp=crossbow_hem_unlabeled_perplexity (bow_doc_is_train);
  test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test);
  test_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_test);
  printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n"
	  "test-unlabeled-pp=%f test-labeled-pp=%f\n",
	  train_unlabeled_pp, train_labeled_pp,
	  test_unlabeled_pp, test_labeled_pp);

#if 1
  /* Set lambdas several different constants and test */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 0);
  for (lambda = 0.0; lambda < 1.01; lambda += 0.05)
    {
      printf ("\nFixed local_lambda=%f uniform_lambda=%f\n",
	      1.0 - lambda, lambda);
      /* Give every node weight (1-lambda) on its local estimate,
	 lambda on the uniform, zero on the ancestors.  */
      for (iterator = crossbow_root;
	   (tn = bow_treenode_iterate_all (&iterator));)
	{
	  int ai;
	  for (ai = 0; ai < tn->depth + 2; ai++)
	    {
	      if (ai == 0)
		tn->lambdas[ai] = 1.0 - lambda;
	      else if (ai == tn->depth + 1)
		tn->lambdas[ai] = lambda;
	      else
		tn->lambdas[ai] = 0;
	    }
	}
      crossbow_classify_tagged_docs (bow_doc_test, 0, stdout);
      train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train);
      train_unlabeled_pp = crossbow_hem_unlabeled_perplexity
	(bow_doc_is_train);
      test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test);
      test_unlabeled_pp = crossbow_hem_unlabeled_perplexity
	(bow_doc_is_test);
      printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n"
	      "test-unlabeled-pp=%f test-labeled-pp=%f\n",
	      train_unlabeled_pp, train_labeled_pp,
	      test_unlabeled_pp, test_labeled_pp);
    }
#endif
}

/* Defined elsewhere in crossbow; classifies one document.  */
extern int crossbow_classify_doc (crossbow_doc *doc, int verbose,
				  FILE *out);

/* Method tables registered below.  Field order follows the
   crossbow_method struct (declared elsewhere); from usage here the
   slots appear to be { name, ?, train, cluster, classify } —
   confirm against the crossbow_method declaration.  */
crossbow_method hem_cluster_method =
{
  "hem-cluster",
  NULL,
  NULL,
  crossbow_hem_cluster,
  crossbow_classify_doc,
};

crossbow_method hem_classify_method =
{
  "hem-classify",
  NULL,
  crossbow_hem_full_em,
  NULL,
  crossbow_classify_doc,
};

crossbow_method hem_fienberg_method =
{
  "fienberg-classify",
  NULL,
  crossbow_hem_fienberg,
  NULL,
  crossbow_classify_doc,
};

/* Run automatically before main() (GCC constructor attribute):
   register the three methods above and this module's argp options.  */
void _register_method_hem () __attribute__ ((constructor));
void _register_method_hem ()
{
  bow_method_register_with_name ((bow_method*)&hem_cluster_method,
				 "hem-cluster",
				 sizeof (crossbow_method),
				 NULL);
  bow_method_register_with_name ((bow_method*)&hem_classify_method,
				 "hem-classify",
				 sizeof (crossbow_method),
				 NULL);
  bow_method_register_with_name ((bow_method*)&hem_fienberg_method,
				 "fienberg-classify",
				 sizeof (crossbow_method),
				 NULL);
  bow_argp_add_child (&crossbow_hem_argp_child);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -