⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hem.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 4 页
字号:
		  /* The M-step: deposit this word's expected count into
		     LEAF and each of its ancestors, weighted by the
		     E-step membership probabilities. */
		  for (ancestor = leaf, ai = 0; ancestor; 
		       ancestor = ancestor->parent, ai++)
		    {
		      /* With "vertical word movement" the deposit is also
			 weighted by the ancestor's membership; otherwise
			 every node on the root path receives the same
			 leaf-weighted count. */
		      if (crossbow_hem_vertical_word_movement)
			word_deposit = wv->entry[wvi].count
			  * leaf_membership[li] * ancestor_membership[ai];
		      else
			word_deposit = wv->entry[wvi].count
			  * leaf_membership[li];
/* Disabled experiment: give unlabeled documents one third the weight. */
#define UNLABELED_WEIGHT_REDUCED 0
#if UNLABELED_WEIGHT_REDUCED
		      if (doc->tag == bow_doc_unlabeled)
			word_deposit /= 3;
#endif
		      assert (word_deposit >= 0);
		      /* Word counts are accumulated only from
			 non-validation documents; validation documents
			 are reserved for estimating the lambdas below. */
		      if (!crossbow_hem_lambdas_from_validation
			  || doc->tag != bow_doc_validation)
			{
			  if (crossbow_hem_loo)
			    bow_treenode_add_new_loo_for_di_wvi
			      (ancestor, word_deposit, di, wvi,
			       wv->num_entries, crossbow_docs->length);
			  ancestor->new_words[wv->entry[wvi].wi] += 
			    word_deposit;
			}
		      if (ancestor_membership[ai] == 0) 
			continue;
		      /* Shrinkage mixture weights (lambdas) are estimated
			 from the validation set when that option is on. */
		      lambda_deposit = wv->entry[wvi].count
			* leaf_membership[li] * ancestor_membership[ai];
		      assert (lambda_deposit >= 0);
		      if (!crossbow_hem_lambdas_from_validation
			  || doc->tag == bow_doc_validation)
			leaf->new_lambdas[ai] += lambda_deposit;
		    }
		  /* The uniform distribution.  After the loop above, AI
		     indexes the extra mixture component past the root. */
		  if (!crossbow_hem_lambdas_from_validation
		      || doc->tag == bow_doc_validation)
		    leaf->new_lambdas[ai] += 
		      wv->entry[wvi].count
		      * leaf_membership[li] * ancestor_membership[ai];
		} /* if crossbow_hem_shrinkage */
	      else 
		{
		  /* The M-step without shrinkage, without ancestor
		     membership probabilities. */
		  leaf->new_words[wv->entry[wvi].wi] += 
		    wv->entry[wvi].count * leaf_membership[li];
		  leaf->new_lambdas[0]++;
		}
	      assert (leaf->new_words[wv->entry[wvi].wi] >= 0);
	      /* A double compares unequal to itself only when it is NaN;
		 this self-comparison is a NaN check. */
	      assert (leaf->new_words[wv->entry[wvi].wi]
		      == leaf->new_words[wv->entry[wvi].wi]);
	    }
	  leaf->new_prior += leaf_membership[li];
	}
    }

  /* Finish M-step: fold the NEW_* accumulators into the live model
     parameters at every node. */
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  for (iterator = crossbow_root;
       (leaf = bow_treenode_iterate_all (&iterator));)
    {
      /* NOTE(review): the integer flag passed to these helpers differs
	 with shrinkage; it appears to select smoothing/normalization
	 behavior -- confirm against the bow_treenode_* definitions. */
      if (crossbow_hem_shrinkage)
	{
	  bow_treenode_set_words_from_new_words (leaf, 0);
	  bow_treenode_set_lambdas_from_new_lambdas (leaf, 1);
	}
      else
	{
	  bow_treenode_set_words_from_new_words (leaf, 1);
	  bow_treenode_set_lambdas_from_new_lambdas (leaf, 0);
	}
    }
  /* Perplexity: exponential of the average per-word negative
     log-likelihood. */
  pp = exp (-log_prob_of_data / num_data_words);
  bow_verbosify (bow_progress, "EM incorporated %d documents; pp=%g\n", 
		 docs_added_count, pp);
  /* Return the perplexity */
  return pp;
}

/* Try to split each parent-of-leaves ("grandparent") node into
   CROSSBOW_HEM_BRANCHING_FACTOR hypothesized grandchildren.
   Returns non-zero iff at least one split was made. */
int
crossbow_hem_consider_splitting ()
{
  int grandparents_count;
  treenode *tn, *iterator, **grandparents;
  int ci;
  int num_leaves;
  int did_split = 0;

  /* Make an array of grandparents, then try splitting them.
     If you just iterate through the tree, then iteration gets messed
     up by the creation of new grandchildren. */
  num_leaves = bow_treenode_leaf_count (crossbow_root);
  /* There can be no more leaf-parents than leaves, so NUM_LEAVES
     entries are sufficient. */
  grandparents = bow_malloc (num_leaves * sizeof (void*));
  grandparents_count = 0;
  for (iterator = crossbow_root; 
       (tn = bow_treenode_iterate_all (&iterator));)
    {
      if (bow_treenode_is_leaf_parent (tn))
	grandparents[grandparents_count++] = tn;
    }
  for (ci = 0; ci < grandparents_count; ci++)
    did_split |=
      crossbow_hem_hypothesize_grandchildren (grandparents[ci], 
	crossbow_hem_branching_factor);
#if 0
  /* Debugging dump of each node's prior and word distribution. */
  printf (".........................................................\n");
  for (iterator = crossbow_root; 
       (tn = bow_treenode_iterate_all (&iterator));)
    {
      printf ("%s %g\n", tn->name, tn->prior);
      if (tn->children_count == 0)
	{
	  bow_treenode_word_probs_print (tn, 5);
	  printf ("\n");
	  bow_treenode_word_leaf_likelihood_ratios_print (tn, 5);
	  //bow_treenode_word_likelihood_ratios_print (tn, 10);
	}
    }
#endif
  bow_free (grandparents);
  return did_split;
}

/* Cluster the train/unlabeled documents into a hierarchy using EM
   with deterministic annealing: at each temperature run EM until the
   perplexity stops changing, consider splitting leaves, perturb the
   leaf word distributions, then cool. */
void 
crossbow_hem_cluster ()
{
  int di;
  crossbow_doc *doc;
  double pp, old_pp, test_pp;
  treenode *iterator, *tn;
  FILE *classify_fp;
  int iteration;
  char buf[1024];

  bow_random_set_seed();
  bow_treenode_set_lambdas_uniform (crossbow_root);
  
  /* initialize all data to be at the root */
  for (di = 0; di < crossbow_docs->length; di++)
    {
      int wvi;
      bow_wv *wv = crossbow_wv_at_di (di);
      
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled)
	continue;
      
      for (wvi = 0; wvi < wv->num_entries; wvi++)
	{ 
	  crossbow_root->new_words[wv->entry[wvi].wi] += 
	    wv->entry[wvi].count;
	  if (crossbow_hem_loo)
	    bow_treenode_add_new_loo_for_di_wvi
	      (crossbow_root, wv->entry[wvi].count, di, wvi,
	       wv->num_entries, crossbow_docs->length);
	}
    }
  crossbow_root->new_prior = 1.0;
  
  /* NOTE(review): the second argument here looks like an additive
     smoothing amount (one count spread over the vocabulary) --
     confirm against bow_treenode_set_words_from_new_words_all. */
  //bow_treenode_set_new_words_from_perturbed_words_all (crossbow_root);
  bow_treenode_set_words_from_new_words_all (crossbow_root, 
					 1.0 / crossbow_root->words_capacity);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1.0);

  /* Initialize children of the root */
  if (crossbow_root->children_count == 0)
    crossbow_hem_create_children_for_node (crossbow_root, 
					   crossbow_hem_branching_factor);
  
  /* CROSSBOW_HEM_TEMPERATURE already set */
  iteration = 0;
  for ( ; crossbow_hem_temperature >= crossbow_hem_temperature_end;
	crossbow_hem_temperature *= crossbow_hem_temperature_decay)
    {
      bow_verbosify (bow_progress, "TEMPERATURE = %g\n",
		     crossbow_hem_temperature);
      printf ("TEMPERATURE = %g\n", crossbow_hem_temperature);
      /* Always Add hypothesis children here. */
      /* Run EM to convergence. */
      /* Seed OLD_PP and PP far apart so the loop runs at least once. */
      old_pp = FLT_MAX;
      pp = old_pp / 2;
      /* Loop until convergence, i.e. perplexity changes by <= 2 */
      while (ABS (old_pp - pp) > 2
	     && iteration < crossbow_hem_max_num_iterations)
	{
	  printf ("--------------------------------------------------"
		  " Iteration %d\n", iteration);
	  old_pp = pp;
	  pp = crossbow_hem_em_one_iteration ();
	  iteration++;
	  test_pp = crossbow_hem_perplexity (bow_doc_is_test);
	  printf ("train-pp=%f test-pp=%f \n", pp, test_pp);
	  /* Per-iteration diagnostic dump of every tree node. */
	  for (iterator = crossbow_root; 
	       (tn = bow_treenode_iterate_all (&iterator));)
	    {
	      printf ("%s", tn->name);
	      if (tn->children_count == 0)
		{
		  int ai, ci;
		  printf (" prior=%g lambdas=[ ", tn->prior);
		  /* depth+2 lambdas: one per ancestor on the root path
		     plus the uniform-distribution component. */
		  for (ai = 0; ai < tn->depth + 2; ai++)
		    printf ("%5.3f ", tn->lambdas[ai]);
		  printf ("]");
		  /* Disabled: per-class membership printout. */
		  if (0 && crossbow_classes_count > 1)
		    {
		      printf ("\n  classes=[ ");
		      for (ci = 0; ci < crossbow_classes_count; ci++)
			printf ("%5.3f ", tn->classes[ci]);
		      printf ("]");
		    }
		}
	      else
		printf (" KL %g WKL %g", 
			bow_treenode_children_kl_div (tn),
			bow_treenode_children_weighted_kl_div (tn));
	      printf ("\n");
	      /* `1 ||` makes this print for internal nodes too. */
	      if (1 || tn->children_count == 0)
		{
		  //bow_treenode_word_likelihood_ratios_print (tn, 10);
		  bow_treenode_word_probs_print (tn, 10);
		  printf ("\n");
		  bow_treenode_word_likelihood_ratios_print (tn, 5);
		  //bow_treenode_word_leaf_likelihood_ratios_print (tn, 5);
		  //bow_treenode_word_leaf_odds_ratios_print (tn, 10);
		}
	    }
	  //crossbow_leaf_document_probs_print (3);
	}
      /* Consider making splits. */
      /* xxx This function should delete leaves that didn't become "real". */
      if (crossbow_hem_consider_splitting ())
	{
	  /* xxx But leaves were just perturbed!!! */
	  /* Output document classifications */
	  sprintf (buf, "crossbow-classifications-%d", iteration);
	  classify_fp = bow_fopen (buf, "w");
	  crossbow_classify_tagged_docs (-1, 1, classify_fp);
	  fflush (classify_fp);
	  fclose (classify_fp);
	  /* Output top words */
	  sprintf (buf, "crossbow-words-%d", iteration);
	  classify_fp = bow_fopen (buf, "w");
	  bow_verbosify (bow_progress, "========= keywords ========\n");
	  bow_treenode_keywords_print_all (crossbow_root, classify_fp);
	  fflush (classify_fp);
	  fclose (classify_fp);
	}
      /* Perturb the leaves */
      bow_treenode_set_new_words_from_perturbed_words_all (crossbow_root, 0.1);
      bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
    }
}

/* Put all documents into the NEW_WORDS distributions.  Each training
   document's counts go to the leaf whose name matches the document's
   filename, and to every ancestor of that leaf. */
void
crossbow_hem_place_labeled_data ()
{
  int di, wvi;
  crossbow_doc *doc;
  treenode *leaf;
  bow_wv *wv;

  /* Clear all previous information. */
  bow_treenode_set_new_words_to_zero_all (crossbow_root);
  bow_treenode_free_loo_all (crossbow_root, crossbow_docs->length);

  /* Initialize the word distributions and LOO entries with the data
     and initialize lambdas to uniform */
  for (di = 0; di < crossbow_docs->length; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      wv = crossbow_wv_at_di (di);
      if (doc->tag != bow_doc_train)
	continue;
      /* Temporary fix: skip documents flagged by filename suffix. */
      if (strstr (doc->filename, ".include")
	  || strstr (doc->filename, ".exclude"))
	continue;
      /* The document's filename names its labeled leaf in the tree. */
      leaf = bow_treenode_descendant_matching_name (crossbow_root, 
						    doc->filename);
      //assert (leaf->children_count == 0);
      leaf->new_prior++;
      /* Walk from the leaf up to the root, depositing the counts and
	 leave-one-out bookkeeping at every node on the path. */
      while (leaf)
	{
	  for (wvi = 0; wvi < wv->num_entries; wvi++)
	    {
	      leaf->new_words[wv->entry[wvi].wi] += wv->entry[wvi].count;
	      bow_treenode_add_new_loo_for_di_wvi
		(leaf, wv->entry[wvi].count, di, wvi,
		 wv->num_entries, crossbow_docs->length);
	    }
	  leaf = leaf->parent;
	}
    }
}

/* Do full EM, without deterministic annealing or leaf splitting. */
void
crossbow_hem_full_em ()
{
  double pp, old_pp;
  double test_labeled_pp, test_unlabeled_pp;
  double train_labeled_pp, train_unlabeled_pp;
  treenode *iterator, *tn;
  int iteration = 0;
  int old_hem_shrinkage;
#if PRINT_WORD_DISTS
  char prefix[BOW_MAX_WORD_LENGTH];
#endif

  //assert (crossbow_hem_shrinkage);
  //assert (crossbow_hem_loo);

  if (crossbow_hem_garbage_collection)
    {
      /* Add "Misc" children to each parent in the tree */
      bow_treenode_add_misc_child_all (crossbow_root);
    }

  /* If CROSSBOW_HEM_LAMBDAS_FROM_VALIDATION is non-zero, then change
     X percent of the train and unlabeled documents to validation. */
  if (crossbow_hem_lambdas_from_validation)
    {
      int di;
      crossbow_doc *doc;
      int validation_count = 0;
      /* Each unlabeled document is moved to the validation set with
	 probability CROSSBOW_HEM_LAMBDAS_FROM_VALIDATION. */
      for (di = 0; di < crossbow_docs->length; di++)
	{
	  doc = bow_array_entry_at_index (crossbow_docs, di);
	  if ((/*doc->tag == bow_doc_train ||*/ doc->tag == bow_doc_unlabeled)
	      && (bow_random_double (0.0, 1.0)
		  < crossbow_hem_lambdas_from_validation))
	    {
	      doc->tag = bow_doc_validation;
	      validation_count++;
	    }
	}
      bow_verbosify (bow_progress, "Placed %d document in validation set\n",
		     validation_count);
    }

  /* Initialize the word distributions and LOO entries with the data
     and initialize lambdas to uniform */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  bow_treenode_set_lambdas_leaf_only_all (crossbow_root);

  /* Baseline classification with shrinkage temporarily turned off. */
  printf ("No Shrinkage\n");
  old_hem_shrinkage = crossbow_hem_shrinkage;
  crossbow_hem_shrinkage = 0;
  crossbow_classify_tagged_docs (bow_doc_test, 0, stdout);
  crossbow_hem_shrinkage = old_hem_shrinkage;
  
  /* Report initial perplexities on train and test splits. */
  train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train);
  train_unlabeled_pp=crossbow_hem_unlabeled_perplexity (bow_doc_is_train);
  test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test);
  test_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_test);
  printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n"
	  " test-unlabeled-pp=%f  test-labeled-pp=%f\n", 
	  train_unlabeled_pp, train_labeled_pp,
	  test_unlabeled_pp, test_labeled_pp);
  if (crossbow_hem_vertical_word_movement)
    bow_treenode_word_probs_print_all (crossbow_root, 5);

  /* Re-place the labeled data before starting EM proper. */
  crossbow_hem_place_labeled_data ();
  if (crossbow_hem_shrinkage)
    bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
  else
    bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -