⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hem.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 4 页
字号:
/* hem.c - Hierarchical Expectation Maximization
   Copyright (C) 1998, 1999 Andrew McCallum
   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <argp.h>
#include <bow/crossbow.h>

extern void crossbow_leaf_document_probs_print (int num_to_print);
extern void crossbow_classify_tagged_docs (int tag, int verbose,
					   FILE *out);

#define SHRINK_WITH_UNIFORM_ONLY 0
#define PRINT_WORD_DISTS 0
#define MN 0
#if MN
extern double crossbow_hem_em_one_mn_iteration ();
#endif

/* Tunable parameters, settable from the command line (see
   crossbow_hem_options[] below for the user-facing descriptions). */
static int crossbow_hem_branching_factor = 2;
static double crossbow_hem_temperature = 100;
static double crossbow_hem_temperature_end = 1;
static int crossbow_hem_max_num_iterations = 9999999;
static double crossbow_hem_temperature_decay = 0.9;
static double crossbow_hem_em_acceleration = 1.0;
static double crossbow_hem_split_kl_threshold = 0.4;
static int crossbow_hem_maximum_depth = 6;
static double crossbow_hem_lambdas_from_validation = 0.0;
/* Doing statistical garbage collection? */
static int crossbow_hem_garbage_collection = 0;
/* Doing incremental labeling, ala co-training? */
static int crossbow_hem_incremental_labeling = 0;
/* Are the documents already labeled to belong to one leaf? */
int crossbow_hem_deterministic_horizontal = 0;
int crossbow_hem_restricted_horizontal = 0;
/* Doing "full-EM"?, meaning that vertical word distributions are
   changed by EM.  Note that speech recognition's traditional
   "deleted interpolation" only uses EM to set the lambdas. */
int crossbow_hem_vertical_word_movement = 1;
/* Doing shrinkage */
int crossbow_hem_shrinkage = 1;
/* Using shrinkage, but with fixed weights.  Don't learn them by EM.
   Only active if crossbow_hem_shrinkage = 1 */
int crossbow_hem_fixed_shrinkage = 0;
/* Doing Leave-One-Out */
int crossbow_hem_loo = 1;
/* The class tag is part of the generative model, and should be used
   in the E-step to estimate class membership, and the M-step should
   update the class distribution in each leaf. */
int crossbow_hem_generates_class = 1;
/* If non-zero, then after the initial E-step, change all labeled
   documents to unlabeled. */
int crossbow_hem_pseudo_labeled = 0;

/* Command-line setting routines */

enum {
  BRANCHING_FACTOR_KEY = 17000,
  TEMPERATURE_START_KEY,
  TEMPERATURE_END_KEY,
  TEMPERATURE_DECAY_KEY,
  EM_ACCELERATION_KEY,
  SPLIT_KL_THRESHOLD_KEY,
  MAXIMUM_DEPTH_KEY,
  NO_VERTICAL_WORD_MOVEMENT_KEY,
  NO_SHRINKAGE_KEY,
  NO_LOO_KEY,
  DETERMINISTIC_HORIZONTAL_KEY,
  RESTRICTED_HORIZONTAL_KEY,
  PSEUDO_LABELED_KEY,
  GARBAGE_COLLECTION_KEY,
  MAX_NUM_ITERATIONS_KEY,
  LAMBDAS_FROM_VALIDATION_KEY,
  INCREMENTAL_LABELING_KEY,
};

static struct argp_option crossbow_hem_options[] =
{
  {0, 0, 0, 0,
   "Hierarchical EM Clustering options:", 101},
  {"hem-branching-factor", BRANCHING_FACTOR_KEY, "NUM", 0,
   "Number of clusters to create.  Default is 2."},
  {"hem-temperature-start", TEMPERATURE_START_KEY, "NUM", 0,
   "The initial value of T."},
  {"hem-temperature-end", TEMPERATURE_END_KEY, "NUM", 0,
   "The final value of T.  Default is 1."},
  {"hem-max-num-iterations", MAX_NUM_ITERATIONS_KEY, "NUM", 0,
   "Do no more iterations of EM than this."},
  {"hem-temperature-decay", TEMPERATURE_DECAY_KEY, "NUM", 0,
   "Temperature decay factor.  Default is 0.9."},
  /* NOTE(fix): the original string used the non-standard `\e' escape
     ("\eta"); replaced with plain "eta". */
  {"hem-em-acceleration", EM_ACCELERATION_KEY, "NUM", OPTION_HIDDEN,
   "Accelerated EM eta factor.  1 is plain EM.  Can safely go "
   "as high as 2.0.  1.8 is a good value.  Default is 1."},
  /* NOTE(fix): help text said "Default is 0.2" but the actual default
     (crossbow_hem_split_kl_threshold above) is 0.4. */
  {"hem-split-kl-threshold", SPLIT_KL_THRESHOLD_KEY, "NUM", 0,
   "KL divergence value at which tree leaves will be split. "
   "Default is 0.4"},
  {"hem-maximum-depth", MAXIMUM_DEPTH_KEY, "NUM", 0,
   "The hierarchy depth beyond which it will not split.  Default is 6."},
  {"hem-no-vertical-word-movement", NO_VERTICAL_WORD_MOVEMENT_KEY, 0, 0,
   "Use EM just to set the vertical priors, not to set the vertical "
   "word distribution; i.e. do not do `full-EM'."},
  {"hem-no-shrinkage", NO_SHRINKAGE_KEY, 0, 0,
   "Use only the clusters at the leaves; do not do anything with the "
   "hierarchy."},
  {"hem-no-loo", NO_LOO_KEY, 0, 0,
   "Do not use leave-one-out evaluation during the E-step."},
  {"hem-deterministic-horizontal", DETERMINISTIC_HORIZONTAL_KEY, 0, 0,
   "In the horizontal E-step for a document, set to zero the membership "
   "probabilities of all leaves, except the one matching the document's "
   "filename"},
  {"hem-restricted-horizontal", RESTRICTED_HORIZONTAL_KEY, 0, 0,
   "In the horizontal E-step for a document, set to zero the membership "
   "probabilities of all leaves whose names are not found in the document's "
   "filename"},
  {"hem-pseudo-labeled", PSEUDO_LABELED_KEY, 0, 0,
   "After using the labels to set the starting point for EM, change all "
   "training documents to unlabeled, so that they can have their class "
   "labels re-assigned by EM.  Useful for imperfectly labeled training "
   "data."},
  {"hem-garbage-collection", GARBAGE_COLLECTION_KEY, 0, 0,
   "Add extra /Misc/ children to every internal node of the hierarchy, "
   "and keep their local word distributions flat"},
  {"hem-lambdas-from-validation", LAMBDAS_FROM_VALIDATION_KEY, "NUM", 0,
   "Instead of setting the lambdas from the labeled/unlabeled data "
   "(possibly with LOO), instead set the lambdas using held-out "
   "validation data.  0<NUM<1 is the fraction of unlabeled documents "
   "just before EM training of the classifier begins.  Default is 0, "
   "which leaves this option off."},
  {"hem-incremental-labeling", INCREMENTAL_LABELING_KEY, 0, 0,
   "Instead of using all unlabeled documents in the M-step, use only "
   "the labeled documents, and incrementally label those unlabeled documents "
   "that are most confidently classified in the E-step"},
  {0, 0}
};

/* argp option callback: store each recognized command-line option into
   the corresponding file-scope setting.  Returns ARGP_ERR_UNKNOWN for
   keys this module does not own. */
error_t
crossbow_hem_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case BRANCHING_FACTOR_KEY:
      crossbow_hem_branching_factor = atoi (arg);
      break;
    case TEMPERATURE_START_KEY:
      crossbow_hem_temperature = atof (arg);
      break;
    case TEMPERATURE_END_KEY:
      crossbow_hem_temperature_end = atof (arg);
      break;
    case TEMPERATURE_DECAY_KEY:
      crossbow_hem_temperature_decay = atof (arg);
      break;
    case EM_ACCELERATION_KEY:
      crossbow_hem_em_acceleration = atof (arg);
      break;
    case SPLIT_KL_THRESHOLD_KEY:
      crossbow_hem_split_kl_threshold = atof (arg);
      break;
    case MAXIMUM_DEPTH_KEY:
      crossbow_hem_maximum_depth = atoi (arg);
      break;
    case NO_VERTICAL_WORD_MOVEMENT_KEY:
      crossbow_hem_vertical_word_movement = 0;
      break;
    case NO_SHRINKAGE_KEY:
      crossbow_hem_shrinkage = 0;
      break;
    case NO_LOO_KEY:
      crossbow_hem_loo = 0;
      break;
    case RESTRICTED_HORIZONTAL_KEY:
      crossbow_hem_restricted_horizontal = 1;
      break;
    case DETERMINISTIC_HORIZONTAL_KEY:
      crossbow_hem_deterministic_horizontal = 1;
      break;
    case PSEUDO_LABELED_KEY:
      crossbow_hem_pseudo_labeled = 1;
      break;
    case GARBAGE_COLLECTION_KEY:
      crossbow_hem_garbage_collection = 1;
      break;
    case MAX_NUM_ITERATIONS_KEY:
      crossbow_hem_max_num_iterations = atoi (arg);
      break;
    case LAMBDAS_FROM_VALIDATION_KEY:
      crossbow_hem_lambdas_from_validation = atof (arg);
      break;
    case INCREMENTAL_LABELING_KEY:
      crossbow_hem_incremental_labeling = 1;
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static const struct argp crossbow_hem_argp =
{
  crossbow_hem_options,
  crossbow_hem_parse_opt
};

static struct argp_child crossbow_hem_argp_child =
{
  &crossbow_hem_argp,	/* This child's argp structure */
  0,			/* flags for child */
  0,			/* optional header in help message */
  0			/* arbitrary group number for ordering */
};

/* create num_children children for the leaf node tn */
void
crossbow_hem_create_children_for_node (treenode *tn, int num_children)
{
  int ci;
  treenode *child;
  int ai;
  int wi;

  assert (tn->children_count == 0);
  for (ci = 0; ci < num_children; ci++)
    {
      child = bow_treenode_new (tn, num_children, NULL);
      if (!crossbow_hem_shrinkage)
	{
	  /* if no shrinkage, set the lambdas all at the leaf */
	  child->new_lambdas[0] = 1.0;
	  for (ai = 1; ai < child->depth + 2; ai++)
	    child->new_lambdas[ai] = 0.0;
	  bow_treenode_set_lambdas_from_new_lambdas (child, 0);
	}
      else
	{
	  /* set the children close to parent, sharing their lambdas */
	  child->new_lambdas[0] = tn->lambdas[0]/2;
	  child->new_lambdas[1] = tn->lambdas[0]/2;
	  for (ai = 2; ai < child->depth + 2; ai++)
	    child->new_lambdas[ai] =
	      tn->lambdas[ai-1];
	  bow_treenode_set_lambdas_from_new_lambdas (child, 0);
	}
      /* make each word distribution like parent's, but perturbed */
      for (wi = 0; wi < tn->words_capacity; wi++)
	child->words[wi] = tn->words[wi];
      /* xxx But we're going to perturb them again in hem_cluster!!! */
      bow_treenode_set_new_words_from_perturbed_words (child, 0.1);
      /* split the prior of the parent amongst the children */
      child->prior = tn->prior / num_children;
      bow_treenode_set_words_from_new_words (child, 0);
    }
  /* zero out the prior of the parent now that it's not a leaf */
  tn->prior = 0.0;
}

/* Return non-zero if a split happens */
int
crossbow_hem_hypothesize_grandchildren (treenode *tn, int num_children)
{
  int ci;
  double kldiv;

  /* The number of words of training data in the children of TN */
  assert (bow_treenode_is_leaf_parent (tn));
  kldiv = bow_treenode_children_kl_div (tn);
  if (kldiv > crossbow_hem_split_kl_threshold
      && tn->depth < crossbow_hem_maximum_depth)
    {
      printf ("Splitting children of node %s\n", tn->name);
      /* Create and attach grandchildren, and copy perturbed word
         distribution. */
      for (ci = 0; ci < tn->children_count; ci++)
	{
	  crossbow_hem_create_children_for_node (tn->children[ci],
						 num_children);
	}
      return 1;
    }
  return 0;
}

/* Return the perplexity of the data in documents for which the
   function USE_DOC_P returns non-zero. */
double
crossbow_hem_perplexity (int (*use_doc_p)(bow_doc*))
{
  int di;
  crossbow_doc *doc;
  bow_wv *wv;
  treenode *iterator, *leaf;
  int li;			/* a leaf index */
  int num_leaves;
  double *leaf_membership;
  double *leaf_data_prob;
  double log_prob_of_data = 0;
  int num_data_words = 0;	/* the number of word occurrences */

  num_leaves = bow_treenode_leaf_count (crossbow_root);
  leaf_membership = alloca (num_leaves * sizeof (double));
  leaf_data_prob = alloca (num_leaves * sizeof (double));
  for (di = 0; di < crossbow_docs->length; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (! (*use_doc_p)((bow_doc*)doc))
	continue;
      /* E-step estimating leaf membership probability for one
         document, with annealing temperature. */
      wv = crossbow_wv_at_di (di);
      for (iterator = crossbow_root, li = 0;
	   (leaf = bow_treenode_iterate_leaves (&iterator));
	   li++)
	{
	  if (crossbow_hem_shrinkage)
	    leaf_data_prob[li] = bow_treenode_log_prob_of_wv (leaf, wv);
	  else
	    leaf_data_prob[li] = bow_treenode_log_local_prob_of_wv (leaf, wv);
	  leaf_membership[li] = (log (leaf->prior)
				 + (leaf_data_prob[li]
				    / crossbow_hem_temperature));
	}
      crossbow_convert_log_probs_to_probs (leaf_membership, num_leaves);
      /* For perplexity calculation */
      for (iterator = crossbow_root, li = 0;
	   (leaf = bow_treenode_iterate_leaves (&iterator));
	   li++)
	{
	  /* xxx Should this be with bow_treenode_complete_log_prob_of_wv()? */
	  log_prob_of_data += (leaf_membership[li] * leaf_data_prob[li]);
	  /* Guard against NaN creeping into the sum (NaN != NaN). */
	  assert (log_prob_of_data == log_prob_of_data);
	}
      num_data_words += bow_wv_word_count (wv);
    }
  /* Return the perplexity */
  if (num_data_words)
    return exp (-log_prob_of_data / num_data_words);
  return 0;
}

/* Return the perplexity of the data (perplexity (without knowledge of
   the class label, P(D|theta)) in documents for which the function
   USE_DOC_P returns non-zero. */
double
crossbow_hem_unlabeled_perplexity (int (*use_doc_p)(bow_doc*))
{
  int di;
  crossbow_doc *doc;
  bow_wv *wv;
  treenode *iterator, *leaf;
  int li;			/* a leaf index */
  int num_leaves;
  double leaf_data_log_prob;
  double leaf_pp;
  double max_leaf_pp;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -