em.c — from the text classification system developed by Andrew McCallum's group at Carnegie Mellon University (C source).
/* Weight-setting and scoring implementation for EM classification */

/* Copyright (C) 1997, 1998, 1999 Andrew McCallum

   Written by:  Kamal Nigam <knigam@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <math.h>
#include <argp/argp.h>
#include <stdlib.h>
#include <bow/em.h>
#include <bow/naivebayes.h>

/* EM-specific types */

/* A specification for how to convert naive Bayes scores into probabilities */
typedef enum
{
  simple,			/* 1 or 0 based on winning class */
  nb_score			/* score directly from naivebayes */
} bow_em_stat_method;

/* A specification for how to use the unlabeled data when setting the EM
   starting point */
typedef enum
{
  em_start_zero,   /* unlabeled docs have no effect on starting point */
  em_start_even,   /* unlabeled docs distributed evenly */
  em_start_prior,  /* unlabeled docs distributed according to labeled prior */
  em_start_random  /* unlabeled docs distributed randomly */
} bow_em_unlabeled_start_method;

/* A specification for how to use the unlabeled data when setting the EM
   starting point for a multi-hump negative class */
typedef enum {
  bow_em_init_spiked,  /* distribute each doc to one class */
  bow_em_init_spread   /* distribute each doc across classes */
} bow_em_multi_hump_init_method;

/* Some forward declarations */
void
bow_em_print_word_distribution (bow_barrel *vpc_barrel, int em_runs,
				int num_classes);
double em_calculate_perplexity (bow_barrel *doc_barrel,
				bow_barrel *class_barrel);
float em_calculate_accuracy (bow_barrel *doc_barrel, bow_barrel *class_barrel);
void bow_em_set_weights (bow_barrel *barrel);

/* Global variables */

/* Hack for the binary scoring method */
static int bow_em_making_barrel = 0;

/* Hack for scoring during the perplexity calculation */
int bow_em_calculating_perplexity = 0;

/* ci of the binary positive class */
static int binary_pos_ci = -1;

/* Command-line options specific to EM.  See em_options for documentation. */
static char *em_binary_pos_classname = NULL;
static char *em_binary_neg_classname = NULL;
static int em_compare_to_nb = 0;
static bow_em_stat_method em_stat_method = nb_score;
int bow_em_num_em_runs = 7;
static int bow_em_print_probs = 0;
static int bow_em_print_word_vector = 0;
static int bow_em_binary_case = 0;
static float unlabeled_normalizer = 1.0;
static int bow_em_multi_hump_neg = 0;
bow_em_perturb_method bow_em_perturb_starting_point = 0;
int em_cross_entropy = 0;
static int em_anneal = 0;
static float em_temperature = 200;
static float em_temp_reduction = 0.9;
static bow_em_unlabeled_start_method em_unlabeled_start = em_start_zero;
static bow_em_multi_hump_init_method em_multi_hump_init = bow_em_init_spread;
static int em_halt_using_perplexity = 0;
static int (*em_perplexity_docs)(bow_cdoc *) = NULL;
static int em_perplexity_loo = 0;
static int bow_em_anneal_normalizer = 0;
static int em_halt_using_accuracy = 0;
static int (*em_accuracy_docs)(bow_cdoc *) = NULL;
static int em_accuracy_loo = 0;
static int em_labeled_for_start_only = 0;
static int em_set_vocab_from_unlabeled = 0;

/* The integer or single char used to represent this command-line option.
   Make sure it is unique across all libbow and rainbow.
*/
enum {
  EM_COMPARE_TO_NB = 2222,
  EM_STAT_METHOD,
  EM_NUM_RUNS,
  EM_PRINT_PROBS,
  EM_BINARY_POS_CLASS,
  EM_BINARY_NEG_CLASS,
  EM_PRINT_TOP_WORDS,
  EM_BINARY,
  EM_UNLABELED_NORMALIZER,
  EM_MULTI_HUMP,
  EM_PERTURB_STARTING_POINT,
  EM_NO_PERTURB,
  EM_CROSSENTROPY,
  EM_ANNEAL,
  EM_TEMPERATURE,
  EM_TEMP_REDUCE,
  EM_UNLABELED_START,
  EM_MULTI_HUMP_INIT,
  EM_HALT_USING_PERPLEXITY,
  EM_ANNEAL_NORMALIZER,
  EM_PRINT_PERPLEXITY,
  EM_HALT_USING_ACCURACY,
  EM_PRINT_ACCURACY,
  EM_LABELED_FOR_START_ONLY,
  EM_SET_VOCAB_FROM_UNLABELED
};

static struct argp_option em_options[] =
{
  {0, 0, 0, 0,
   "EM options:", 60},
  {"em-compare-to-nb", EM_COMPARE_TO_NB, 0, 0,
   "When building an EM class barrel, show doc stats for the naivebayes "
   "barrel equivalent.  Only use in conjunction with --test."},
  {"em-stat-method", EM_STAT_METHOD, "STAT", 0,
   "The method to convert scores to probabilities.  "
   "The default is `nb_score'."},
  {"em-num-iterations", EM_NUM_RUNS, "NUM", 0,
   "Number of EM iterations to run when building the model."},
  {"em-save-probs", EM_PRINT_PROBS, 0, 0,
   "On each EM iteration, save all P(C|w) to a file."},
  {"em-binary-pos-classname", EM_BINARY_POS_CLASS, "CLASS", 0,
   "Specify the name of the positive class if building a binary classifier."},
  {"em-binary-neg-classname", EM_BINARY_NEG_CLASS, "CLASS", 0,
   "Specify the name of the negative class if building a binary classifier."},
  {"em-print-top-words", EM_PRINT_TOP_WORDS, 0, 0,
   "Print the top 10 words per class for each EM iteration."},
  {"em-binary", EM_BINARY, 0, 0,
   "Do special tricks for the binary case."},
  {"em-unlabeled-normalizer", EM_UNLABELED_NORMALIZER, "NUM", 0,
   "Number of unlabeled docs it takes to equal a labeled doc.  "
   "Defaults to one."},
  {"em-multi-hump-neg", EM_MULTI_HUMP, "NUM", 0,
   "Use NUM center negative classes.  Only use in the binary case.  "
   "Must be using scoring method nb_score."},
  {"em-perturb-starting-point", EM_PERTURB_STARTING_POINT, "TYPE", 0,
   "Instead of starting EM with P(w|c) from the labeled training data, "
   "start from values that are randomly sampled from the multinomial "
   "specified by the labeled training data.  TYPE specifies what "
   "distribution to use for the perturbation; choices are `gaussian', "
   "`dirichlet', and `none'.  Default is `none'."},
  {"em-crossentropy", EM_CROSSENTROPY, 0, 0,
   "Use crossentropy instead of naivebayes for scoring."},
  {"em-anneal", EM_ANNEAL, 0, 0,
   "Use deterministic annealing EM."},
  {"em-temperature", EM_TEMPERATURE, "NUM", 0,
   "Initial temperature for deterministic annealing.  Default is 200."},
  {"em-temp-reduce", EM_TEMP_REDUCE, "NUM", 0,
   "Temperature reduction factor for deterministic annealing.  "
   "Default is 0.9."},
  {"em-unlabeled-start", EM_UNLABELED_START, "TYPE", 0,
   "When initializing the EM starting point, how the unlabeled docs "
   "contribute.  Default is `zero'.  Other choices are `prior', `random' "
   "and `even'."},
  {"em-multi-hump-init", EM_MULTI_HUMP_INIT, "METHOD", 0,
   "When initializing mixture components, how to assign component probs "
   "to documents.  Default is `spread'.  The other choice is `spiked'."},
  {"em-halt-using-perplexity", EM_HALT_USING_PERPLEXITY, "TYPE", 0,
   "When running EM, halt when perplexity plateaus.  TYPE is the type of "
   "document to measure perplexity on.  Choices are `validation', `train', "
   "`test', `unlabeled', `trainandunlabeled' and `trainandunlabeledloo'."},
  {"em-anneal-normalizer", EM_ANNEAL_NORMALIZER, 0, 0,
   "When running EM, do deterministic annealing-ish stuff with the "
   "unlabeled normalizer."},
  {"em-print-perplexity", EM_PRINT_PERPLEXITY, "TYPE", 0,
   "When running EM, print the perplexity of documents at each round.  "
   "TYPE is the type of document to measure perplexity on.  See "
   "`--em-halt-using-perplexity' for choices for TYPE."},
  {"em-halt-using-accuracy", EM_HALT_USING_ACCURACY, "TYPE", 0,
   "When running EM, halt when accuracy plateaus.  TYPE is the type of "
   "document to measure accuracy on.  Choices are `validation', `train', "
   "`test' and `trainloo'."},
  {"em-print-accuracy", EM_PRINT_ACCURACY, "TYPE", 0,
   "When running EM, print the accuracy of documents at each round.  "
   "TYPE is the type of document to measure accuracy on.  See "
   "`--em-halt-using-accuracy' for choices for TYPE."},
  {"em-labeled-for-start-only", EM_LABELED_FOR_START_ONLY, 0, 0,
   "Use the labeled documents to set the starting point for EM, but "
   "ignore them during the iterations."},
  {"em-set-vocab-from-unlabeled", EM_SET_VOCAB_FROM_UNLABELED, 0, 0,
   "Remove words from the vocabulary not used in the unlabeled data."},
  {0, 0}
};

error_t
em_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case EM_COMPARE_TO_NB:
      em_compare_to_nb = 1;
      break;
    case EM_STAT_METHOD:
      if (!strcmp (arg, "nb_score"))
	em_stat_method = nb_score;
      else if (!strcmp (arg, "simple"))
	em_stat_method = simple;
      else
	bow_error ("Invalid argument for --em-stat-method");
      break;
    case EM_NUM_RUNS:
      bow_em_num_em_runs = atoi (arg);
      break;
    case EM_PRINT_PROBS:
      bow_em_print_probs = 1;
      break;
    case EM_BINARY_POS_CLASS:
      em_binary_pos_classname = arg;
      break;
    case EM_BINARY_NEG_CLASS:
      em_binary_neg_classname = arg;
      break;
    case EM_PRINT_TOP_WORDS:
      bow_em_print_word_vector = 1;
      break;
    case EM_BINARY:
      bow_em_binary_case = 1;
      break;
    case EM_UNLABELED_NORMALIZER:
      unlabeled_normalizer = 1.0 / atoi (arg);
      break;
    case EM_MULTI_HUMP:
      bow_em_multi_hump_neg = atoi (arg);
      break;
    case EM_PERTURB_STARTING_POINT:
      if (!strcmp (arg, "none"))
	bow_em_perturb_starting_point = bow_em_perturb_none;
      else
if (!strcmp (arg, "gaussian"))
	bow_em_perturb_starting_point = bow_em_perturb_with_gaussian;
      else if (!strcmp (arg, "dirichlet"))
	bow_em_perturb_starting_point = bow_em_perturb_with_dirichlet;
      else
	bow_error ("Bad arg to --em-perturb-starting-point");
      break;
    case EM_CROSSENTROPY:
      em_cross_entropy = 1;
      break;
    case EM_ANNEAL:
      em_anneal = 1;
      break;
    case EM_TEMPERATURE:
      em_temperature = atoi (arg);
      break;
    case EM_TEMP_REDUCE:
      em_temp_reduction = atof (arg);
      break;
    case EM_UNLABELED_START:
      if (!strcmp (arg, "zero"))
	em_unlabeled_start = em_start_zero;
      else if (!strcmp (arg, "prior"))
	em_unlabeled_start = em_start_prior;
      else if (!strcmp (arg, "even"))
	em_unlabeled_start = em_start_even;
      else if (!strcmp (arg, "random"))
	em_unlabeled_start = em_start_random;
      else
	bow_error ("Bad arg to --em-unlabeled-start");
      break;
    case EM_MULTI_HUMP_INIT:
      if (!strcmp (arg, "spread"))
	em_multi_hump_init = bow_em_init_spread;
      else if (!strcmp (arg, "spiked"))
	em_multi_hump_init = bow_em_init_spiked;
      else
	bow_error ("Bad arg to --em-multi-hump-init");
      break;
    case EM_HALT_USING_PERPLEXITY:
      em_halt_using_perplexity = 1;
      /* Intentional fall-through; no break here */
    case EM_PRINT_PERPLEXITY:
      if (!strcmp (arg, "validation"))
	em_perplexity_docs = bow_cdoc_is_validation;
      else if (!strcmp (arg, "train"))
	em_perplexity_docs = bow_cdoc_is_train;
      else if (!strcmp (arg, "unlabeled"))
	em_perplexity_docs = bow_cdoc_is_unlabeled;
      else if (!strcmp (arg, "test"))
	em_perplexity_docs = bow_cdoc_is_test;
      else if (!strcmp (arg, "trainandunlabeled"))
	em_perplexity_docs = bow_cdoc_is_train_or_unlabeled;
      else if (!strcmp (arg, "trainandunlabeledloo"))
	{
	  em_perplexity_docs = bow_cdoc_is_train_or_unlabeled;
	  em_perplexity_loo = 1;
	}
      else
	bow_error ("Unknown document type for --em-halt-using-perplexity");
      break;
    case EM_HALT_USING_ACCURACY:
      em_halt_using_accuracy = 1;
      /* Intentional fall-through; no break here */
    case EM_PRINT_ACCURACY:
      if (!strcmp (arg, "validation"))
	em_accuracy_docs = bow_cdoc_is_validation;
      else if (!strcmp (arg, "train"))
	em_accuracy_docs = bow_cdoc_is_train;
      else if (!strcmp (arg, "test"))
	em_accuracy_docs = bow_cdoc_is_test;
      else if (!strcmp (arg, "trainloo"))
	{
	  em_accuracy_docs = bow_cdoc_is_train;
	  em_accuracy_loo = 1;
	}
      else
	bow_error ("Unknown document type for --em-halt-using-accuracy");
      break;
    case EM_ANNEAL_NORMALIZER:
      bow_em_anneal_normalizer = 1;
      unlabeled_normalizer = 0;
      break;
    case EM_LABELED_FOR_START_ONLY:
      em_labeled_for_start_only = 1;
      break;
    case EM_SET_VOCAB_FROM_UNLABELED:
      em_set_vocab_from_unlabeled = 1;
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static const struct argp em_argp =
{
  em_options,
  em_parse_opt
};

static struct argp_child em_argp_child =
{
  &em_argp,		/* This child's argp structure */
  0,			/* flags for child */
  0,			/* optional header in help message */
  0			/* arbitrary group number for ordering */
};

/* End of command-line options specific to EM */

/* Return 1 for all docs to be tested by EM during the E-step when
   doing a multi-hump negative class */
int
bow_cdoc_is_multi_hump_doc (bow_cdoc *cdoc)
{
  return ((cdoc->type == bow_doc_unlabeled) ||
	  (cdoc->type == bow_doc_train && cdoc->class != binary_pos_ci));
}

/* Given a fully-specified file path name (all the way from `/'),
   return just the last filename part of it.
*/
static inline const char *
filename_to_classname (const char *filename)
{
  const char *ret;

  ret = strrchr (filename, '/');
  if (ret)
    return ret + 1;
  return filename;
}

int
bow_em_pr_struct_compare (const void *x, const void *y)
{
  if (((bow_em_pr_struct *) x)->score > ((bow_em_pr_struct *) y)->score)
    return -1;
  else if (((bow_em_pr_struct *) x)->score == ((bow_em_pr_struct *) y)->score)
    return 0;
  else
    return 1;
}

/* Return a random number sampled from a gaussian with MEAN and VARIANCE. */
/* From "Numerical Recipes in C", page 289. */
double
bow_em_gaussian (double mean, double variance)
{
  static int iset = 0;
  static double gset;
  double fac, rsq, v1, v2;
  double gaussian_zero_one;	/* random gaussian with mean=0, variance=1 */

  bow_random_set_seed ();

  if (iset == 0)
    {
      do
	{
	  v1 = 2.0 * bow_random_double (0.0, 1.0) - 1.0;
	  v2 = 2.0 * bow_random_double (0.0, 1.0) - 1.0;
	  rsq = v1 * v1 + v2 * v2;
	}
      while (rsq >= 1.0 || rsq == 0.0);
      fac = sqrt (-2.0 * log (rsq) / rsq);
      gset = v1 * fac;
      iset = 1;
      gaussian_zero_one = v2 * fac;
    }
  else
    {
      iset = 0;
      gaussian_zero_one = gset;
    }
  return gaussian_zero_one * sqrt (variance) + mean;
}

/* From "Numerical Recipes in C", page 292 */

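`bow_em_pr_struct_compare` is written as a `qsort`-style comparator that sorts scores in descending order (higher score first). A sketch of how such a comparator is wired up, using a hypothetical `pr_item` struct in place of `bow_em_pr_struct` (whose real definition lives in `bow/em.h`):

```c
#include <stdlib.h>
#include <assert.h>

/* Hypothetical stand-in for bow_em_pr_struct; only score matters here */
typedef struct
{
  double score;
  int correct;
} pr_item;

/* Same shape as bow_em_pr_struct_compare: return negative when x should
   sort before y, so larger scores come first */
static int
pr_compare (const void *x, const void *y)
{
  double sx = ((const pr_item *) x)->score;
  double sy = ((const pr_item *) y)->score;

  if (sx > sy)
    return -1;
  else if (sx == sy)
    return 0;
  else
    return 1;
}

/* Sort an array of items best-score-first */
void
pr_sort_descending (pr_item *items, size_t n)
{
  qsort (items, n, sizeof (pr_item), pr_compare);
}
```

Returning -1/0/1 from explicit comparisons (rather than subtracting the scores) avoids the overflow and truncation pitfalls of arithmetic comparators, which is presumably why the original takes this form.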