
📄 naivebayes.c

📁 Companion code from machine-learning author Tom Mitchell's textbook
💻 C
📖 Page 1 of 3
/* Weight-setting and scoring implementation for Naive-Bayes classification */

/* Copyright (C) 1997, 1998, 1999 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <math.h>
#include <argp/argp.h>

/* Command-line options specific to NaiveBayes */

/* Default value for option "naivebayes-m-est-m".  When zero, use
   size-of-vocabulary instead. */
double naivebayes_argp_m_est_m = 0;

int naivebayes_score_returns_doc_pr;
int naivebayes_score_unsorted;
static int naivebayes_binary_scoring = 0;
static int naivebayes_normalize_log = 0;
static int naivebayes_rescale_scores = 1;
static int naivebayes_final_rescale_scores = 1;
static int naivebayes_return_log_pr = 0;
static int naivebayes_cross_entropy = 0;
double bow_naivebayes_anneal_temperature = 1;

/* icky globals for Good-Turing discounting */
static double **bow_naivebayes_goodturing_discounts = NULL;
static bow_barrel *bow_naivebayes_goodturing_barrel = NULL;

/* icky globals for Dirichlet smoothing */
double *bow_naivebayes_dirichlet_alphas = NULL;
double bow_naivebayes_dirichlet_total = 0;

/* The integer or single char used to represent each command-line option.
   Make sure each is unique across all libbow and rainbow. */
#define NB_M_EST_M_KEY 3001
#define NB_BINARY_SCORE 3002
#define NB_NORMALIZE_LOG 3003

static struct argp_option naivebayes_options[] =
{
  {0, 0, 0, 0,
   "Naive Bayes options, --method=naivebayes:", 30},
  {"naivebayes-m-est-m", NB_M_EST_M_KEY, "M", 0,
   "When using `m'-estimates for smoothing in NaiveBayes, use M as the "
   "value for `m'.  The default is the size of vocabulary."},
  {"naivebayes-binary-scoring", NB_BINARY_SCORE, 0, 0,
   "When using naivebayes, use hacky scoring to get good Precision-Recall "
   "curves."},
  {"naivebayes-normalize-log", NB_NORMALIZE_LOG, 0, 0,
   "When using naivebayes, return -1/log(P(C|d)), normalized to sum to one, "
   "instead of P(C|d).  This results in values that are not so close to "
   "zero and one."},
  {0, 0}
};
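/* Usage sketch (editor's note, not part of libbow): these options are
   registered through the argp child defined below and are consumed by
   front-ends such as rainbow when --method=naivebayes is selected.
   Illustrative invocations, with unrelated arguments elided as "...":

     rainbow --method=naivebayes --naivebayes-m-est-m=1.0 ...
     rainbow --method=naivebayes --naivebayes-normalize-log ...

   Any m > 0 shrinks each per-class word estimate toward the uniform
   prior 1/|V|; per the help text above, leaving m-est-m at its default
   uses m = |V|, which reduces the m-estimate to ordinary Laplace
   (add-one) smoothing -- see bow_naivebayes_pr_wi_ci below. */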
error_t
naivebayes_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case NB_M_EST_M_KEY:
      naivebayes_argp_m_est_m = atof (arg);
      break;
    case NB_BINARY_SCORE:
      naivebayes_binary_scoring = 1;
      break;
    case NB_NORMALIZE_LOG:
      naivebayes_normalize_log = 1;
      naivebayes_rescale_scores = 1;
      naivebayes_final_rescale_scores = 1;
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static const struct argp naivebayes_argp =
{
  naivebayes_options,
  naivebayes_parse_opt
};

static struct argp_child naivebayes_argp_child =
{
  &naivebayes_argp,             /* This child's argp structure */
  0,                            /* flags for child */
  0,                            /* optional header in help message */
  0                             /* arbitrary group number for ordering */
};

/* End of command-line options specific to NaiveBayes */

/* Defined in goodturing.c */
extern int simple_good_turing (int length, int *freq, double *disc);

void
bow_naivebayes_initialize_goodturing (bow_barrel *barrel)
{
  int *counts =
    bow_malloc (sizeof (int) * (bow_smoothing_goodturing_k + 1));
  int len = bow_smoothing_goodturing_k + 1;
  int k;
  int ci;
  int wi;
  int max_wi;
  int dvi;
  bow_dv *dv;
  int zero_count;
  int total_words = 0;

  if (NULL != bow_naivebayes_goodturing_discounts)
    {
      for (k = 0; k < bow_barrel_num_classes (barrel); k++)
        bow_free (bow_naivebayes_goodturing_discounts[k]);
      bow_free (bow_naivebayes_goodturing_discounts);
    }

  bow_naivebayes_goodturing_barrel = barrel;
  bow_naivebayes_goodturing_discounts =
    bow_malloc (sizeof (double *) * bow_barrel_num_classes (barrel));
  for (k = 0; k < bow_barrel_num_classes (barrel); k++)
    {
      bow_naivebayes_goodturing_discounts[k] =
        bow_malloc (sizeof (double) * len);
    }

  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  for (ci = 0; ci < bow_barrel_num_classes (barrel); ci++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (barrel->cdocs, ci);

      total_words = 0;
      for (k = 0; k < len; k++)
        {
          bow_naivebayes_goodturing_discounts[ci][k] = 0.0;
          counts[k] = 0;
        }

      zero_count = barrel->wi2dvf->num_words - cdoc->normalizer;
      counts[0] = zero_count;

      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
          if (!dv)
            continue;

          dvi = 0;
          /* Find the index of entry for this class. */
          while (dvi < dv->length && dv->entry[dvi].di < ci)
            dvi++;

          if (dvi < dv->length && dv->entry[dvi].di == ci)
            {
              /* There is an entry in DV for class CI.
                 Note it if it's in the interesting range. */
              if (dv->entry[dvi].count > 0
                  && dv->entry[dvi].count < len)
                {
                  counts[dv->entry[dvi].count]++;
                  total_words += dv->entry[dvi].count;
                }
            }
        }

      bow_verbosify (bow_progress, "Class %d:\n", ci);
      for (k = 0; k < len; k++)
        {
          bow_verbosify (bow_progress, "(%d %d)", k, counts[k]);
        }
      bow_verbosify (bow_progress, "\n");

      /* Calculate all the discount factors. */
      if (0 != simple_good_turing (len, counts,
                                   &(bow_naivebayes_goodturing_discounts[ci][0])))
        bow_error ("Simple Good-Turing calculation error.");

      /* Distribute the weight of the zero mass evenly. */
      bow_naivebayes_goodturing_discounts[ci][0] =
        bow_naivebayes_goodturing_discounts[ci][0] * total_words /
        (cdoc->word_count * zero_count);

      for (k = 0; k < len; k++)
        {
          bow_verbosify (bow_progress, "(%d %f)", k,
                         bow_naivebayes_goodturing_discounts[ci][k]);
        }
      bow_verbosify (bow_progress, "\n");
    }
}
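/* Illustrative sketch only (editor's note, not part of libbow): given the
   per-class table computed above, a Good-Turing smoothed estimate scales a
   small raw count by its discount factor before normalizing, and a word
   never seen in the class gets the redistributed zero-mass stored at
   index 0.  How the scoring code on the later pages actually consumes the
   table is an assumption here; this sketch only shows the intended shape
   of simple Good-Turing discounting. */
static double
example_goodturing_pr (int ci, int raw_count, double class_word_count)
{
  double *disc = bow_naivebayes_goodturing_discounts[ci];

  if (raw_count == 0)
    /* Index 0 already folds in total_words / (word_count * zero_count). */
    return disc[0];
  if (raw_count <= bow_smoothing_goodturing_k)
    /* Small counts are unreliable; shrink them by the estimated discount. */
    return (disc[raw_count] * raw_count) / class_word_count;
  /* Large counts are trusted as-is (maximum-likelihood estimate). */
  return raw_count / class_word_count;
}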
/* Load up the alphas. */
void
bow_naivebayes_load_dirichlet_alphas ()
{
  int max_wi = bow_num_words ();
  FILE *fp;
  float x;
  char s[256];
  int wi;

  if (bow_naivebayes_dirichlet_alphas)
    bow_free (bow_naivebayes_dirichlet_alphas);

  bow_naivebayes_dirichlet_alphas = bow_malloc (sizeof (double) * max_wi);

  for (wi = 0; wi < max_wi; wi++)
    bow_naivebayes_dirichlet_alphas[wi] = 0.0;

  fp = fopen (bow_smoothing_dirichlet_filename, "r");

  while (fscanf (fp, "%f %s", &x, s) == 2)
    {
      wi = bow_word2int (s);

      assert (wi != -1);
      bow_naivebayes_dirichlet_alphas[wi] = x * bow_smoothing_dirichlet_weight;
    }

  fclose (fp);
}

void
bow_naivebayes_initialize_dirichlet_smoothing (bow_barrel *barrel)
{
  int max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  int wi;

  bow_naivebayes_dirichlet_total = 0;

  /* Make sure all the alphas are > 0 and calculate the sum. */
  for (wi = 0; wi < max_wi; wi++)
    {
      bow_dv *dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      if (dv)
        {
          bow_naivebayes_dirichlet_total += bow_naivebayes_dirichlet_alphas[wi];
          assert (bow_naivebayes_dirichlet_alphas[wi] > 0);
        }
    }
}
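/* Illustrative sketch only (editor's note, not part of libbow): with the
   alphas loaded and summed above, Dirichlet smoothing estimates P(w|c)
   as the posterior mean of a multinomial under a Dirichlet prior:

     P(w|c) = (N(w,c) + alpha_w) / (N(c) + sum over w' of alpha_w')

   where N(w,c) is the count of word w in class c and N(c) is the class
   word total.  The parameter names below are assumptions for
   illustration, mirroring those used in bow_naivebayes_pr_wi_ci. */
static double
example_dirichlet_pr (int wi, double num_wi_ci, double num_w_ci)
{
  return ((num_wi_ci + bow_naivebayes_dirichlet_alphas[wi])
          / (num_w_ci + bow_naivebayes_dirichlet_total));
}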
/* Return the probability of word WI in class CI.

   If LOO_CLASS is non-negative, then we are doing
   leave-one-document-out evaluation.  LOO_CLASS is the index
   of the class from which the document has been removed.
   LOO_WI_COUNT is the number of WI'th words that are in the document.
   LOO_W_COUNT is the total number of words in the document.

   The last two arguments help this function avoid searching for
   the right entry in the DV from the beginning each time.
   LAST_DV is a pointer to the DV to use.
   LAST_DVI is a pointer to the index into the LAST_DV that is
   guaranteed to have class index less than CI. */
double
bow_naivebayes_pr_wi_ci (bow_barrel *barrel,
                         int wi, int ci,
                         int loo_class,
                         float loo_wi_count, float loo_w_count,
                         bow_dv **last_dv, int *last_dvi)
{
  bow_dv *dv;
  bow_cdoc *cdoc;
  double num_wi_ci;             /* the number of times WI occurs in the class */
  double num_w_ci;              /* the number of words in the class */
  int dvi;
  double m_est_m;
  double m_est_p;
  double pr_w_c;

  cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
  if (last_dv && *last_dv)
    {
      dv = *last_dv;
      dvi = *last_dvi;
      /* No, not always true: assert (dv->entry[dvi].di <= ci); */
    }
  else
    {
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      dvi = 0;
      if (last_dv)
        *last_dv = dv;
    }

  /* If the model doesn't know about this word, return -1. */
  if (!dv)
    return -1.0;

  /* Find the index of entry for this class. */
  while (dvi < dv->length && dv->entry[dvi].di < ci)
    dvi++;
  /* Remember this index value for future calls to this function. */
  if (last_dvi)
    *last_dvi = dvi;

  if (dvi < dv->length && dv->entry[dvi].di == ci)
    {
      /* There is an entry in DV for class CI. */
      num_wi_ci = dv->entry[dvi].weight;
    }
  else
    {
      /* There is no entry in DV for class CI. */
      num_wi_ci = 0;
      if (loo_class == ci)
        bow_error ("There should be data for WI,CI");
    }
  num_w_ci = cdoc->word_count;

#if 0
  fprintf (stdout, "count-%-25s %f\n",
           bow_int2word (wi), num_wi_ci);
#endif

  if (loo_class == ci)
    {
      /* Subtract the held-out document's counts for leave-one-out. */
      num_wi_ci -= loo_wi_count;
      num_w_ci -= loo_w_count;
      if (!(num_wi_ci >= 0 && num_w_ci >= 0))
        bow_error ("Negative count after leave-one-out removal: %g %g\n",
                   num_wi_ci, num_w_ci);
    }

  if (bow_event_model == bow_event_document)
    {
      /* This corresponds to adding two training pseudo-data points:
         one that has all features, and one that has no features. */
      pr_w_c = ((num_wi_ci + 1)
                / (num_w_ci + 2));
    }
  else if (bow_smoothing_method == bow_smoothing_laplace
           || bow_smoothing_method == bow_smoothing_mestimate)
    {
      /* xxx This is not exactly right, because
         BARREL->WI2DVF->NUM_WORDS might have changed with the
         removal of QUERY_WV's document. */
      if (/* naivebayes_argp_m_est_m == 0
             || */ bow_smoothing_method == bow_smoothing_laplace)
        m_est_m = barrel->wi2dvf->num_words;
      else
        m_est_m = naivebayes_argp_m_est_m;
      m_est_p = 1.0 / barrel->wi2dvf->num_words;
      pr_w_c = ((num_wi_ci + m_est_m * m_est_p)
                / (num_w_ci + m_est_m));
    }
  else if (bow_smoothing_method == bow_smoothing_wittenbell)
    {
      /* Here CDOC->NORMALIZER is the number of unique terms in the class. */
      if (num_wi_ci > 0)
        pr_w_c =
          (num_wi_ci / (num_w_ci + cdoc->normalizer));
      else
        {
          if (cdoc->word_count)
            /* There is training data for this class. */
            pr_w_c =
              (cdoc->normalizer
               / ((num_w_ci + cdoc->normalizer)
                  * (barrel->wi2dvf->num_words - cdoc->normalizer)));
          else
            /* There is no training data for this class. */
            pr_w_c = 1.0 / barrel->wi2dvf->num_words;
        }
    }
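The listing breaks off here, mid-function; this is page 1 of 3, and bow_naivebayes_pr_wi_ci presumably continues with the remaining smoothing branches and scoring code on the later pages. As a quick sanity check on the branches that are visible, here is a small self-contained program, independent of libbow, that recomputes the Laplace and Witten-Bell estimates on made-up counts. Every number in it is illustrative.

#include <stdio.h>

/* Toy recomputation of the estimates from bow_naivebayes_pr_wi_ci, using
   made-up counts: a class with 1000 words total (num_w_ci), of which the
   query word occurs 3 times (num_wi_ci), a vocabulary of 50000 words, and
   400 distinct terms in the class (CDOC->NORMALIZER above). */
int
main (void)
{
  double num_wi_ci = 3, num_w_ci = 1000;
  double vocab = 50000, uniq = 400;
  double m = vocab;                     /* Laplace: m = |V|, p = 1/|V| */

  double laplace = (num_wi_ci + m * (1.0 / vocab)) / (num_w_ci + m);
  double wb_seen = num_wi_ci / (num_w_ci + uniq);   /* Witten-Bell, count > 0 */
  double wb_unseen = uniq / ((num_w_ci + uniq) * (vocab - uniq));

  printf ("laplace   = %g\n", laplace);    /* (3 + 1) / 51000    ~ 7.8e-05 */
  printf ("wb seen   = %g\n", wb_seen);    /* 3 / 1400           ~ 2.1e-03 */
  printf ("wb unseen = %g\n", wb_unseen);  /* 400 / (1400*49600) ~ 5.8e-06 */
  return 0;
}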
