
📄 maxent.c

📁 Code accompanying machine-learning author Tom Mitchell's book
💻 C
📖 Page 1 of 4
/* Weight-setting and scoring implementation for Maximum Entropy classification */

/* Copyright (C) 1997, 1998, 1999 Andrew McCallum

   Written by:  Kamal Nigam <knigam@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <math.h>
#include <argp/argp.h>

typedef struct _maxent_coefficient {
  int power;
  double coeff;
} maxent_coefficient;

typedef struct _maxent_polynomial {
  int length;
  int size;
  maxent_coefficient entry[0];
} maxent_polynomial;

/* whether to add a gaussian prior for each lambda value */
static int maxent_gaussian_prior = 0;

/* the variance of each gaussian prior */
static float maxent_prior_variance = 0.01;

/* whether to enforce zero constraints when using a gaussian prior */
static int maxent_gaussian_prior_zero_constraints = 1;

/* if == 0, then constant variance.  if == 1 then K*log(1+N(w,c)).
   if == 2 then K*N(w,c) */
static int maxent_prior_vary_by_count = 0;

static int maxent_scoring_hack = 0;

/* option to add one to each empirical count when calculating constraint */
static int maxent_smooth_counts = 0;

static int maxent_logprob_constraints = 0;

/* internal variable for scoring */
static int bow_maxent_model_building = 0;

/* the function to use to identify documents for checking accuracy */
static int (*maxent_accuracy_docs)(bow_cdoc *) = NULL;
static int (*maxent_logprob_docs)(bow_cdoc *) = NULL;
static int (*maxent_halt_accuracy_docs)(bow_cdoc *) = NULL;

/* which documents to use for maxent iterations */
static int (*maxent_iteration_docs)(bow_cdoc *) = bow_cdoc_is_train;

/* the number of iterations of iterative scaling to perform */
static int maxent_num_iterations = 40;

/* the number of top words by mutual info per class to use as features.
   0 means all */
static int maxent_words_per_class = 0;

/* the minimum count for a word/class feature */
static int maxent_prune_features_by_count = 0;

/* whether or not to use unlabeled docs in setting the constraints */
static int maxent_constraint_use_unlabeled = 0;

#if 0
static int maxent_print_constraints = 1;
static int maxent_print_lambdas = 1;
#endif

enum {
  MAXENT_PRINT_ACCURACY = 6000,
  MAXENT_ITERATIONS,
  MAXENT_WORDS_PER_CLASS,
  MAXENT_HALT_BY_LOGPROB,
  MAXENT_HALT_BY_ACCURACY,
  MAXENT_LOGPROB_CONSTRAINTS,
  MAXENT_SMOOTH_COUNTS,
  MAXENT_SCORING_HACK,
  MAXENT_GAUSSIAN_PRIOR,
  MAXENT_PRIOR_VARIANCE,
  MAXENT_PRUNE_FEATURES_BY_COUNT,
  MAXENT_GAUSSIAN_PRIOR_ZERO_CONSTRAINTS,
  MAXENT_PRIOR_VARY_BY_COUNT,
  MAXENT_PRIOR_VARY_BY_COUNT_LINEAR,
  MAXENT_ITERATION_DOCS,
  MAXENT_CONSTRAINT_DOCS,
};

static struct argp_option maxent_options[] =
{
  {0, 0, 0, 0,
   "Maximum Entropy options, --method=maxent:", 55},
  {"maxent-print-accuracy", MAXENT_PRINT_ACCURACY, "TYPE", 0,
   "When running maximum entropy, print the accuracy of documents at each round.  "
   "TYPE is the type of document to measure accuracy on.  See "
   "`--em-halt-using-perplexity' for choices for TYPE"},
  {"maxent-halt-by-logprob", MAXENT_HALT_BY_LOGPROB, "TYPE", 0,
   "When running maxent, halt iterations using the logprob of documents.  "
   "TYPE is the type of documents to test.  See `--em-halt-using-perplexity' "
   "for choices for TYPE"},
  {"maxent-halt-by-accuracy", MAXENT_HALT_BY_ACCURACY, "TYPE", 0,
   "When running maxent, halt iterations using the accuracy of documents.  "
   "TYPE is the type of documents to test.  See `--em-halt-using-perplexity' "
   "for choices for TYPE"},
  {"maxent-iterations", MAXENT_ITERATIONS, "NUM", 0,
   "The number of iterative scaling iterations to perform.  The default is 40."},
  {"maxent-keep-features-by-mi", MAXENT_WORDS_PER_CLASS, "NUM", 0,
   "The number of top words by mutual information per class to use as features.  "
   "Zero implies no pruning and is the default."},
  {"maxent-logprob-constraints", MAXENT_LOGPROB_CONSTRAINTS, 0, 0,
   "Set constraints to be the log prob of the word."},
  {"maxent-smooth-counts", MAXENT_SMOOTH_COUNTS, 0, 0,
   "Add 1 to the count of each word/class pair when calculating the constraint values."},
  {"maxent-scoring-hack", MAXENT_SCORING_HACK, 0, 0,
   "Use smoothed naive Bayes probability for zero-occurring word/class pairs during scoring"},
  {"maxent-gaussian-prior", MAXENT_GAUSSIAN_PRIOR, 0, 0,
   "Add a Gaussian prior to each word/class feature constraint."},
  {"maxent-prior-variance", MAXENT_PRIOR_VARIANCE, "NUM", 0,
   "The variance to use for the Gaussian prior.  The default is 0.01."},
  {"maxent-vary-prior-by-count", MAXENT_PRIOR_VARY_BY_COUNT, 0, 0,
   "Multiply log (1 + N(w,c)) times the variance when using a Gaussian prior."},
  {"maxent-gaussian-prior-no-zero-constraints", MAXENT_GAUSSIAN_PRIOR_ZERO_CONSTRAINTS, 0, 0,
   "When using a Gaussian prior, do not enforce constraints that have no "
   "training data."},
  {"maxent-prune-features-by-count", MAXENT_PRUNE_FEATURES_BY_COUNT, "NUM", 0,
   "Prune the word/class feature set, keeping only those features that have "
   "at least NUM occurrences in the training set."},
  {"maxent-vary-prior-by-count-linearly", MAXENT_PRIOR_VARY_BY_COUNT_LINEAR, 0, 0,
   "Multiply N(w,c) times the variance when using a Gaussian prior."},
  {"maxent-iteration-docs", MAXENT_ITERATION_DOCS, "TYPE", 0,
   "The types of documents to use for maxent iterations.  The default is train.  "
   "TYPE is the type of documents to test.  See `--em-halt-using-perplexity' "
   "for choices for TYPE"},
  {"maxent-constraint-docs", MAXENT_CONSTRAINT_DOCS, "TYPE", 0,
   "The documents to use for setting the constraints.  The default is train.  "
   "The other choice is trainandunlabeled."},
  {0, 0}
};

error_t
maxent_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case MAXENT_ITERATION_DOCS:
      if (!strcmp (arg, "train"))
	maxent_iteration_docs = bow_cdoc_is_train;
      else if (!strcmp (arg, "unlabeled"))
	maxent_iteration_docs = bow_cdoc_is_unlabeled;
      else if (!strcmp (arg, "trainandunlabeled"))
	maxent_iteration_docs = bow_cdoc_is_train_or_unlabeled;
      else
	bow_error ("Unknown document type for --maxent-iteration-docs");
      break;
    case MAXENT_CONSTRAINT_DOCS:
      if (!strcmp (arg, "train"))
	maxent_constraint_use_unlabeled = 0;
      else if (!strcmp (arg, "trainandunlabeled"))
	maxent_constraint_use_unlabeled = 1;
      else
	bow_error ("Unknown document type for --maxent-constraint-docs");
      break;
    case MAXENT_PRINT_ACCURACY:
      if (!strcmp (arg, "validation"))
	maxent_accuracy_docs = bow_cdoc_is_validation;
      else if (!strcmp (arg, "train"))
	maxent_accuracy_docs = bow_cdoc_is_train;
      else if (!strcmp (arg, "unlabeled"))
	maxent_accuracy_docs = bow_cdoc_is_unlabeled;
      else if (!strcmp (arg, "test"))
	maxent_accuracy_docs = bow_cdoc_is_test;
      else
	bow_error ("Unknown document type for --maxent-print-accuracy");
      break;
    case MAXENT_HALT_BY_LOGPROB:
      if (!strcmp (arg, "validation"))
	maxent_logprob_docs = bow_cdoc_is_validation;
      else if (!strcmp (arg, "train"))
	maxent_logprob_docs = bow_cdoc_is_train;
      else if (!strcmp (arg, "unlabeled"))
	maxent_logprob_docs = bow_cdoc_is_unlabeled;
      else if (!strcmp (arg, "test"))
	maxent_logprob_docs = bow_cdoc_is_test;
      else
	bow_error ("Unknown document type for --maxent-halt-by-logprob");
      break;
    case MAXENT_HALT_BY_ACCURACY:
      if (!strcmp (arg, "validation"))
	maxent_halt_accuracy_docs = bow_cdoc_is_validation;
      else if (!strcmp (arg, "train"))
	maxent_halt_accuracy_docs = bow_cdoc_is_train;
      else if (!strcmp (arg, "unlabeled"))
	maxent_halt_accuracy_docs = bow_cdoc_is_unlabeled;
      else if (!strcmp (arg, "test"))
	maxent_halt_accuracy_docs = bow_cdoc_is_test;
      else
	bow_error ("Unknown document type for --maxent-halt-by-accuracy");
      break;
    case MAXENT_ITERATIONS:
      maxent_num_iterations = atoi (arg);
      break;
    case MAXENT_WORDS_PER_CLASS:
      maxent_words_per_class = atoi (arg);
      break;
    case MAXENT_LOGPROB_CONSTRAINTS:
      maxent_logprob_constraints = 1;
      break;
    case MAXENT_SMOOTH_COUNTS:
      maxent_smooth_counts = 1;
      break;
    case MAXENT_SCORING_HACK:
      maxent_scoring_hack = 1;
      break;
    case MAXENT_GAUSSIAN_PRIOR:
      maxent_gaussian_prior = 1;
      break;
    case MAXENT_PRIOR_VARIANCE:
      maxent_prior_variance = atof (arg);
      assert (maxent_prior_variance > 0);
      break;
    case MAXENT_PRUNE_FEATURES_BY_COUNT:
      maxent_prune_features_by_count = atoi (arg);
      break;
    case MAXENT_GAUSSIAN_PRIOR_ZERO_CONSTRAINTS:
      maxent_gaussian_prior_zero_constraints = 0;
      break;
    case MAXENT_PRIOR_VARY_BY_COUNT:
      maxent_prior_vary_by_count = 1;
      break;
    case MAXENT_PRIOR_VARY_BY_COUNT_LINEAR:
      maxent_prior_vary_by_count = 2;
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static const struct argp maxent_argp =
{
  maxent_options,
  maxent_parse_opt
};

static struct argp_child maxent_argp_child =
{
  &maxent_argp,			/* This child's argp structure */
  0,				/* flags for child */
  0,				/* optional header in help message */
  0				/* arbitrary group number for ordering */
};

/* alter the class barrel and excise dv entries that have count less
   than min_count */
void
maxent_prune_features_by_occurrence_count (bow_barrel *barrel, int min_count)
{
  int wi;
  int max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());

  /* delete dv entries and dvs (if necessary) for word/class pairs
     with less than min_count occurrences */
  for (wi = 0; wi < max_wi; wi++)
    {
      bow_dv *dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      int new_dvi = 0;
      int old_dvi = 0;

      if (!dv)
	continue;

      for (old_dvi = 0; old_dvi < dv->length; old_dvi++)
	if (dv->entry[old_dvi].count >= min_count)
	  {
	    dv->entry[new_dvi].count = dv->entry[old_dvi].count;
	    dv->entry[new_dvi].weight = dv->entry[old_dvi].weight;
	    new_dvi++;
	  }

      /* adjust the length of the dv, or remove it if empty */
      if (new_dvi == 0)
	bow_wi2dvf_hide_wi (barrel->wi2dvf, wi);
      else
	dv->length = new_dvi;
    }
}

/* alter the class barrel and excise dv's and dv entries that are not
   in the top num_per_class words per class */
void
maxent_prune_vocab_by_mutual_information (bow_barrel *barrel, int num_per_class)
{
  int ci;
  int wi;
  long total_words = 0;
  int max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  int max_ci = bow_barrel_num_classes (barrel);
  struct wiig { int wi; float ig; } **mis;
  int wpi;

  int wiig_compare (const void *wiig1, const void *wiig2)
    {
      if (((struct wiig *) wiig1)->ig > ((struct wiig *) wiig2)->ig)
	return -1;
      else if (((struct wiig *) wiig1)->ig == ((struct wiig *) wiig2)->ig)
	return 0;
      else
	return 1;
    }

  /* do this on a class barrel */

  /* malloc and initialize the double array of mutual infos */
  mis = bow_malloc (sizeof (struct wiig *) * max_ci);
  for (ci = 0; ci < max_ci; ci++)
    mis[ci] = bow_malloc (sizeof (struct wiig) * max_wi);

  for (ci = 0; ci < max_ci; ci++)
    for (wi = 0; wi < max_wi; wi++)
      {
	mis[ci][wi].wi = wi;
	mis[ci][wi].ig = 0;
      }

  /* first calculate total_words */
  for (ci = 0; ci < max_ci; ci++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (barrel->cdocs, ci);
      total_words += cdoc->word_count;
    }

  /* calculate the mutual informations */
  for (wi = 0; wi < max_wi; wi++)
    {
      bow_dv *dv;
      int dvi;
      int local_total = 0;

      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      if (!dv)
	continue;

      for (dvi = 0; dvi < dv->length; dvi++)
	local_total += dv->entry[dvi].count;

      for (dvi = 0; dvi < dv->length; dvi++)
	{
	  bow_cdoc *cdoc = bow_array_entry_at_index (barrel->cdocs,
						     dv->entry[dvi].di);

	  mis[dv->entry[dvi].di][wi].ig = fabs ((double) dv->entry[dvi].count *
	    log ((double) dv->entry[dvi].count * (double) total_words /
		 (double) cdoc->word_count / (double) local_total));
	}
    }

  /* ok, now sort each class to bring the best infogains to the top */
  for (ci = 0; ci < max_ci; ci++)
    qsort (mis[ci], max_wi, sizeof (struct wiig), wiig_compare);

  /* Check that we're not saving bogus words */
  for (ci = 0; ci < max_ci; ci++)
    assert (mis[ci][num_per_class - 1].ig > 0);

  for (ci = 0; ci < max_ci; ci++)
    {
      bow_verbosify (bow_progress, "\n%s\n", bow_barrel_classname_at_index (barrel, ci));
      for (wi = 0; wi < 10; wi++)
	bow_verbosify (bow_progress, "%20.10f %s\n", mis[ci][wi].ig,
		       bow_int2word (mis[ci][wi].wi));
    }

  /* now edit the class barrel, knocking out word/class pairs where appropriate */
  /* first set all word counts to be 1 */
  for (wi = 0; wi < max_wi; wi++)
    {
      bow_dv *dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      int dvi;

      if (!dv)
	continue;

      for (dvi = 0; dvi < dv->length; dvi++)
	dv->entry[dvi].count = 1;
    }

  /* now set counts to be 2 where we're keeping the word/class pair */
  for (ci = 0; ci < max_ci; ci++)
    for (wpi = 0; wpi < num_per_class; wpi++)
      {
	bow_dv *dv = bow_wi2dvf_dv (barrel->wi2dvf, mis[ci][wpi].wi);
	int dvi;

	assert (dv);

	/* find the class pair for this word */
	for (dvi = 0; dvi < dv->length && ci > dv->entry[dvi].di; dvi++);
	assert (dvi < dv->length && ci == dv->entry[dvi].di);
	dv->entry[dvi].count = 2;
      }

  /* now delete dv entries and dvs (if necessary) for word/class pairs with 1 */
  for (wi = 0; wi < max_wi; wi++)
    {
      bow_dv *dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
      int new_dvi = 0;
      int old_dvi = 0;

      if (!dv)
	continue;

      for (old_dvi = 0; old_dvi < dv->length; old_dvi++)
	if (dv->entry[old_dvi].count == 2)
	  {
	    dv->entry[new_dvi].count = 2;
	    dv->entry[new_dvi].weight = dv->entry[old_dvi].weight;
	    new_dvi++;
	  }

      /* adjust the length of the dv, or remove it if empty */
      if (new_dvi == 0)
