📄 rainbow.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
  fprintf (out_fp, "#0\n");  for (ci = 0; ci < bow_barrel_num_classes (rainbow_doc_barrel); ci++)    {      /* Build a string containing the name of this directory. */      bow_cdoc *class_cdoc;      strcpy (dir, test_dirname);      class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);      strcat (dir, "/");      strcat (dir, filename_to_classname (class_cdoc->filename));      assert (strlen (dir) < dirlen);      /* Remember which classname this comes from, so, above, we know	 the correct class */      current_ci = class_cdoc->class;      current_class = bow_barrel_classname_at_index (rainbow_doc_barrel, ci);      /* Test each document in that diretory. */#if HAVE_HDB      if (bow_hdb)	bow_map_filenames_from_hdb (test_hdb_file, 0, dir, "");      else#endif	bow_map_filenames_from_dir (test_file, 0, dir, "");    }}voidbow_print_log_odds_ratio (FILE *fp, bow_barrel *barrel, int num_to_print){  int ci;  bow_cdoc *cdoc;  int wi;			/* a "word index" into WI2DVF */  int max_wi;			/* the highest "word index" in WI2DVF. */  bow_dv *dv;			/* the "document vector" at index WI */  int dvi;			/* an index into the DV */  int weight_setting_num_words = 0;  int total_num_words = 0;  struct lorth   {     int wi;     float lor;  } lors[barrel->cdocs->length][num_to_print];  int wci;  float *total_word_counts;  /* bow_error("Can't use this while normalizer is being used for non-integral word_count"); */  /* We assume that we have already called BOW_BARREL_NEW_VPC() on     BARREL, so BARREL already has one-document-per-class. */  /* This might be useful to have.  However, some VPC barrels do not     have this variable set, so we probably shouldn't enforce this - jrennie */  /* assert (barrel->is_vpc); */  max_wi = MIN (barrel->wi2dvf->size, bow_num_words());  total_word_counts = bow_malloc (sizeof (float) * max_wi);  for (ci = 0; ci < barrel->cdocs->length; ci++)    for (wci = 0; wci < num_to_print; wci++)      {	lors[ci][wci].lor = 0.0;	lors[ci][wci].wi = -1;      }  /* assume that word_count, normalizer are already set */  /* Calculate the total number of occurrences of each word; store this     int TOTAL_WORD_COUNTS. */  for (wi = 0; wi < max_wi; wi++)     {      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);      if (dv == NULL)	continue;      total_word_counts[wi] = 0;      for (dvi = 0; dvi < dv->length; dvi++) 	{	  /* Is cdoc used for anything? - jrennie */	  cdoc = bow_array_entry_at_index (barrel->cdocs, 					   dv->entry[dvi].di);	  if (cdoc->type == bow_doc_train)	    {	      total_num_words += dv->entry[dvi].weight;	      total_word_counts[wi] += dv->entry[dvi].weight;	    }	}    }  /* Set the weights in the BARREL's WI2DVF so that they are     equal to P(w|C), the probability of a word given a class. */  for (wi = 0; wi < max_wi; wi++)     {      double pr_w = 0.0;      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);      if (wi % 100 == 0)	bow_verbosify(bow_progress, "\b\b\b\b\b\b%6d", wi);      /* If the model doesn't know about this word, skip it. */      if (dv == NULL)	continue;      pr_w = total_word_counts[wi] / total_num_words;      /* Now loop through all the elements, setting their weights */      for (dvi = 0; dvi < dv->length; dvi++) 	{	  double pr_w_c;	  double pr_w_not_c;	  double log_likelihood_ratio;	  cdoc = bow_array_entry_at_index (barrel->cdocs, 					   dv->entry[dvi].di);	  /* Here CDOC->WORD_COUNT is the total number of words in the class */	  /* We use Laplace Estimation. */	  pr_w_c = ((double)dv->entry[dvi].weight 		    / (cdoc->word_count + cdoc->normalizer));	  pr_w_c = (((double)dv->entry[dvi].weight + 1)		    / (cdoc->word_count + barrel->wi2dvf->num_words));	  pr_w_not_c = ((total_word_counts[wi] - dv->entry[dvi].weight 			 + barrel->cdocs->length - 1)			/ 			(total_num_words - cdoc->word_count			 + (barrel->wi2dvf->num_words			    * (barrel->cdocs->length - 1))));	  log_likelihood_ratio = log (pr_w_c / pr_w_not_c);		  wci = num_to_print - 1;	  while (wci >= 0 && 		 (lors[dv->entry[dvi].di][wci].lor < pr_w_c * log_likelihood_ratio))	    wci--;	  if (wci < num_to_print - 1)	    {	      int new_wci = wci + 1;	      for (wci = num_to_print-1; wci > new_wci; wci--)		{		  lors[dv->entry[dvi].di][wci].lor = 		    lors[dv->entry[dvi].di][wci - 1].lor;		  lors[dv->entry[dvi].di][wci].wi = 		    lors[dv->entry[dvi].di][wci - 1].wi;		}	      lors[dv->entry[dvi].di][new_wci].lor = pr_w_c * log_likelihood_ratio;	      lors[dv->entry[dvi].di][new_wci].wi = wi;	    }	}      weight_setting_num_words++;    }  bow_verbosify (bow_progress, "\n");  fprintf (fp, "Log Odds Ratio - top %d words\n\n", num_to_print);  for (ci = 0; ci < barrel->cdocs->length; ci++)    {      bow_cdoc *cdoc = bow_array_entry_at_index(barrel->cdocs, ci);      int i;      fprintf (fp, "%s\n", filename_to_classname(cdoc->filename));      for (i=0; i < strlen (filename_to_classname(cdoc->filename)); i++)	fprintf (fp, "-");      fprintf (fp, "\n");      for (wci = 0; wci < num_to_print; wci++)	fprintf (fp, "%1.15f %s\n", lors[ci][wci].lor,		 lors[ci][wci].wi >= 0		 ? bow_int2word (lors[ci][wci].wi)		 : "<nothing>");      /* Print feedline and newpage */      fprintf (fp, "%c\n",12);    }}voidrainbow_print_weight_vector (const char *classname){  int ci;			/* The `class index' of CLASSNAME */  bow_cdoc *cdoc;  int wi, max_wi;		/* a word index */  bow_dv *dv;			/* a class vector */  int dvi;			/* an index into DV */  /* Find the `class index' of the class with name CLASSNAME */  for (ci = 0; ci < bow_barrel_num_classes (rainbow_class_barrel); ci++)    {      cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);      if (!strcmp (filename_to_classname (cdoc->filename), classname))	break;    }  if (ci == bow_barrel_num_classes (rainbow_class_barrel))    bow_error ("No class named `%s'\n", classname);  /* Get the CDOC for this class, so we can use its NORMALIZER. */  cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);  /* Print the `weight' for each word in the class */  max_wi = MIN (bow_num_words (), rainbow_class_barrel->wi2dvf->size);  for (wi = 0; wi < max_wi; wi++)    {      dv = bow_wi2dvf_dv (rainbow_class_barrel->wi2dvf, wi);      if (dv == NULL)	continue;      /* Find the DVI with the DI matching CI */      for (dvi = 0; dvi < dv->length && dv->entry[dvi].di < ci; dvi++);      if (!(dv && dvi < dv->length && dv->entry[dvi].di == ci))	continue;      /* This is an attempt for a test to see if the weights need to	 be "normalized" before being used. */      if (rainbow_class_barrel->method->normalize_weights)	printf ("%20.10f %s\n",		dv->entry[dvi].weight * cdoc->normalizer,		bow_int2word (wi));      else	printf ("%20.10f %s\n",		dv->entry[dvi].weight,		bow_int2word (wi));    }}voidrainbow_print_foilgain (const char *classname){  int ci;			/* The `class index' of CLASSNAME */  int wi;  bow_cdoc *cdoc;  float **fig_per_wi_ci;  int fig_num_wi;  /* Find the `class index' of the class with name CLASSNAME */  for (ci = 0; ci < bow_barrel_num_classes (rainbow_class_barrel); ci++)    {      cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);      if (!strcmp (filename_to_classname (cdoc->filename), classname))	break;    }  if (ci == bow_barrel_num_classes (rainbow_class_barrel))    bow_error ("No class named `%s'\n", classname);  /* Get the foilgains. */  fig_per_wi_ci =     bow_foilgain_per_wi_ci_new (rainbow_doc_barrel,				bow_barrel_num_classes (rainbow_class_barrel),				&fig_num_wi);  /* Print the `foilgain' for each word in the class */  for (wi = 0; wi < fig_num_wi; wi++)    {      printf ("%20.6f %s\n", 	      fig_per_wi_ci[wi][ci], bow_int2word (wi));    }  bow_foilgain_free (fig_per_wi_ci, fig_num_wi);}/* The main() function. */extern int _bow_nextprime (unsigned n);#if !RAINBOW_LISPintmain (int argc, char *argv[]){  /* Default command-line argument values */  rainbow_arg_state.what_doing = rainbow_indexing;  rainbow_arg_state.query_filename = NULL;  rainbow_arg_state.output_filename = NULL;  rainbow_arg_state.num_trials = 0;  rainbow_arg_state.infogain_words_to_print = 10;  rainbow_arg_state.logodds_words_to_print = 10;  rainbow_arg_state.printing_class = 0;  rainbow_arg_state.non_option_argi = 0;  rainbow_arg_state.repeat_query = 0;  rainbow_arg_state.vocab_map = NULL;  rainbow_arg_state.hide_vocab_map = NULL;  rainbow_arg_state.use_lisp_score_truncation = 1;  rainbow_arg_state.loo_cv = 0;  rainbow_arg_state.barrel_printing_format = NULL;  rainbow_arg_state.hide_vocab_indices_filename = NULL;  rainbow_arg_state.test_on_training = 0;  rainbow_arg_state.use_saved_classifier = 0;  rainbow_arg_state.forking_server = 0;  rainbow_arg_state.print_doc_length = 0;  rainbow_arg_state.indexing_lines_filename = NULL;#ifdef VPC_ONLY  rainbow_arg_state.vpc_only = 0;#endif    /* Parse the command-line arguments. */  argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state);  if (rainbow_arg_state.what_doing == rainbow_indexing)    {      /* Strip any trailing `/'s from the classnames, so we can find the 	 classname later using FILENAME_TO_CLASSNAME. */      int argi, len;      const char **rainbow_classnames;      /* if we've fixed the vocab from a file, then use it */      if (rainbow_arg_state.vocab_map)	bow_words_set_map(rainbow_arg_state.vocab_map, 1);      if (rainbow_arg_state.barrel_printing_format)	{	  rainbow_index_printed_barrel	    (argv[rainbow_arg_state.non_option_argi]);	}      else	{	  for (argi = rainbow_arg_state.non_option_argi; argi < argc; argi++)	    {	      len = strlen (argv[argi]);	      if (argv[argi][len-1] == '/')		argv[argi][len-1] = '\0';	    }	  rainbow_classnames = 	    (const char **)(argv + rainbow_arg_state.non_option_argi);	  /* Index text in the directories. */	  rainbow_index (argc - rainbow_arg_state.non_option_argi,			 rainbow_classnames, 			 rainbow_arg_state.output_filename);	}      if (bow_num_words ())	rainbow_archive ();      else	bow_error ("No text documents found.");      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_indexing_lines)    {      rainbow_index_lines (rainbow_arg_state.indexing_lines_filename);      if (bow_num_words ())	rainbow_archive ();      else	bow_error ("No text documents found.");      exit (0);    }  /* We are using an already built model.  Get it from disk. */  rainbow_unarchive ();  if (rainbow_arg_state.hide_vocab_indices_filename)    {      FILE *fp = 	bow_fopen (rainbow_arg_state.hide_vocab_indices_filename, "r");      int wi;      while (fscanf (fp, "%d", &wi) == 1)	bow_wi2dvf_hide_wi (rainbow_doc_barrel->wi2dvf, wi);      fclose (fp);    }  /* (Re)set the weight-setting method, if requested with a `-m' on     the command line. */  if (bow_argp_method)    rainbow_doc_barrel->method = (rainbow_method*)bow_argp_method;  /* Make the test/train split */  /* Don't touch anything if we don't have the document barrel */  if (rainbow_doc_barrel && rainbow_arg_state.what_doing != rainbow_testing)    bow_set_doc_types_for_barrel (rainbow_doc_barrel);  /* Do things that update their own class/word weights. */#if 0  /* Compute the number of word pairs that co-occur in documents more     than 0 times.  Did this for Jeff Schneider. */  if (1)    {      static const int max_vocab_size = 10000;      int vocab_sizes[] = {max_vocab_size, max_vocab_size};      bow_bitvec *co_occurrences = bow_bitvec_new (2, vocab_sizes);      int wi_pair[2];      int wvi1, wvi2;      bow_dv_heap *heap;      bow_wv *doc_wv;      int di;      int num_co_occurrences;      /* Make vocabulary size manageable. */      bow_barrel_keep_top_words_by_infogain	(max_vocab_size-1, rainbow_doc_barrel,	 bow_barrel_num_classes (rainbow_class_barrel));      /* Step through each document, setting bit for each word-pair 	 co-occurrence. */      heap = bow_test_new_heap (rainbow_doc_barrel);      doc_wv = NULL;      while ((di = bow_model_next_wv (heap, rainbow_doc_barrel, &doc_wv))	     != -1)	{	  for (wvi1 = 0; wvi1 < doc_wv->num_entries; wvi1++)	    {	      for (wvi2 = 0; wvi2 < doc_wv->num_entries; wvi2++)		{		  wi_pair[0] = doc_wv->entry[wvi1].wi;		  wi_pair[1] = doc_wv->entry[wvi2].wi;		  bow_bitvec_set (co_occurrences, wi_pair, 1);		}	    }	}      /* Don't free the heap here because bow_model_next_wv() does it	 for us. */            /* Count the number of co-occurrences. */      num_co_occurrences = 0;      for (wvi1 = 0; wvi1 < max_vocab_size; wvi1+
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -