📄 rainbow.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 5 页
字号:
#endif    case ARGP_KEY_ARG:      /* Now we consume all the rest of the arguments.  STATE->next is the	 index in STATE->argv of the next argument to be parsed, which is the	 first STRING we're interested in, so we can just use	 `&state->argv[state->next]' as the value for RAINBOW_ARG_STATE->ARGS.	 IN ADDITION, by setting STATE->next to the end of the arguments, we	 can force argp to stop parsing here and return.  */      rainbow_arg_state.non_option_argi = state->next - 1;      if (rainbow_arg_state.what_doing == rainbow_indexing	  && rainbow_arg_state.barrel_printing_format == NULL	  && state->next == state->argc)	{	  /* Only one classname is not enough. */	  fprintf (stderr, "Need data from more than one class.\n");	  argp_usage (state);	}      state->next = state->argc;      break;    case ARGP_KEY_END:      /* Here we know that STATE->arg_num == 0, since we force argument	 parsing to end before any more arguments can get here.  */      if (rainbow_arg_state.what_doing == rainbow_indexing	  || rainbow_arg_state.what_doing == rainbow_file_testing)	{	  if (state->arg_num == 0)	    {	      /* Too few arguments.  */	      fprintf (stderr, "No non-option arguments needed.\n");	      argp_usage (state);	    }	}      else if (state->arg_num != 0)	{	  /* Too many arguments.  */	  fprintf (stderr, "No non-option arguments needed.\n");	  argp_usage (state);	}      break;    case BUILD_AND_SAVE:      rainbow_arg_state.what_doing = rainbow_building_and_saving;      break;    case TEST_FROM_SAVED:      rainbow_arg_state.what_doing = rainbow_testing_from_saved_model;      break;    case USE_SAVED_CLASSIFIER_KEY:      rainbow_arg_state.use_saved_classifier = 1;      break;    case PRINT_DOC_LENGTH_KEY:      rainbow_arg_state.print_doc_length = 1;      break;    default:      return ARGP_ERR_UNKNOWN;    }  return 0;}static struct argp rainbow_argp = { rainbow_options, rainbow_parse_opt, rainbow_argp_args_doc,  rainbow_argp_doc, bow_argp_children};/* The structures that hold the data necessary for answering a query. */bow_barrel *rainbow_doc_barrel;     /* the stats about words and documents */bow_barrel *rainbow_class_barrel=NULL;  /* the stats about words and classes *//* The static structure in bow/int4word.c is also used. *//* Given a fully-specified file path name (all the way from `/'),   return just the last filename part of it. */static inline const char *filename_to_classname (const char *filename){#if 0  /* Don't bother stripping off the directory name from the classname.     This way, even when we give rainbow multi-directory     specifications for where to find the data for each class, and     some of the lowest-level directories have the same name, we can     still distinguish between them. */  return filename;#else  const char *ret;  ret = strrchr (filename, '/');  if (ret)    return ret + 1;  return filename;#endif}/* Writing and reading the word/document stats to disk. */#define VOCABULARY_FILENAME "vocabulary"#define DOC_BARREL_FILENAME "doc-barrel"#define CLASS_BARREL_FILENAME "class-barrel"#define OUTPUTNAME_FILENAME "outfile"#define FORMAT_VERSION_FILENAME "format-version"/* Write the stats in the directory DATA_DIRNAME. */voidrainbow_archive (){  char filename[BOW_MAX_WORD_LENGTH];  char *fnp;  FILE *fp;  strcpy (filename, bow_data_dirname);  strcat (filename, "/");  fnp = filename + strlen (filename);  strcpy (fnp, FORMAT_VERSION_FILENAME);  bow_write_format_version_to_file (filename);  strcpy (fnp, OUTPUTNAME_FILENAME);  fp = bow_fopen (filename, "w");  if (rainbow_arg_state.output_filename)    fprintf (fp, "%s\n", rainbow_arg_state.output_filename);  fclose (fp);  strcpy (fnp, VOCABULARY_FILENAME);  fp = bow_fopen (filename, "wb");  bow_words_write (fp);  fclose (fp);  strcpy (fnp, CLASS_BARREL_FILENAME);  fp = bow_fopen (filename, "wb");  bow_barrel_write (rainbow_class_barrel, fp);  fclose (fp);  strcpy (fnp, DOC_BARREL_FILENAME);  fp = bow_fopen (filename, "wb");  bow_barrel_write (rainbow_doc_barrel, fp);  fclose (fp);}/* Read the stats from the directory DATA_DIRNAME. */voidrainbow_unarchive (){  char filename[BOW_MAX_WORD_LENGTH];  char *fnp;  FILE *fp;  char buf[1024];  struct stat st;  int e;    if (rainbow_arg_state.what_doing != rainbow_query_serving)    bow_verbosify (bow_progress, "Loading data files...\n");  strcpy (filename, bow_data_dirname);  strcat (filename, "/");  fnp = filename + strlen (filename);  strcpy (fnp, FORMAT_VERSION_FILENAME);  e = stat (filename, &st);  if (e != 0)    {      /* Assume this means the file doesn't exist, and this archive	 was created before BOW_DEFAULT_FORMAT_VERSION was added to	 the library.  The version number before	 BOW_DEFAULT_FORMAT_VERSION was added to the library was 3. */      bow_file_format_version = 3;    }  else    {      bow_read_format_version_from_file (filename);    }  strcpy (fnp, OUTPUTNAME_FILENAME);  fp = fopen (filename, "r");  if (fp)    {      buf[0] = '\0';      fscanf (fp, "%s", buf);      rainbow_arg_state.output_filename = strdup (buf);      fclose (fp);    }  else    {      rainbow_arg_state.output_filename = NULL;    }  strcpy (fnp, VOCABULARY_FILENAME);  fp = bow_fopen (filename, "rb");  bow_words_read_from_fp (fp);  fclose (fp);  strcpy (fnp, CLASS_BARREL_FILENAME);  fp = bow_fopen (filename, "rb");  rainbow_class_barrel = bow_barrel_new_from_data_fp (fp);  /* Don't close it because bow_wi2dvf_dv will still need to read it. */  strcpy (fnp, DOC_BARREL_FILENAME);  fp = bow_fopen (filename, "rb");  rainbow_doc_barrel = bow_barrel_new_from_data_fp (fp);  /* Don't close it because bow_wi2dvf_dv will still need to read it. */  /* Only do this if the document barrel exists */  if (rainbow_doc_barrel && rainbow_doc_barrel->classnames == NULL)    {      int i;      bow_cdoc *cdoc;      rainbow_doc_barrel->classnames = bow_int4str_new (0);      rainbow_class_barrel->classnames = bow_int4str_new (0);      for (i = 0; i < rainbow_class_barrel->cdocs->length; i++)	{	  cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, i);	  bow_str2int (rainbow_doc_barrel->classnames, 		       filename_to_classname (cdoc->filename));	  bow_str2int (rainbow_class_barrel->classnames, 		       filename_to_classname (cdoc->filename));	}    }  /* Don't want doc priors set equal - want class priors set equal */  if (bow_uniform_class_priors)    bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_class_barrel);  /*bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel);*/}/* Building the word/document stats. *//* Traverse the directories CLASSDIR_NAMES, gathering word/document   stats, and write the stats to disk in BOW_DATA_DIRNAME. */voidrainbow_index (int num_classes, const char *classdir_names[],	       const char *exception_name){  int class_index;  void do_indexing ()    {      if (rainbow_doc_barrel)	bow_free_barrel (rainbow_doc_barrel);      /* Index all the documents. */      rainbow_doc_barrel = bow_barrel_new (0, 0, sizeof (bow_cdoc), NULL);#if VPC_ONLY      if (rainbow_arg_state.vpc_only)	rainbow_doc_barrel->is_vpc = 1;#endif VPC_ONLY      if (bow_argp_method)	rainbow_doc_barrel->method = (rainbow_method*)bow_argp_method;      else	rainbow_doc_barrel->method = rainbow_default_method;      for (class_index = 0; class_index < num_classes; class_index++)	{	  bow_verbosify (bow_progress, "Class `%s'\n  ", 			 filename_to_classname (classdir_names[class_index]));#if HAVE_HDB	  if (bow_hdb)	    {	      /* Gathers stats on all documents in HDB database */	      if (bow_barrel_add_from_hdb		  (rainbow_doc_barrel,		   classdir_names[class_index],		   exception_name,		   filename_to_classname (classdir_names[class_index]))		  == 0)		bow_verbosify (bow_quiet,			       "No text files found in database `%s'\n", 			       classdir_names[class_index]);	    }	  else#endif	    /* This function traverses the class directory	       gathering word/document stats.  Return the number of	       documents indexed.  This gathers stats on individual	       documents; we have yet to "sum together the word vectors	       of all documents for each particular class". */	    if (bow_barrel_add_from_text_dir		(rainbow_doc_barrel, 		 classdir_names[class_index],		 exception_name,		 filename_to_classname (classdir_names[class_index]))		== 0)	      bow_verbosify (bow_quiet,			     "No text files found in directory `%s'\n", 			     classdir_names[class_index]);	}      if (bow_uniform_class_priors)	bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel);    }  /* Do all the parsing to build a barrel with word counts. */  if (bow_prune_vocab_by_occur_count_n)    {      /* Parse all the documents to get word occurrence counts. */      for (class_index = 0; class_index < num_classes; class_index++)	{	  bow_verbosify (bow_progress,			 "Class `%s'\n  ", 			 filename_to_classname			 (classdir_names[class_index]));#if HAVE_HDB	  if (bow_hdb)	    bow_words_add_occurrences_from_hdb	      (classdir_names[class_index], "");	  else#endif	    bow_words_add_occurrences_from_text_dir	      (classdir_names[class_index], "");	}      /* xxx This should be (bow_prune_vocab_by_occur_count_n+1) !! */      bow_words_remove_occurrences_less_than	(bow_prune_vocab_by_occur_count_n);      /* Now insist that future calls to bow_word2int*() will not	 register new words. */      bow_word2int_do_not_add = 1;    }    do_indexing ();  if (bow_prune_vocab_by_infogain_n      || bow_prune_words_by_doc_count_n)    {      if (0)	{	  /* Change barrel by removing words with small information gain. */	  bow_barrel_keep_top_words_by_infogain	    (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, num_classes);	}      else	{	  /* Change vocabulary to remove words with small information gain */	  bow_wi2dvf_hide_words_by_doc_count (rainbow_doc_barrel->wi2dvf,					      bow_prune_words_by_doc_count_n);	  /* The doc count pruning must be before the infogain pruning,	     because this function below is the one that re-assigns	     the word-indices. */	  bow_words_keep_top_by_infogain (bow_prune_vocab_by_infogain_n,					  rainbow_doc_barrel,					  num_classes);	  /* Now insist that future calls to bow_word2int*() will not	     register new words. */	  bow_word2int_do_not_add = 1;	  do_indexing ();	}    }#if VPC_ONLY  /* Weights have been calculated - all that is left to do is to calculate   * priors */  if (rainbow_arg_state.vpc_only)    {      bow_cdoc *cdocp;      double prior_sum = 0.0;      int num_classes = bow_barrel_num_classes (rainbow_doc_barrel);      int ci;      /* Normalize the class priors */      for (ci=0; ci < num_classes; ci++)	{	  cdocp = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, ci);	  prior_sum += cdocp->prior;	}      if (prior_sum)	for (ci=0; ci < num_classes; ci++)	  {	    cdocp = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, ci);	    cdocp->prior = cdocp->prior / prior_sum;	  }      else	bow_verbosify (bow_progress, "WARNING: All classes have zero prior\n");      /* Recalculate word counts for each class */      {	bow_wv *wv = NULL;	int wvi;	bow_cdoc *cdoc;	int di;	bow_dv_heap *heap = bow_test_new_heap (rainbow_doc_barrel);	while ((di = bow_heap_next_wv (heap, rainbow_doc_barrel, &wv,				       bow_cdoc_yes)) != -1)	  {	    cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, di);	    cdoc->word_count = 0;	    for (wvi = 0; wvi < wv->num_entries; wvi++)	      {		if (bow_wi2dvf_dv (rainbow_doc_barrel->wi2dvf,				   wv->entry[wvi].wi))		  cdoc->word_count += wv->entry[wvi].count;	      }	  }      }      /* We have been (secretly) using the doc_barrel as a class barrel       * all along.  Set doc_barrel to NULL so that an empty file is       * written to disk.  This will keep future executions from       * attempting to recalculate the class_barrel */      rainbow_class_barrel = rainbow_doc_barrel;      rainbow_doc_barrel = NULL;      return;    }#endif  /* Combine the documents into class statistics. */  rainbow_class_barrel =     bow_barrel_new_vpc_with_weights (rainbow_doc_barrel);}voidrainbow_index_printed_barrel (const char *filename){  rainbow_doc_barrel =
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -