📄 rainbow.c

📁 贝叶斯学习算法分类文本。基于朴素贝叶斯分类器的文本分类的通用算法
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	{	  doc_cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, 					       di);	  class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, 						 doc_cdoc->class);	  bow_wv_set_weights (query_wv, rainbow_class_barrel);	  bow_wv_normalize_weights (query_wv, rainbow_class_barrel);	  actual_num_hits = 	    bow_barrel_score (rainbow_class_barrel, 			      query_wv, hits,			      num_hits_to_retrieve, -1);	  assert (actual_num_hits == num_hits_to_retrieve);#if 0	  printf ("%8.6f %d %8.6f %8.6f %d ",		  class_cdoc->normalizer, 		  class_cdoc->word_count, 		  class_cdoc->normalizer / class_cdoc->word_count, 		  class_cdoc->prior,		  doc_cdoc->class);	  if (hits[0].di == doc_cdoc->class)	    printf ("1\n");	  else	    printf ("0\n");#endif	  fprintf (test_fp, "%s %s ", 		   doc_cdoc->filename, 		   filename_to_classname(class_cdoc->filename));	  for (hi = 0; hi < actual_num_hits; hi++)	    {	      class_cdoc = 		bow_array_entry_at_index (rainbow_class_barrel->cdocs,					  hits[hi].di);	      /* For the sake CommonLisp, don't print numbers smaller than		 1e-35, because it can't `(read)' them. */	      if (rainbow_arg_state.use_lisp_score_truncation		  && hits[hi].weight < 1e-35		  && hits[hi].weight > 0)		hits[hi].weight = 0;	      fprintf (test_fp, "%s:%g ", 		       filename_to_classname (class_cdoc->filename),		       hits[hi].weight);	    }	  fprintf (test_fp, "\n");	}    }}/* Run test trials, outputing results to TEST_FP.  The results are   indended to be read and processed by the Perl script   ./rainbow-stats.  The test documents come from files inside the   directories that are named in argv[].  */voidrainbow_test_files (FILE *out_fp, const char *test_dirname){  bow_score *hits;  /* int num_test_docs; */  int num_hits_to_retrieve = rainbow_class_barrel->cdocs->length;  int actual_num_hits;  int hi;			/* hit index */  const char *current_class;  int current_ci;  int ci;  unsigned int dirlen = 1024;  char dir[dirlen];  /* This nested function is called once for each test document. */  int test_file (const char *filename, void *context)    {      bow_wv *query_wv;      FILE *fp;      bow_cdoc *class_cdoc;      fp = bow_fopen (filename, "r");      query_wv = bow_wv_new_from_text_fp (fp);      fclose (fp);      if (!query_wv)	{	  bow_verbosify (bow_progress, "%s found to be empty.\n", filename);	  return 0;	}          fprintf (out_fp, "%s %s ", 	       filename,	/* This test instance */	       current_class); /* The name of the correct class */      bow_wv_set_weights (query_wv, rainbow_class_barrel);      bow_wv_normalize_weights (query_wv, rainbow_class_barrel);      actual_num_hits = 	bow_barrel_score (rainbow_class_barrel, 			  query_wv, hits,			  num_hits_to_retrieve,			  (rainbow_arg_state.loo_cv			   ? current_ci			   : -1));      for (hi = 0; hi < actual_num_hits; hi++)	{	  class_cdoc = 	    bow_array_entry_at_index (rainbow_class_barrel->cdocs,				      hits[hi].di);	  /* For the sake CommonLisp, don't print numbers smaller than	     1e-35, because it can't `(read)' them. */	  if (rainbow_arg_state.use_lisp_score_truncation	      && hits[hi].weight < 1e-35	      && hits[hi].weight > 0)	    hits[hi].weight = 0;	  fprintf (out_fp, "%s:%g ", 		   filename_to_classname (class_cdoc->filename),		   hits[hi].weight);	}      fprintf (out_fp, "\n");      return 0;    }  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);#if 0  /* Calculate the number of testing documents according to TEST_PERCENTAGE.      The default TEST_PERCENTAGE is 0, use all training documents.  Otherwise,     we will use less training documents.  Note that the documents marked      for testing here will not actually be used for testing.  We will test     the documents in TEST_DIRNAME. */  num_test_docs = (rainbow_doc_barrel->cdocs->length * 		   rainbow_arg_state.test_percentage) / 100;  bow_test_split (rainbow_doc_barrel, num_test_docs);  if (bow_prune_vocab_by_infogain_n)    {      /* Change barrel by removing words with small info gain, if requested. */      bow_barrel_keep_top_words_by_infogain	(bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, 	 rainbow_class_barrel->cdocs->length);    }  /* Re-build the rainbow_class_barrel, if necessary */  if (rainbow_doc_barrel->method != rainbow_class_barrel->method      || rainbow_arg_state.vocab_map      || bow_prune_vocab_by_infogain_n      || rainbow_arg_state.test_percentage)    {      int num_classes = rainbow_class_barrel->cdocs->length;      bow_barrel_free (rainbow_class_barrel);      rainbow_class_barrel = 	bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, 					 rainbow_classnames, num_classes);    }#endif  fprintf (out_fp, "#0\n");  for (ci = 0; ci < rainbow_class_barrel->cdocs->length; ci++)    {      /* Build a string containing the name of this directory. */      bow_cdoc *class_cdoc;      strcpy (dir, test_dirname);      class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);      strcat (dir, "/");      strcat (dir, filename_to_classname (class_cdoc->filename));      assert (strlen (dir) < dirlen);      /* Remember which classname this comes from, so, above, we know	 the correct class */      current_ci = class_cdoc->class;      current_class = filename_to_classname (class_cdoc->filename);      /* Test each document in that diretory. */      bow_map_filenames_from_dir (test_file, 0, dir, "");    }}voidrainbow_print_weight_vector (const char *classname){  int ci;			/* The `class index' of CLASSNAME */  bow_cdoc *cdoc;  int wi, max_wi;		/* a word index */  bow_dv *dv;			/* a class vector */  int dvi;			/* an index into DV */  /* Find the `class index' of the class with name CLASSNAME */  for (ci = 0; ci < rainbow_class_barrel->cdocs->length; ci++)    {      cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);      if (!strcmp (filename_to_classname (cdoc->filename), classname))	break;    }  if (ci == rainbow_class_barrel->cdocs->length)    bow_error ("No class named `%s'\n", classname);  /* Get the CDOC for this class, so we can use its NORMALIZER. */  cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);  /* Print the `weight' for each word in the class */  max_wi = MIN (bow_num_words (), rainbow_class_barrel->wi2dvf->size);  for (wi = 0; wi < max_wi; wi++)    {      dv = bow_wi2dvf_dv (rainbow_class_barrel->wi2dvf, wi);      if (dv == NULL)	continue;      /* Find the DVI with the DI matching CI */      for (dvi = 0; dvi < dv->length && dv->entry[dvi].di < ci; dvi++);      if (!(dv && dvi < dv->length && dv->entry[dvi].di == ci))	continue;      /* This is an attempt for a test to see if the weights need to	 be "normalized" before being used. */      if (rainbow_class_barrel->method->normalize_weights)	printf ("%20.10f %s\n",		dv->entry[dvi].weight * cdoc->normalizer,		bow_int2word (wi));      else	printf ("%20.10f %s\n",		dv->entry[dvi].weight,		bow_int2word (wi));    }}voidrainbow_print_foilgain (const char *classname){  int ci;			/* The `class index' of CLASSNAME */  int wi;  bow_cdoc *cdoc;  float **fig_per_wi_ci;  int fig_num_wi;  /* Find the `class index' of the class with name CLASSNAME */  for (ci = 0; ci < rainbow_class_barrel->cdocs->length; ci++)    {      cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci);      if (!strcmp (filename_to_classname (cdoc->filename), classname))	break;    }  if (ci == rainbow_class_barrel->cdocs->length)    bow_error ("No class named `%s'\n", classname);  /* Get the foilgains. */  fig_per_wi_ci =     bow_foilgain_per_wi_ci_new (rainbow_doc_barrel,				rainbow_class_barrel->cdocs->length,				&fig_num_wi);  /* Print the `foilgain' for each word in the class */  for (wi = 0; wi < fig_num_wi; wi++)    {      printf ("%20.6f %s\n", 	      fig_per_wi_ci[wi][ci], bow_int2word (wi));    }  bow_foilgain_free (fig_per_wi_ci, fig_num_wi);}/* The main() function. */#if !RAINBOW_LISPintmain (int argc, char *argv[]){  /* Default command-line argument values */  rainbow_arg_state.what_doing = rainbow_indexing;  rainbow_arg_state.query_filename = NULL;  rainbow_arg_state.output_filename = NULL;  rainbow_arg_state.num_trials = 0;  rainbow_arg_state.test_percentage = 30;  rainbow_arg_state.infogain_words_to_print = 10;  rainbow_arg_state.printing_class = 0;  rainbow_arg_state.non_option_argi = 0;  rainbow_arg_state.repeat_query = 0;  rainbow_arg_state.vocab_map = NULL;  rainbow_arg_state.use_lisp_score_truncation = 1;  rainbow_arg_state.loo_cv = 0;    _register_method_kl ();  _register_method_evi ();  /* Parse the command-line arguments. */  argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state);  if (rainbow_arg_state.what_doing == rainbow_indexing)    {      /* Strip any trailing `/'s from the classnames, so we can find the 	 classname later using FILENAME_TO_CLASSNAME. */      int argi, len;      for (argi = rainbow_arg_state.non_option_argi; argi < argc; argi++)	{	  len = strlen (argv[argi]);	  if (argv[argi][len-1] == '/')	    argv[argi][len-1] = '\0';	}      /* Initialize the global variable RAINBOW_CLASSNAMES */      rainbow_classnames = (const char **)(argv + 					   rainbow_arg_state.non_option_argi);      /* Index text in the directories. */      rainbow_index (argc - rainbow_arg_state.non_option_argi,		     rainbow_classnames, 		     rainbow_arg_state.output_filename);      if (bow_num_words ())	rainbow_archive ();      else	bow_error ("No text documents found.");      exit (0);    }  /* We are using an already built model.  Get it from disk. */  rainbow_unarchive ();  /* (Re)set the weight-setting method, if requested with a `-m' on     the command line. */  if (bow_argp_method)    rainbow_doc_barrel->method = bow_argp_method;  /* Do things that update their own class/word weights. */#if 0  /* Compute the number of word pairs that co-occur in documents more     than 0 times.  Did this for Jeff Schneider. */  if (1)    {      static const int max_vocab_size = 10000;      int vocab_sizes[] = {max_vocab_size, max_vocab_size};      bow_bitvec *co_occurrences = bow_bitvec_new (2, vocab_sizes);      int wi_pair[2];      int wvi1, wvi2;      bow_dv_heap *heap;      bow_wv *doc_wv;      int di;      int num_co_occurrences;      /* Make vocabulary size manageable. */      bow_barrel_keep_top_words_by_infogain	(max_vocab_size-1, rainbow_doc_barrel,	 rainbow_class_barrel->cdocs->length);      /* Step through each document, setting bit for each word-pair 	 co-occurrence. */      heap = bow_test_new_heap (rainbow_doc_barrel);      doc_wv = NULL;      while ((di = bow_model_next_wv (heap, rainbow_doc_barrel, &doc_wv))	     != -1)	{	  for (wvi1 = 0; wvi1 < doc_wv->num_entries; wvi1++)	    {	      for (wvi2 = 0; wvi2 < doc_wv->num_entries; wvi2++)		{		  wi_pair[0] = doc_wv->entry[wvi1].wi;		  wi_pair[1] = doc_wv->entry[wvi2].wi;		  bow_bitvec_set (co_occurrences, wi_pair, 1);		}	    }	}            /* Count the number of co-occurrences. */      num_co_occurrences = 0;      for (wvi1 = 0; wvi1 < max_vocab_size; wvi1++)	{	  for (wvi2 = 0; wvi2 < max_vocab_size; wvi2++)	    {	      wi_pair[0] = wvi1;	      wi_pair[1] = wvi2;	      if (bow_bitvec_value (co_occurrences, wi_pair))		num_co_occurrences++;	    }	}      printf ("Num co-occurrences = %d\n", num_co_occurrences);      exit (0);    }#endif  if (rainbow_arg_state.what_doing == rainbow_query_serving)    {      rainbow_socket_init (rainbow_arg_state.server_port_num, 0);      while (1)	{	  rainbow_serve();	}    }  /* Do things that don't require the class/word weights to be updated. */  if (rainbow_arg_state.what_doing == rainbow_testing)    {      /* We are doing test trials, and making output for Perl. */      rainbow_test (stdout);      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_infogain_printing)    {      bow_infogain_per_wi_print	(stdout, rainbow_doc_barrel,	 rainbow_class_barrel->cdocs->length,	 rainbow_arg_state.infogain_words_to_print);      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_foilgain_printing)    {      rainbow_print_foilgain (rainbow_arg_state.printing_class);      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_barrel_printing)    {      bow_barrel_printf (rainbow_doc_barrel, stdout, "");      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_infogain_pair_printing)    {      int s;      bow_infogain_per_wi_new_using_pairs	(rainbow_doc_barrel,	 rainbow_class_barrel->cdocs->length,	 &s);      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_word_count_printing)    {      bow_barrel_print_word_count (rainbow_class_barrel,				   rainbow_arg_state.printing_class);      exit (0);    }  /* Do things necessary to update the class/word weights for the      command-line options. */  /* Reduce vocabulary size by removing words not in a file listed     on the command line. */  if (rainbow_arg_state.vocab_map)    {      bow_barrel_prune_words_not_in_map (rainbow_doc_barrel,					 rainbow_arg_state.vocab_map);    }  /* Reduce vocabulary size by low info-gain words, if requested. */  if (bow_prune_vocab_by_infogain_n)    {      /* Change barrel by removing words with small info gain. */      bow_barrel_keep_top_words_by_infogain	(bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, 	 rainbow_class_barrel->cdocs->length);    }  /* Re-build the rainbow_class_barrel, if necessary */  if (rainbow_doc_barrel->method != rainbow_class_barrel->method      || rainbow_arg_state.vocab_map      || bow_prune_vocab_by_infogain_n      || 1)    {      int num_classes = rainbow_class_barrel->cdocs->length;      bow_barrel_free (rainbow_class_barrel);      rainbow_class_barrel = 	bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, 					 rainbow_classnames, num_classes);    }  /* Do things that require the class/word weights to have been updated. */  if (rainbow_arg_state.what_doing == rainbow_file_testing)    {      int argi;      assert (rainbow_arg_state.non_option_argi < argc);      for (argi = rainbow_arg_state.non_option_argi; argi < argc; argi++)	rainbow_test_files (stdout, argv[argi]);      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_weight_vector_printing)    {      rainbow_print_weight_vector (rainbow_arg_state.printing_class);      exit (0);    }  if (rainbow_arg_state.what_doing == rainbow_querying)    {      rainbow_query (stdin, stdout);      exit (0);    }  exit (0);}#endif /* !RAINBOW_LISP */
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -