📄 rainbow.c

📁 贝叶斯学习算法分类文本。基于朴素贝叶斯分类器的文本分类的通用算法
💻 C
📖 第 1 页 / 共 3 页
字号:
  void do_indexing ()    {      if (rainbow_doc_barrel)	bow_barrel_free (rainbow_doc_barrel);      /* Index all the documents. */      rainbow_doc_barrel = bow_barrel_new (0, 0, sizeof (bow_cdoc), NULL);      if (bow_argp_method)	rainbow_doc_barrel->method = bow_argp_method;      else	rainbow_doc_barrel->method = rainbow_default_method;      for (class_index = 0; class_index < num_classes; class_index++)	{	  bow_verbosify (bow_progress, "Class `%s'\n  ", 			 filename_to_classname (classdir_names[class_index]));	  /* This function traverses the directory class directory	     gathering word/document stats.  Return the number of	     documents indexed.  This gathers stats on individual	     documents; we have yet to "sum together the word vectors	     of all documents for each particular class". */	  if (bow_barrel_add_from_text_dir (rainbow_doc_barrel, 					    classdir_names[class_index],					    exception_name,					    class_index)	      == 0)	    bow_verbosify (bow_quiet,			   "No text files found in directory `%s'\n", 			   classdir_names[class_index]);	}      if (bow_uniform_class_priors)	bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel);    }  /* Do all the parsing to build a barrel with word counts. */  if (bow_prune_vocab_by_occur_count_n)    {      /* Parse all the documents to get word occurrence counts. */      for (class_index = 0; class_index < num_classes; class_index++)	{	  bow_verbosify (bow_progress,			 "Class `%s'\n  ", 			 filename_to_classname			 (classdir_names[class_index]));	  bow_words_add_occurrences_from_text_dir	    (classdir_names[class_index], "");	}      bow_words_remove_occurrences_less_than	(bow_prune_vocab_by_occur_count_n);      /* Now insist that future calls to bow_word2int*() will not	 register new words. */      bow_word2int_do_not_add = 1;    }    do_indexing ();  if (bow_prune_vocab_by_infogain_n)    {      if (0)	{	  /* Change barrel by removing words with small information gain. */	  bow_barrel_keep_top_words_by_infogain	    (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, num_classes);	}      else	{	  /* Change vocabulary to remove words with small information gain */	  bow_words_keep_top_by_infogain (bow_prune_vocab_by_infogain_n,					  rainbow_doc_barrel,					  num_classes);	  /* Now insist that future calls to bow_word2int*() will not	     register new words. */	  bow_word2int_do_not_add = 1;	  do_indexing ();	}    }  /* Combine the documents into class statistics. */  rainbow_class_barrel =     bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, 				     classdir_names, num_classes);}/* Perform a query. *//* Print the contents of file FILENAME to stdout. */static inline voidprint_file (const char *filename){  FILE *fp;  int byte;  if ((fp = fopen (filename, "r")) == NULL)    bow_error ("Couldn't open file `%s' for reading", filename);  while ((byte = fgetc (fp)) != EOF)    fputc (byte, stdout);  fclose (fp);}/* Get some query text, and print its best-matching documents among   those previously indexed.  The number of matching documents is   NUM_HITS_TO_SHOW.  If QUERY_FILENAME is non-null, the query text   will be obtained from that file; otherwise it will be prompted for   and read from stdin. */intrainbow_query (FILE *in, FILE *out){  /* Show as many hits as there are classes. */  int num_hits_to_show = rainbow_class_barrel->cdocs->length;  bow_score *hits;  int actual_num_hits;  int i;  bow_wv *query_wv;  hits = alloca (sizeof (bow_score) * num_hits_to_show);  /* Get the query text, and create a "word vector" from the query text. */  if (rainbow_arg_state.query_filename)    {      FILE *fp;      fp = bow_fopen (rainbow_arg_state.query_filename, "r");      query_wv = bow_wv_new_from_text_fp (fp);      fclose (fp);    }  else    {    query_again:      if (rainbow_arg_state.what_doing != rainbow_query_serving)	bow_verbosify (bow_quiet, 		       "Type your query text now.  End with a Control-D.\n");      if (feof (in))	clearerr (in);      query_wv = bow_wv_new_from_text_fp (in);    }  if (query_wv == NULL || query_wv->num_entries == 0)    {      if (rainbow_arg_state.query_filename)	bow_verbosify (bow_quiet, "No query text found in `%s'.\n", 		       rainbow_arg_state.query_filename);      else	if (rainbow_arg_state.what_doing != rainbow_query_serving)	  bow_verbosify (bow_quiet, "No query text found.");	else	  {	    fprintf(out, ".\n");	    fflush(out);	  }      if (rainbow_arg_state.repeat_query)	bow_verbosify (bow_progress, "  Stopping query repeat\n");      return 0;    }  /* (Re)set the weight-setting method, if requested with a `-m' on     the command line. */  if (bow_argp_method)    rainbow_doc_barrel->method = bow_argp_method;  else    rainbow_doc_barrel->method = rainbow_default_method;  if (rainbow_arg_state.vocab_map)    {      /* Remove words not in the VOCAB_MAP. */      bow_barrel_prune_words_not_in_map (rainbow_doc_barrel,					 rainbow_arg_state.vocab_map);    }  else if (bow_prune_vocab_by_infogain_n)    {      /* Change barrel by removing words with small information gain. */      bow_barrel_keep_top_words_by_infogain	(bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, 	 rainbow_class_barrel->cdocs->length);    }  /* Re-build the rainbow_class_barrel, if necessary */  if (rainbow_doc_barrel->method != rainbow_class_barrel->method      || rainbow_arg_state.vocab_map      || bow_prune_vocab_by_infogain_n)    {      int num_classes = rainbow_class_barrel->cdocs->length;      bow_barrel_free (rainbow_class_barrel);      rainbow_class_barrel = 	bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, 					 rainbow_classnames, 					 num_classes);    }  /* Get the best matching documents. */  bow_wv_set_weights (query_wv, rainbow_doc_barrel);  bow_wv_normalize_weights (query_wv, rainbow_doc_barrel);  actual_num_hits = bow_barrel_score  (rainbow_class_barrel, query_wv,				       hits, num_hits_to_show, -1);  /* Print them. */  if (rainbow_arg_state.what_doing != rainbow_query_serving)    fprintf (out, "\n");  for (i = 0; i < actual_num_hits; i++)    {      bow_cdoc *cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, 						 hits[i].di);      if (strlen (rainbow_arg_state.output_filename))	{	  char buf[1024];	  strcpy (buf, cdoc->filename);	  strcat (buf, "/");	  strcat (buf, rainbow_arg_state.output_filename);	  print_file (buf);	}      else	{	  /* For the sake CommonLisp, don't print numbers smaller than	     1e-35, because it can't `(read)' them. */	  if (rainbow_arg_state.use_lisp_score_truncation	      && hits[i].weight < 1e-35	      && hits[i].weight > 0)	    hits[i].weight = 0;	  fprintf (out, "%s %g\n", 		   cdoc->filename, hits[i].weight);	}    }  if (rainbow_arg_state.what_doing == rainbow_query_serving)    fprintf(out, ".\n");  fflush(out);  if (rainbow_arg_state.repeat_query)    goto query_again;  return actual_num_hits;}static void rainbow_socket_init(const char *socket_name, int use_unix_socket){   int servlen, type, bind_ret;   struct sockaddr_un un_addr;   struct sockaddr_in in_addr;   struct sockaddr *sap;   type = use_unix_socket ? AF_UNIX : AF_INET;      rainbow_sockfd = socket(type, SOCK_STREAM, 0);   assert(rainbow_sockfd >= 0);   if (type == AF_UNIX)   {     sap = (struct sockaddr *)&un_addr;     bzero((char *)sap, sizeof(un_addr));     strcpy(un_addr.sun_path, socket_name);     servlen = strlen(un_addr.sun_path) + sizeof(un_addr.sun_family) + 1;   }   else   {     sap = (struct sockaddr *)&in_addr;     bzero((char *)sap, sizeof(in_addr));     in_addr.sin_port = htons(atoi(socket_name));     in_addr.sin_addr.s_addr = htonl(INADDR_ANY);     servlen = sizeof(in_addr);   }   sap->sa_family = type;        bind_ret = bind(rainbow_sockfd, sap, servlen);   assert(bind_ret >= 0);   listen(rainbow_sockfd, 5);}static void rainbow_serve(void){  int newsockfd, clilen;  struct sockaddr cli_addr;  FILE *in, *out;  clilen = sizeof(cli_addr);  newsockfd = accept(rainbow_sockfd, &cli_addr, &clilen);  assert(newsockfd >= 0);  in = fdopen(newsockfd, "r");  out = fdopen(newsockfd, "w");  while (!feof(in))    rainbow_query(in, out);  fclose(in);  fclose(out);  close(newsockfd);}#if RAINBOW_LISP/* Setup rainbow so that we can do our lisp interface. */voidrainbow_lisp_setup (char *datadirname){  /* Defined in deflexer.c */  extern void _bow_default_lexer_init ();  /* Defined in naivebayes.c */  extern void _register_method_crossentropy ();  extern void _register_method_naivebayes ();  /* Defined in tfidf.c */  extern void _register_method_tfidf_words ();  extern void _register_method_tfidf_log_words ();  extern void _register_method_tfidf_log_occur ();  /* Defined in prind.c */  extern void _register_method_prind ();  char *dirname = bow_malloc (strlen (datadirname) + 1);  int argc;  static char *argv[] = {    "rainbow-lisp-interface",    "-q",    "-H",    "-h",    "-s",    "-b",    "-m", "kl",/*    "--lex-pipe-command", "/afs/cs/project/theo-9/webkb/univ4.rainbow/tag-digits.pl", */    "-d", 0,    0};  for (argc = 0; argv[argc]; argc++);  strcpy (dirname, datadirname);  argv[argc] = dirname;  for (argc = 0; argv[argc]; argc++);  /* Since this was dynamically loaded, the __attribute__((constructor))     functions weren't called.  Call them now. */  _bow_default_lexer_init ();  _register_method_crossentropy ();  _register_method_naivebayes ();  _register_method_tfidf_words ();  _register_method_tfidf_log_words ();  _register_method_tfidf_log_occur ();  _register_method_prind ();  _register_method_kl ();  _register_method_evi ();  /* Default command-line argument values */  rainbow_arg_state.what_doing = rainbow_indexing;  rainbow_arg_state.query_filename = NULL;  rainbow_arg_state.output_filename = NULL;  rainbow_arg_state.num_trials = 0;  rainbow_arg_state.test_percentage = 30;  rainbow_arg_state.infogain_words_to_print = 10;  rainbow_arg_state.printing_class = 0;  rainbow_arg_state.non_option_argi = 0;  rainbow_arg_state.repeat_query = 0;  rainbow_arg_state.vocab_map = NULL;  rainbow_arg_state.use_lisp_score_truncation = 1;  rainbow_arg_state.loo_cv = 0;  argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state);  rainbow_unarchive ();  if (bow_argp_method)    rainbow_doc_barrel->method = bow_argp_method;  else    rainbow_doc_barrel->method = rainbow_default_method;  /*  if (rainbow_doc_barrel->method != rainbow_class_barrel->method)    { */      bow_barrel_free (rainbow_class_barrel);      rainbow_class_barrel = 	bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, 					 rainbow_classnames);      /*    } */}/* Classify the text in the file QUERY_FILE, and return the    class scores (in sorted order) in SCORES.  NUM_SCORES indicates   the maximum number of slots for which space is allocated in SCORES. */intrainbow_lisp_query (const char *query_file,		    bow_score *scores, int num_scores){  /* Show as many hits as there are classes. */  int actual_num_scores;  bow_wv *query_wv;  /* Get the query text, and create a "word vector" from the query text. */  if (query_file)    {      FILE *fp;      fp = bow_fopen (query_file, "r");      query_wv = bow_wv_new_from_text_fp (fp);      fclose (fp);    }  else    {      bow_verbosify (bow_quiet, 		     "Type your query text now.  End with a Control-D.\n");      query_wv = bow_wv_new_from_text_fp (stdin);    }  if (query_wv == NULL || query_wv->num_entries == 0)    {      return 0;    }  /* Get the best matching documents. */  bow_wv_set_weights (query_wv, rainbow_class_barrel);  bow_wv_normalize_weights (query_wv, rainbow_class_barrel);  actual_num_scores = bow_barrel_score (rainbow_class_barrel, query_wv,					scores, num_scores, -1);  bow_wv_free (query_wv);  return actual_num_scores;}#endif /* RAINBOW_LISP *//* Run test trials, outputing results to TEST_FP.  The results are   indended to be read and processed by the Perl script   ./rainbow-stats. */voidrainbow_test (FILE *test_fp){  int tn;			/* trial number */  int num_test_docs;		/* how many doc's will be for testing */  bow_dv_heap *test_heap;	/* we'll extract test WV's from here */  bow_wv *query_wv;  int di;			/* a document index */  bow_score *hits;  int num_hits_to_retrieve = rainbow_class_barrel->cdocs->length;  int actual_num_hits;  int hi;			/* hit index */  bow_cdoc *doc_cdoc;  bow_cdoc *class_cdoc;  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);  /* Calculate the number of testing documents according to TEST_PERCENTAGE. */  num_test_docs = (rainbow_doc_barrel->cdocs->length 		   * rainbow_arg_state.test_percentage) / 100;  /* (Re)set the weight-setting method, if requested with `-m' argument. */  if (bow_argp_method)    rainbow_doc_barrel->method = bow_argp_method;  if (rainbow_arg_state.vocab_map)    {      bow_barrel_prune_words_not_in_map (rainbow_doc_barrel,					 rainbow_arg_state.vocab_map);    }  /* Loop once for each trial. */  for (tn = 0; tn < rainbow_arg_state.num_trials; tn++)    {      fprintf (test_fp, "#%d\n", tn);      /* Randomly set which doc's are for training and which are testing. */      bow_test_split (rainbow_doc_barrel, num_test_docs);      if (bow_uniform_class_priors)	bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel);      if (bow_prune_vocab_by_infogain_n)	{	  /* Change barrel by removing words with small information gain. */	  bow_barrel_keep_top_words_by_infogain	    (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, 	     rainbow_class_barrel->cdocs->length);	}      assert (rainbow_arg_state.test_percentage > 0 	      && rainbow_arg_state.test_percentage < 100);      /* Re-create the vector-per-class barrel in accordance with the	 new train/test settings. */      {	int num_classes = rainbow_class_barrel->cdocs->length;	bow_barrel_free (rainbow_class_barrel);	rainbow_class_barrel = 	  bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, 					   rainbow_classnames,					   num_classes);      }      /* Create the heap from which we'll get WV's. */      test_heap = bow_test_new_heap (rainbow_doc_barrel);      /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */      query_wv = NULL;      /* Loop once for each test document. */      while ((di = bow_test_next_wv (test_heap, rainbow_doc_barrel, &query_wv))	     != -1)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -