📄 rainbow.c

📁 贝叶斯学习算法分类文本。基于朴素贝叶斯分类器的文本分类的通用算法
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* rainbow - a document classification front-end to libbow. */

/* Copyright (C) 1997 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <argp.h>
#include <errno.h>              /* needed on DEC Alpha's */
#include <unistd.h>             /* for getopt(), maybe */
#include <stdlib.h>             /* for atoi() */
#include <string.h>             /* for strrchr() */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>

static int rainbow_sockfd;

/* The version number of this program. */
#define RAINBOW_MAJOR_VERSION 0
#define RAINBOW_MINOR_VERSION 2

#define rainbow_default_method (&bow_method_naivebayes)

/* Definitions for using argp command-line processing */

const char *argp_program_version =
"rainbow " STRINGIFY(RAINBOW_MAJOR_VERSION) "." STRINGIFY(RAINBOW_MINOR_VERSION);

const char *argp_program_bug_address = "<mccallum@cs.cmu.edu>";

static char rainbow_argp_doc[] =
"Rainbow -- a document classification front-end to libbow";

static char rainbow_argp_args_doc[] = "[ARG...]";

/* Keys for options that have no single-character short form.  They
   are chosen well above the range of printable characters so that
   they cannot collide with a short option letter. */
#define PRINT_COUNTS_FOR_WORD_KEY 10000
#define INFOGAIN_PAIR_VECTOR_KEY 10001
#define USE_VOCAB_IN_FILE_KEY 10002
#define NO_LISP_SCORE_TRUNCATION_KEY 10003
#define SERVER_KEY 10004

/* The option table handed to argp.  Entries whose name is 0 are group
   headers; the trailing integer is the group number.  The help
   strings are user-visible text. */
static struct argp_option rainbow_options[] =
{
  {0, 0, 0, 0,
   "For building data structures from text files:", 1},
  {"index", 'i', 0, 0,
   "Tokenize training documents found under directories ARG... "
   "(where each ARG directory contains documents of a different class), "
   "build weight vectors, and save them to disk."},

  {0, 0, 0, 0,
   "For doing document classification using the data structures "
   "built with -i:", 2},
  {"query", 'q', "FILE", OPTION_ARG_OPTIONAL,
   "Tokenize input from stdin [or FILE], then print classification scores."},
  {"output-text", 'o', "FILE", OPTION_HIDDEN,
   "Intead of outputing the classnames, output the contents of FILE in the "
   "data directory of the winning class, (for use as email auto-answer)."},
  {"repeat", 'r', 0, 0,
   "Prompt for repeated queries."},
  {"query-server", SERVER_KEY, "PORTNUM", 0,
   "Run rainbow in server mode."},

  {0, 0, 0, 0,
   "Method parameter options:", 3},
  {"prind-non-uniform-priors", 'U', 0, 0,
   "Make PrInd use non-uniform class priors."},
  {"prind-no-foilgain-weight-scaling", 'G', 0, 0,
   "Don't have PrInd scale its weights by Quinlan's FoilGain."},
  {"prind-no-score-normalization", 'N', 0, 0,
   "Don't have PrInd normalize its class scores to sum to one."},
  {"use-vocab-in-file", USE_VOCAB_IN_FILE_KEY, "FILE", 0,
   "Limit vocabulary to just those words lexed from FILE."},

  {0, 0, 0, 0,
   "Testing documents that were indexed with `-i':", 4},
  {"test", 't', "N", 0,
   "Perform N test/train splits of the indexed documents, and output "
   "classifications of all test documents each time."},
  {"test-percentage", 'p', "P", 0,
   "Use P percent of the indexed documents as test data."},
  {"no-lisp-score-truncation", NO_LISP_SCORE_TRUNCATION_KEY, 0, 0,
   "Normally scores that are lower than 1e-35 are printed as 0, "
   "because our LISP reader can't handle floating point numbers smaller "
   "than 1e-35.  This option turns off that truncation."},

  {0, 0, 0, 0,
   "Testing documents that are specified on the command line:", 5},
  {"test-files", 'x', 0, 0,
   "In same format as `-t', output classifications of documents in "
   "the directory ARG  The ARG must have the same subdir names as the "
   "ARG's specified when --index'ing."},
  {"test-files-loo", 'X', 0, 0,
   "Same as --test-files, but evaulate the files assuming that they "
   "were part of the training data, and doing leave-one-out "
   "cross-validation."},

  {0, 0, 0, 0,
   "Diagnostics:", 6},
  {"infogain-vector", 'I', "N", 0,
   "Print the N words with the highest information gain."},
  {"infogain-pair-vector", INFOGAIN_PAIR_VECTOR_KEY, "N", 0,
   "Print the N word-pairs, which when co-occuring in a document, have "
   "the highest information gain.  (Unfinished; ignores N.)"},
  {"weight-vector", 'W', "CLASSNAME", 0,
   "Print the word/weight vector for CLASSNAME, "
   "sorted with high weights first."},
  {"foilgain-vector", 'F', "CLASSNAME", 0,
   "Print the word/foilgain vector for CLASSNAME."},
  {"print-barrel", 'B', 0, 0,
   "Print the word/document count matrix in an awk- or perl-accessible "
   "format."},
  {"print-word-counts", PRINT_COUNTS_FOR_WORD_KEY, "WORD", 0,
   "Print the number of times WORD occurs in each class."},
  {"print-counts-for-word", PRINT_COUNTS_FOR_WORD_KEY, "WORD",
   OPTION_ALIAS | OPTION_HIDDEN},

  { 0 }
};

/* Everything rainbow_parse_opt() learns from the command line.  There
   is a single, file-scope instance with the same name as the tag. */
struct rainbow_arg_state
{
  /* Is this invocation of rainbow to do indexing or querying? */
  enum {
    rainbow_indexing,
    rainbow_querying,
    rainbow_query_serving,
    rainbow_testing,            /* many queries, from train/test split */
    rainbow_file_testing,       /* many queries, from a directory */
    rainbow_infogain_printing,
    rainbow_infogain_pair_printing,
    rainbow_weight_vector_printing,
    rainbow_foilgain_printing,
    rainbow_barrel_printing,
    rainbow_word_count_printing,
  } what_doing;
  /* Where to find query text, or if NULL get query text from stdin */
  const char *query_filename;
  /* Name of file to find in each class directory; output the contents
     of this file instead of the classname. */
  const char *output_filename;
  /* If we are doing test, how many test are we doing? */
  int num_trials;
  /* The P argument of --test-percentage. */
  int test_percentage;
  /* If we are printing info gain stats, how many of the top words? */
  int infogain_words_to_print;
  /* Used for selecting the class for which the weight-vector will be
     printed.  --print-word-counts also stores its WORD argument
     here. */
  const char *printing_class;
  /* Index into argv of the non-option args at the end (i.e. for -i
     classnames or -x filenames, etc). */
  int non_option_argi;
  /* Set to 1 by --repeat. */
  int repeat_query;
  /* Vocabulary loaded from the FILE given to --use-vocab-in-file. */
  bow_int4str *vocab_map;
  /* Cleared to 0 by --no-lisp-score-truncation. */
  int use_lisp_score_truncation;
  /* Set to 1 by --test-files-loo (leave-one-out cross-validation). */
  int loo_cv;
  /* The PORTNUM argument of --query-server, kept as a string. */
  const char *server_port_num;
} rainbow_arg_state;

/* The argp parser callback for rainbow's own options.

   KEY is an option key from RAINBOW_OPTIONS or one of the special
   ARGP_KEY_* values, ARG is the option's argument (NULL when none was
   given) and STATE is argp's parsing state.  Results are recorded in
   the file-scope RAINBOW_ARG_STATE, and some options also modify
   libbow globals directly.

   Returns 0 when KEY was handled and ARGP_ERR_UNKNOWN otherwise. */
static error_t
rainbow_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case 'q':
      rainbow_arg_state.what_doing = rainbow_querying;
      /* The argument is optional; ARG is NULL when it is omitted, which
         means "read the query from stdin". */
      rainbow_arg_state.query_filename = arg;
      break;
    case 'o':
      /* This option is declared in RAINBOW_OPTIONS; without a handler
         here argp would be told that a declared option is unknown. */
      rainbow_arg_state.output_filename = arg;
      break;
    case SERVER_KEY:
      rainbow_arg_state.what_doing = rainbow_query_serving;
      rainbow_arg_state.server_port_num = arg;
      bow_default_lexer->document_end_pattern = "\n.\r\n";
      break;
    case 'i':
      rainbow_arg_state.what_doing = rainbow_indexing;
      break;
    case 'r':
      rainbow_arg_state.repeat_query = 1;
      break;

    case 'U':
      /* Don't have PrInd use uniform class prior probabilities */
      ((bow_params_prind*)(bow_method_prind.params))->uniform_priors
        = bow_no;
      break;
    case 'G':
      /* Don't scale weights (by foilgain or anything else).  Note that
         this clears the scaling function of every registered method,
         not only PrInd's. */
      {
        int i;
        bow_method *m;
        for (i = 0; i < bow_methods->array->length; i++)
          {
            m = bow_sarray_entry_at_index (bow_methods, i);
            if (m)
              m->scale_weights = NULL;
          }
        break;
      }
    case USE_VOCAB_IN_FILE_KEY:
      rainbow_arg_state.vocab_map = bow_int4str_new_from_text_file (arg);
      break;

    /* Switches for testing */
    case 't':
      rainbow_arg_state.what_doing = rainbow_testing;
      rainbow_arg_state.num_trials = atoi (arg);
      break;
    case 'p':
      rainbow_arg_state.test_percentage = atoi (arg);
      break;
    case 'N':
      /* Don't normalize the scores from PrInd. */
      ((bow_params_prind*)(bow_method_prind.params))->normalize_scores
        = bow_no;
      break;
    case NO_LISP_SCORE_TRUNCATION_KEY:
      rainbow_arg_state.use_lisp_score_truncation = 0;
      break;

      /* Switches for file testing */
    case 'X':
      rainbow_arg_state.loo_cv = 1;
      /* fallthrough -- `-X' is `-x' plus leave-one-out evaluation. */
    case 'x':
      rainbow_arg_state.what_doing = rainbow_file_testing;
      break;

      /* Switches for diagnostics */
    case 'I':
      /* Print out ARG number of vocab words ranked by infogain. */
      rainbow_arg_state.what_doing = rainbow_infogain_printing;
      rainbow_arg_state.infogain_words_to_print = atoi (arg);
      break;
    case 'W':
      /* Print the weight-vector for the named class */
      rainbow_arg_state.what_doing = rainbow_weight_vector_printing;
      rainbow_arg_state.printing_class = arg;
      break;
    case 'F':
      /* Print the foil gain for the named class */
      rainbow_arg_state.what_doing = rainbow_foilgain_printing;
      rainbow_arg_state.printing_class = arg;
      break;
    case 'P':
      /* Print the contribution of each word to each class during
         scoring. */
      bow_print_word_scores = 1;
      break;
    case 'B':
      /* Print the barrel in awk-accessible form to stdout. */
      rainbow_arg_state.what_doing = rainbow_barrel_printing;
      break;
    case PRINT_COUNTS_FOR_WORD_KEY:
      rainbow_arg_state.what_doing = rainbow_word_count_printing;
      /* The WORD to count is carried in the `printing_class' field. */
      rainbow_arg_state.printing_class = arg;
      break;
    case INFOGAIN_PAIR_VECTOR_KEY:
      rainbow_arg_state.what_doing = rainbow_infogain_pair_printing;
      rainbow_arg_state.infogain_words_to_print = atoi (arg);
      break;

#if 0
    case ARGP_KEY_NO_ARGS:
      argp_usage (state);
#endif

    case ARGP_KEY_ARG:
      /* Now we consume all the rest of the arguments.  STATE->next is the
         index in STATE->argv of the next argument to be parsed, so the
         argument that triggered this call is at STATE->next - 1; that
         index is what we record in RAINBOW_ARG_STATE.non_option_argi.
         IN ADDITION, by setting STATE->next to the end of the arguments, we
         can force argp to stop parsing here and return.  */
      rainbow_arg_state.non_option_argi = state->next - 1;
      if (rainbow_arg_state.what_doing == rainbow_indexing
          && state->next == state->argc)
        {
          /* Only one classname is not enough. */
          fprintf (stderr, "Need data from more than one class.\n");
          argp_usage (state);
        }
      state->next = state->argc;
      break;

    case ARGP_KEY_END:
      /* STATE->arg_num is zero here exactly when the ARGP_KEY_ARG case
         above never ran, i.e. when no non-option arguments were given.
         Indexing and file testing require such arguments; every other
         mode rejects them.  */
      if (rainbow_arg_state.what_doing == rainbow_indexing
          || rainbow_arg_state.what_doing == rainbow_file_testing)
        {
          if (state->arg_num == 0)
            {
              /* Too few arguments.  */
              fprintf (stderr, "Missing non-option arguments.\n");
              argp_usage (state);
            }
        }
      else if (state->arg_num != 0)
        {
          /* Too many arguments.  */
          fprintf (stderr, "No non-option arguments needed.\n");
          argp_usage (state);
        }
      break;

    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

/* The top-level argp parser; BOW_ARGP_CHILDREN supplies the child
   parsers chained after rainbow's own options. */
static struct argp rainbow_argp =
{
  rainbow_options, rainbow_parse_opt, rainbow_argp_args_doc,
  rainbow_argp_doc, bow_argp_children
};


/* The structures that hold the data necessary for answering a query. */

bow_barrel *rainbow_doc_barrel;     /* the stats about words and documents */
bow_barrel *rainbow_class_barrel;   /* the stats about words and classes */
const char **rainbow_classnames;

/* The static structure in bow/int4word.c is also used. */


/* Writing and reading the word/document stats to disk. */

#define VOCABULARY_FILENAME "vocabulary"
#define DOC_BARREL_FILENAME "doc-barrel"
#define CLASS_BARREL_FILENAME "class-barrel"
#define OUTPUTNAME_FILENAME "outfile"
#define FORMAT_VERSION_FILENAME "format-version"

/* Write the stats in the directory DATA_DIRNAME. 
*/voidrainbow_archive (){  char filename[BOW_MAX_WORD_LENGTH];  char *fnp;  FILE *fp;  strcpy (filename, bow_data_dirname);  strcat (filename, "/");  fnp = filename + strlen (filename);  strcpy (fnp, FORMAT_VERSION_FILENAME);  bow_write_format_version_to_file (filename);  strcpy (fnp, OUTPUTNAME_FILENAME);  fp = bow_fopen (filename, "w");  if (rainbow_arg_state.output_filename)    fprintf (fp, "%s\n", rainbow_arg_state.output_filename);  fclose (fp);  strcpy (fnp, VOCABULARY_FILENAME);  fp = bow_fopen (filename, "w");  bow_words_write (fp);  fclose (fp);  strcpy (fnp, CLASS_BARREL_FILENAME);  fp = bow_fopen (filename, "w");  bow_barrel_write (rainbow_class_barrel, fp);  fclose (fp);  strcpy (fnp, DOC_BARREL_FILENAME);  fp = bow_fopen (filename, "w");  bow_barrel_write (rainbow_doc_barrel, fp);  fclose (fp);}/* Read the stats from the directory DATA_DIRNAME. */voidrainbow_unarchive (){  char filename[BOW_MAX_WORD_LENGTH];  char *fnp;  FILE *fp;  char buf[1024];  int i;  struct stat st;  int e;    if (rainbow_arg_state.what_doing != rainbow_query_serving)    bow_verbosify (bow_progress, "Loading data files...\n");  strcpy (filename, bow_data_dirname);  strcat (filename, "/");  fnp = filename + strlen (filename);  strcpy (fnp, FORMAT_VERSION_FILENAME);  e = stat (filename, &st);  if (e != 0)    {      /* Assume this means the file doesn't exist, and this archive	 was created before BOW_DEFAULT_FORMAT_VERSION was added to	 the library.  The version number before	 BOW_DEFAULT_FORMAT_VERSION was added to the library was 3. 
*/      bow_file_format_version = 3;    }  else    {      bow_read_format_version_from_file (filename);    }  strcpy (fnp, OUTPUTNAME_FILENAME);  fp = bow_fopen (filename, "r");  buf[0] = '\0';  fscanf (fp, "%s", buf);  rainbow_arg_state.output_filename = strdup (buf);  fclose (fp);  strcpy (fnp, VOCABULARY_FILENAME);  fp = bow_fopen (filename, "r");  bow_words_read_from_fp (fp);  fclose (fp);  strcpy (fnp, CLASS_BARREL_FILENAME);  fp = bow_fopen (filename, "r");  rainbow_class_barrel = bow_barrel_new_from_data_fp (fp);  /* Don't close it because bow_wi2dvf_dv will still need to read it. */  strcpy (fnp, DOC_BARREL_FILENAME);  fp = bow_fopen (filename, "r");  rainbow_doc_barrel = bow_barrel_new_from_data_fp (fp);  /* Don't close it because bow_wi2dvf_dv will still need to read it. */  /* Extract the CLASSNAMES from the class barrel. */  rainbow_classnames = bow_malloc (rainbow_class_barrel->cdocs->length				   * sizeof (char*));  for (i = 0; i < rainbow_class_barrel->cdocs->length; i++)    {      bow_cdoc *cdoc = 	bow_array_entry_at_index (rainbow_class_barrel->cdocs, i);      assert (cdoc->filename);      rainbow_classnames[i] = strdup (cdoc->filename);      assert (rainbow_classnames[i]);    }  if (bow_uniform_class_priors)    bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel);}/* Given a fully-specified file path name (all the way from `/'),   return just the last filename part of it. */static inline const char *filename_to_classname (const char *filename){  const char *ret;  ret = strrchr (filename, '/');  if (ret)    return ret + 1;  return filename;}/* Building the word/document stats. *//* Traverse the directories CLASSDIR_NAMES, gathering word/document   stats, and write the stats to disk in BOW_DATA_DIRNAME. */voidrainbow_index (int num_classes, const char *classdir_names[],	       const char *exception_name){  int class_index;
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -