📄 opts.c

📁 贝叶斯学习算法分类文本。基于朴素贝叶斯分类器的文本分类的通用算法
💻 C
字号:
/* Handling command-line options that apply across the whole of libbow. */#include <argp.h>#include <bow/libbow.h>/* For mkdir() and stat() */#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <unistd.h>/* Global variables whose value is set by bow_argp functions, but   which must be examined by some other function (called later) in   order to have any effect. *//* Remove all but the top N words by selecting words with highest   information gain */int bow_prune_vocab_by_infogain_n = 0;/* Remove words that occur less than N times */int bow_prune_vocab_by_occur_count_n = 0;/* The weight-setting and scoring method set on the command-line. */bow_method *bow_argp_method = NULL;/* The directory in which we'll store word-vector data. */const char *bow_data_dirname = NULL;/* If non-zero, print to stdout the contribution of each word to   each class. */int bow_print_word_scores = 0;/* If non-zero, use equal prior probabilities on classes when setting   weights, calculating infogain and scoring */int bow_uniform_class_priors = 0;/* If non-zero, use binary occurrence counts for words. */int bow_binary_word_counts = 0;/* Don't lex any files with names matching this. */const char *bow_exclude_filename = NULL;/* Pipe the files through this shell command before lexing. */const char *bow_lex_pipe_command = NULL;/* If non-zero, check for eencoding blocks before istext() says that   the file is text. */int bow_istext_avoid_uuencode = 0;/* Value added to key to get the key of the opposite option.  For   example "do not use stoplist" has key 's'; "use stoplist" has key   's'+KEY_OPPOSITE. */#define KEY_OPPOSITE 256enum {  APPEND_STOPLIST_FILE_KEY = 10000,  PRINT_WORD_SCORES_KEY,  UNIFORM_CLASS_PRIORS_KEY,  NAIVEBAYES_SCORE_WITH_LOG_PROBS_KEY,  BINARY_WORD_COUNTS_KEY,  EXCLUDE_FILENAME_KEY,  LEX_PIPE_COMMAND_KEY,  ISTEXT_AVOID_UUENCODE_KEY};static struct argp_option bow_options[] ={  {0, 0, 0, 0,   "General options", 1},  {"verbosity", 'v', "LEVEL", 0,   "Set amount of info printed while running; "   "(0=silent, 1=quiet, 2=show-progess,...5=max)"},  {"no-backspaces", 'b', 0, 0,   "Don't use backspace when verbosifying progress (good for use in emacs)"},  {"data-dir", 'd', "DIR", 0,   "Set the directory in which to read/write word-vector data "   "(default=~/.<program_name>)."},  {0, 0, 0, 0,   "Lexing options", 2},  {"skip-header", 'h', 0, 0,   "Avoid lexing news/mail headers by scanning forward until two newlines."},  {"no-stoplist", 's', 0, 0,   "Do not toss lexed words that appear in the stoplist."},  {"use-stoplist", 's'+KEY_OPPOSITE, 0, 0,   "Toss lexed words that appear in the stoplist. "   "(usually the default, depending on lexer)"},  {"append-stoplist-file", APPEND_STOPLIST_FILE_KEY, "FILE", 0,   "Add words in FILE to the stoplist."},  {"no-stemming", 'S'+KEY_OPPOSITE, 0, 0,   "Do not modify lexed words with a stemming function. "   "(usually the default, depending on lexer)"},  {"use-stemming", 'S', 0, 0,   "Modify lexed words with the `Porter' stemming function."},  {"gram-size", 'g', "N", 0,   "Create tokens for all 1-grams,... N-grams."},  {"exclude-filename", EXCLUDE_FILENAME_KEY, "FILENAME", 0,   "When scanning directories for text files, skip files with name "   "matching FILENAME."},  {0, 0, 0, 0,   "Mutually exclusive choice of lexers", 3},  {"skip-html", 'H', 0, 0,   "Skip HTML tokens when lexing."},  {"keep-html", 'H'+KEY_OPPOSITE, 0, OPTION_HIDDEN,   "Treat HTML tokens the same as any other chars when lexing. (default)"},  {"lex-for-usenet", 'U', 0, 0,   "Use a special lexer for UseNet articles, ignore some headers and "   "uuencoded blocks."},  {"lex-pipe-command", LEX_PIPE_COMMAND_KEY, "SHELLCMD", 0,   "Pipe files through this shell command before lexing them."},  {"istext-avoid-uuencode", ISTEXT_AVOID_UUENCODE_KEY, 0, 0,   "Check for uuencoded blocks before saying that the file is text, "   "and say no if there are many lines of the same length."},  {0, 0, 0, 0,   "Feature-selection options", 4},  {"prune-vocab-by-infogain", 'T', "N", 0,   "Remove all but the top N words by selecting words with highest "   "information gain."},  {"prune-vocab-by-occur-count", 'O', "N", 0,   "Remove words that occur less than N times."},  {0, 0, 0, 0,   "Weight-vector setting/scoring method options", 5},  {"method", 'm', "METHOD", 0,   "Set the weight-method; METHOD may be one of: "},  {"print-word-scores", PRINT_WORD_SCORES_KEY, 0, 0,   "During scoring, print the contribution of each word to each class."},  {"uniform-class-priors", UNIFORM_CLASS_PRIORS_KEY, 0, 0,   "When setting weights, calculating infogain and scoring, use equal prior "   "probabilities on classes."},  {"binary-word-counts", BINARY_WORD_COUNTS_KEY, 0, 0,   "Instead of using integer occurrence counts of words to set weights, "   "use binary absence/presence."},  {0, 0}};static intparse_bow_opt (int opt, char *arg, struct argp_state *state){  switch (opt)    {    case 'v':      bow_verbosity_level = atoi (optarg);      break;    case 'b':      /* Don't print backspaces when verbosifying at level BOW_PROGRESS. */      bow_verbosity_use_backspace = 0;      break;    case 'd':      /* Set name of the directory in which we'll store word-vector data. */      bow_data_dirname = arg;      break;      /* Lexing options. */    case 'h':      /* Avoid lexing news/mail headers by scanning fwd until two newlines */      bow_default_lexer->document_start_pattern = "\n\n";      break;    case 's':      /* Do not toss lexed words that appear in the stoplist */      bow_default_lexer_simple->stoplist_func = NULL;      break;    case 's'+KEY_OPPOSITE:      /* Toss lexed words that appear in the stoplist */      bow_default_lexer_simple->stoplist_func = bow_stoplist_present;      break;    case 'S':      /* Modify lexed words with the `Porter' stemming function */      bow_default_lexer_simple->stem_func = bow_stem_porter;      break;    case 'S'+KEY_OPPOSITE:      /* Do not modify lexed words with a stemming function. (default) */      bow_default_lexer_simple->stem_func = NULL;      break;    case APPEND_STOPLIST_FILE_KEY:      bow_stoplist_add_from_file (arg);      break;    case 'g':      /* Create tokens for all 1-grams,... N-grams */      {	int n = atoi (arg);	if (n == 0)	  {	    fprintf (stderr,		     "--gram-size, -N:  gram size must be a positive int\n");	    return ARGP_ERR_UNKNOWN;	  }	bow_default_lexer_gram->gram_size = n;      }    case 'H':      /* Skip HTML tokens when lexing */      bow_default_lexer_indirect->underlying_lexer = 	(bow_lexer*) bow_default_lexer_html;      break;    case 'H'+KEY_OPPOSITE:      /* Treat HTML tokens the same as any other chars when lexing (default) */      bow_default_lexer_indirect->underlying_lexer = 	(bow_lexer*) bow_default_lexer_simple;      break;    case 'U':      /* Use a special lexer for UseNet articles, ignore some headers and	 uuencoded blocks. */      bow_default_lexer_indirect->underlying_lexer = 	(bow_lexer*) bow_default_lexer_email;      break;    case EXCLUDE_FILENAME_KEY:      bow_exclude_filename = arg;      break;    case LEX_PIPE_COMMAND_KEY:      bow_lex_pipe_command = arg;      break;    case ISTEXT_AVOID_UUENCODE_KEY:      bow_istext_avoid_uuencode = 1;      break;      /* Feature selection options. */    case 'T':      /* Remove all but the top N words by selecting words with highest 	 information gain */      bow_prune_vocab_by_infogain_n = atoi (arg);      break;    case 'O':      /* Remove words that occur less than N times */      bow_prune_vocab_by_occur_count_n = atoi (arg);      break;    case 'm':      bow_argp_method = bow_method_at_name (arg);      break;    case PRINT_WORD_SCORES_KEY:      bow_print_word_scores = 1;      break;    case UNIFORM_CLASS_PRIORS_KEY:      bow_uniform_class_priors = 1;      break;    case BINARY_WORD_COUNTS_KEY:      /* Use binary absence/presence, instead of integer occurrence         counts for words. */      bow_binary_word_counts = 1;      break;    case ARGP_KEY_INIT:      /* Things to do before any arguments are processed. */      /* If the file ./.bow-stopwords exists, load the extra words into	 the stoplist. */      bow_stoplist_add_from_file ("./.bow-stopwords");      /* If the file ~/.bow-stopwords exists, load the extra words into	 the stoplist. */      {	const char sw_fn[] = "/.bow-stopwords";	const char *home = getenv ("HOME");	char filename[strlen (home) + strlen (sw_fn) + 1];	strcpy (filename, home);	strcat (filename, sw_fn);	bow_stoplist_add_from_file (filename);          }       /* Build the default data directory name, in case it wasn't	 specified on the command line. */      assert (program_invocation_short_name);      if (!bow_data_dirname)	{	  char *homedir = getenv ("HOME");	  char *dirname = bow_malloc (strlen (homedir) 				      + strlen ("/.")				      + strlen (program_invocation_short_name)				      + 1);	  strcpy (dirname, homedir);	  strcat (dirname, "/.");	  strcat (dirname, program_invocation_short_name);	  bow_data_dirname = dirname;	}    case ARGP_KEY_ARG:      break;    case ARGP_KEY_END:      /* Create the data directory, if it doesn't exist already. */      {	struct stat st;	int e;	e = stat (bow_data_dirname, &st);	if (e == 0)	  {	    /* Assume this means the file exists. */	    if (!S_ISDIR (st.st_mode))	      bow_error ("`%s' already exists, but is not a directory");	  }	else	  {	    if (mkdir (bow_data_dirname, 0777) == 0)	      bow_verbosify (bow_quiet, "Created directory `%s'.\n", 			     bow_data_dirname);	    else if (errno != EEXIST)	      bow_error ("Couldn't create default data directory `%s'",			 bow_data_dirname);	  }      }    default:      return ARGP_ERR_UNKNOWN;    }  return 0;}static char *_help_filter (int key, const char *text, void *input){  char *ret;  /* Add the names of the available methods to the help text. */  if (key == 'm')    {      static const len = 1024;      char methodnames[len];      int i;      bow_method *m;      methodnames[0] = '\0';      for (i = bow_methods->array->length-1; i >= 0; i--)	{	  m = bow_sarray_entry_at_index (bow_methods, i);	  strcat (methodnames, m->name);	  strcat (methodnames, ", ");	}      strcat (methodnames, "default=naivebayes.");      assert (strlen (methodnames) < len);      ret = malloc (strlen (text) + len);      strcpy (ret, text);      strcat (ret, methodnames);      return ret;    }  return (char*)text;}/* This may be used with argp_parse to parse standard libbow startup   options, possible chained onto the end of a user argp structure.  */const struct argp bow_argp ={  bow_options,			/* data structure describing cmdline options */  parse_bow_opt,		/* the function to handle the options */  0,				/* non-option argument documention */  0,				/* extra text printed before and after help */  0,				/* argp children */  _help_filter};#define MAX_NUM_CHILDREN 10struct argp_child bow_argp_children[MAX_NUM_CHILDREN] ={  {    &bow_argp,			/* the argp structure */    0,				/* flags for child */    "Libbow options",		/* optional header */    999				/* group (general lib flags at end of help)*/  },  {0}};/* The number of children already initialized in the const assignment above. */static int bow_argp_children_length = 1;voidbow_argp_add_child (struct argp_child *child){  assert (bow_argp_children_length-1 < MAX_NUM_CHILDREN);  memcpy (bow_argp_children + bow_argp_children_length,	  child,	  sizeof (struct argp_child));  bow_argp_children_length++;#if 1  memset (bow_argp_children + bow_argp_children_length,	  0, sizeof (struct argp_child));#endif}static void_print_version (FILE *stream, struct argp_state *state){  if (argp_program_version)    /* If this is non-zero, then the program's probably defined it, so let       that take precedence over the default.  */    fprintf (stream, "%s\n", argp_program_version);  /* And because libbow is a changing, integral part, put our     information out too. */  fprintf (stream, "libbow %d.%d\n",	   BOW_MAJOR_VERSION, BOW_MINOR_VERSION);}void (*argp_program_version_hook) (FILE *stream, struct argp_state *state)     = _print_version;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -