📄 opts.c
字号:
/* Handling command-line options that apply across the whole of libbow. */#include <argp.h>#include <bow/libbow.h>/* For mkdir() and stat() */#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <unistd.h>#include <string.h>/* Global variables whose value is set by bow_argp functions, but which must be examined by some other function (called later) in order to have any effect. *//* Flag to indicate whether ARG... files should be interpreted as HDB databases */int bow_hdb = 0;/* Remove all but the top N words by selecting words with highest information gain */int bow_prune_vocab_by_infogain_n = 0;/* Remove words that occur less than N times */int bow_prune_vocab_by_occur_count_n = 0;/* The weight-setting and scoring method set on the command-line. */bow_method *bow_argp_method = NULL;/* The directory in which we'll store word-vector data. */const char *bow_data_dirname = NULL;/* If non-zero, print to stdout the contribution of each word to each class. */int bow_print_word_scores = 0;/* If non-zero, use equal prior probabilities on classes when setting weights, calculating infogain and scoring */int bow_uniform_class_priors = 0;/* If non-zero, use binary occurrence counts for words. */int bow_binary_word_counts = 0;/* Don't lex any files with names matching this. */const char *bow_exclude_filename = NULL;/* Pipe the files through this shell command before lexing. */const char *bow_lex_pipe_command = NULL;/* File containing the annotations to display for each file */const char *bow_annotation_filename = NULL;/* If non-zero, check for eencoding blocks before istext() says that the file is text. */int bow_istext_avoid_uuencode = 0;/* Number of decimal places to print when printing classification scores */int bow_score_print_precision = 10;/* Which smoothing method to use to avoid zero word probabilities */bow_smoothing bow_smoothing_method = bow_smoothing_laplace;/* Remove words that occur in this many or fewer documents. */int bow_prune_words_by_doc_count_n = 0;/* Random seed to use for srand, if not equal to -1 */int bow_random_seed = -1;/* What "event-model" we will use for the probabilistic models. */bow_event_models bow_event_model = bow_event_word;/* What "event-model" we will use for calculating information gain of words with classes. */bow_event_models bow_infogain_event_model = bow_event_document;/* When using the bow_event_document_then_word event model, we normalize the length of all the documents. This determines the normalized length. */int bow_event_document_then_word_document_length = 200;/* Smooth words that occur k or fewer times for Good-Turing smoothing */int bow_smoothing_goodturing_k = 7;/* The filename containing the dirichlet alphas */const char *bow_smoothing_dirichlet_filename = NULL;/* Only tokenize words containing `xxx' */int bow_xxx_words_only = 0;/* The weighting factor for the alphas */float bow_smoothing_dirichlet_weight = 1.0;/* Value added to key to get the key of the opposite option. For example "do not use stoplist" has key 's'; "use stoplist" has key 's'+KEY_OPPOSITE. */#define KEY_OPPOSITE 256enum { APPEND_STOPLIST_FILE_KEY = 10000, PRINT_WORD_SCORES_KEY, UNIFORM_CLASS_PRIORS_KEY, NAIVEBAYES_SCORE_WITH_LOG_PROBS_KEY, BINARY_WORD_COUNTS_KEY, EXCLUDE_FILENAME_KEY, LEX_PIPE_COMMAND_KEY, ISTEXT_AVOID_UUENCODE_KEY, LEX_WHITE_KEY, LEX_ALPHANUM_KEY, LEX_SUFFIXING_KEY, LEX_INFIX_KEY, SHORTEST_WORD_KEY, FLEX_MAIL_KEY, FLEX_TAGGED_KEY, REPLACE_STOPLIST_FILE_KEY, SCORE_PRINT_PRECISION, SMOOTHING_METHOD_KEY, SPLIT_SEED, EVENT_MODEL_KEY, EVENT_DOC_THEN_WORD_DOC_LENGTH_KEY, INFOGAIN_EVENT_MODEL_KEY, SMOOTHING_GOODTURING_K, HDB_KEY, ANNOTATION_KEY, SMOOTHING_DIRICHLET_FILENAME, SMOOTHING_DIRICHLET_WEIGHT, XXX_WORDS_ONLY_KEY, MAX_NUM_WORDS_PER_DOCUMENT_KEY, USE_UNKNOWN_WORD_KEY,};static struct argp_option bow_options[] ={ {0, 0, 0, 0, "General options", 1}, {"verbosity", 'v', "LEVEL", 0, "Set amount of info printed while running; " "(0=silent, 1=quiet, 2=show-progess,...5=max)"}, {"no-backspaces", 'b', 0, 0, "Don't use backspace when verbosifying progress (good for use in emacs)"}, {"data-dir", 'd', "DIR", 0, "Set the directory in which to read/write word-vector data " "(default=~/.<program_name>)."}, {"score-precision", SCORE_PRINT_PRECISION, "NUM", 0, "The number of decimal digits to print when displaying document scores"}, {"random-seed", SPLIT_SEED, "NUM", 0, "The non-negative integer to use for seeding the random number generator"}, {"annotations", ANNOTATION_KEY, "FILE", 0, "The sarray file containing annotations for the files in the index"},#if HAVE_HDB {"hdb", HDB_KEY, 0, 0, "Assume ARG... names are HDB databases. May not be used with " "--lex-pipe-command. Only useful with --index option. Currently only " "works with rainbow and arrow"},#endif {0, 0, 0, 0, "Lexing options", 2}, {"skip-header", 'h', 0, 0, "Avoid lexing news/mail headers by scanning forward until two newlines."}, {"no-stoplist", 's', 0, 0, "Do not toss lexed words that appear in the stoplist."}, {"use-stoplist", 's'+KEY_OPPOSITE, 0, 0, "Toss lexed words that appear in the stoplist. " "(usually the default SMART stoplist, depending on lexer)"}, {"append-stoplist-file", APPEND_STOPLIST_FILE_KEY, "FILE", 0, "Add words in FILE to the stoplist."}, {"replace-stoplist-file", REPLACE_STOPLIST_FILE_KEY, "FILE", 0, "Empty the default stoplist, and add space-delimited words from FILE."}, {"no-stemming", 'S'+KEY_OPPOSITE, 0, 0, "Do not modify lexed words with a stemming function. " "(usually the default, depending on lexer)"}, {"use-stemming", 'S', 0, 0, "Modify lexed words with the `Porter' stemming function."}, {"shortest-word", SHORTEST_WORD_KEY, "LENGTH", 0, "Toss lexed words that are shorter than LENGTH. Default is usually 2."}, {"gram-size", 'g', "N", 0, "Create tokens for all 1-grams,... N-grams."}, {"exclude-filename", EXCLUDE_FILENAME_KEY, "FILENAME", 0, "When scanning directories for text files, skip files with name " "matching FILENAME."}, {"istext-avoid-uuencode", ISTEXT_AVOID_UUENCODE_KEY, 0, 0, "Check for uuencoded blocks before saying that the file is text, " "and say no if there are many lines of the same length."}, {"lex-pipe-command", LEX_PIPE_COMMAND_KEY, "SHELLCMD", 0, "Pipe files through this shell command before lexing them."}, {"xxx-words-only", XXX_WORDS_ONLY_KEY, 0, 0, "Only tokenize words with `xxx' in them"}, {"max-num-words-per-document", MAX_NUM_WORDS_PER_DOCUMENT_KEY, "N", 0, "Only tokenize the first N words in each document."}, {"use-unknown-word", USE_UNKNOWN_WORD_KEY, 0, 0, "When used in conjunction with -O or -D, captures all words with " "occurrence counts below threshold as the `<unknown>' token"}, {0, 0, 0, 0, "Mutually exclusive choice of lexers", 3}, {"skip-html", 'H', 0, 0, "Skip HTML tokens when lexing."}, {"keep-html", 'H'+KEY_OPPOSITE, 0, OPTION_HIDDEN, "Treat HTML tokens the same as any other chars when lexing. (default)"}, {"lex-white", LEX_WHITE_KEY, 0, 0, "Use a special lexer that delimits tokens by whitespace only, and " "does not change the contents of the token at all---no downcasing, " "no stemming, no stoplist, nothing. Ideal for use with an externally-" "written lexer interfaced to rainbow with --lex-pipe-cmd."}, {"lex-alphanum", LEX_ALPHANUM_KEY, 0, 0, "Use a special lexer that includes digits in tokens, delimiting tokens " "only by non-alphanumeric characters."}, {"lex-suffixing", LEX_SUFFIXING_KEY, 0, 0, "Use a special lexer that adds suffixes depending on Email-style headers."}, {"lex-infix-string", LEX_INFIX_KEY, "ARG", 0, "Use only the characters after ARG in each word for stoplisting and " "stemming. If a word does not contain ARG, the entire word is used."}, {"flex-mail", FLEX_MAIL_KEY, 0, 0, "Use a mail-specific flex lexer"}, {"flex-tagged", FLEX_TAGGED_KEY, 0, 0, "Use a tagged flex lexer"}, {"lex-for-usenet", 'U', 0, OPTION_HIDDEN, "Use a special lexer for UseNet articles, ignore some headers and " "uuencoded blocks."}, {0, 0, 0, 0, "Feature-selection options", 4}, {"prune-vocab-by-infogain", 'T', "N", 0, "Remove all but the top N words by selecting words with highest " "information gain."}, {"prune-vocab-by-occur-count", 'O', "N", 0, "Remove words that occur less than N times."}, {"prune-vocab-by-doc-count", 'D', "N", 0, "Remove words that occur in N or fewer documents."}, {0, 0, 0, 0, "Weight-vector setting/scoring method options", 5}, {"method", 'm', "METHOD", 0, "Set the word weight-setting method; METHOD may be one of: "}, {"print-word-scores", PRINT_WORD_SCORES_KEY, 0, 0, "During scoring, print the contribution of each word to each class."}, {"uniform-class-priors", UNIFORM_CLASS_PRIORS_KEY, 0, 0, "When setting weights, calculating infogain and scoring, use equal prior " "probabilities on classes."}, {"binary-word-counts", BINARY_WORD_COUNTS_KEY, 0, 0, "Instead of using integer occurrence counts of words to set weights, " "use binary absence/presence."}, {"smoothing-method", SMOOTHING_METHOD_KEY, "METHOD", 0, "Set the method for smoothing word probabilities to avoid zeros; " "METHOD may be one of: goodturing, laplace, mestimate, wittenbell"}, {"smoothing-goodturing-k", SMOOTHING_GOODTURING_K, "NUM", 0, "Smooth word probabilities for words that occur NUM or less times. " "The default is 7."}, {"smoothing-dirichlet-filename", SMOOTHING_DIRICHLET_FILENAME, "FILE", 0, "The file containing the alphas for the dirichlet smoothing."}, {"smoothing-dirichlet-weight", SMOOTHING_DIRICHLET_WEIGHT, "NUM", 0, "The weighting factor by which to muliply the alphas for dirichlet " "smoothing."}, {"event-model", EVENT_MODEL_KEY, "EVENTNAME", 0, "Set what objects will be considered the `events' of the probabilistic " "model. EVENTNAME can be one of: word, document, document-then-word. " "Default is `word'."}, {"event-document-then-word-document-length", EVENT_DOC_THEN_WORD_DOC_LENGTH_KEY, "NUM", 0, "Set the normalized length of documents when " "--event-model=document-then-word"}, {"infogain-event-model", INFOGAIN_EVENT_MODEL_KEY, "EVENTNAME", 0, "Set what objects will be considered the `events' when information gain " "is calculated. " "EVENTNAME can be one of: word, document, document-then-word. " "Default is `document'."}, {0, 0}};static intparse_bow_opt (int opt, char *arg, struct argp_state *state){ switch (opt) { case 'v': bow_verbosity_level = atoi (optarg); break; case 'b': /* Don't print backspaces when verbosifying at level BOW_PROGRESS. */ bow_verbosity_use_backspace = 0; break; case 'd': /* Set name of the directory in which we'll store word-vector data. */ bow_data_dirname = arg; break; case SCORE_PRINT_PRECISION: /* Set the number of digits to print */ bow_score_print_precision = atoi(optarg); break; case SPLIT_SEED: /* Set the seed for the random number generator */ bow_random_seed = atoi (optarg); if (bow_random_seed < 0) { fprintf (stderr, "--split-seed: Seed must be non-negative.\n"); return ARGP_ERR_UNKNOWN; } break;#if HAVE_HDB case HDB_KEY: bow_hdb = 1; if (bow_lex_pipe_command) bow_error ("--hdb and --lex-pipe-command options cannot be used in" " conjunction\n"); break;#endif /* Lexing options. */ case 'h': /* Avoid lexing news/mail headers by scanning fwd until two newlines */ bow_lexer_document_start_pattern = "\n\n"; break; case 'g': /* Create tokens for all 1-grams,... N-grams */ { int n = atoi (arg); if (n <= 0) { fprintf (stderr, "--gram-size, -N: gram size must be a positive int\n"); return ARGP_ERR_UNKNOWN; } else if (n > 1) { bow_lexer_gram *lex = bow_malloc (sizeof (bow_lexer_gram)); memcpy (lex, bow_gram_lexer, sizeof (bow_lexer_gram)); lex->gram_size = n; lex->lexer.next = bow_default_lexer; bow_default_lexer = (bow_lexer*) lex; } break; } case 'H': /* Skip HTML tokens when lexing */ { bow_lexer *lex = bow_malloc (sizeof (bow_lexer));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -