⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rainbow.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 5 页
字号:
/* rainbow - a document classification front-end to libbow. */

/* Copyright (C) 1997, 1998, 1999, 2000 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <argp.h>
#include <argp/argp1.h>                         /* drapp-2/11 */
#include <setjmp.h>                         /* drapp-2/11 */
#include <errno.h>		/* needed on DEC Alpha's */
#include <unistd.h>		/* for getopt(), maybe */
#include <stdlib.h>		/* for atoi() */
#include <string.h>		/* for strrchr() */
#include <strings.h>		/* for bzero() on Solaris */
#include <sys/types.h>
#include <sys/socket.h>
#ifndef WINNT
#include <sys/un.h>
#endif /* WINNT */
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>

static int rainbow_sockfd;

/* The version number of this program. */
#define RAINBOW_MAJOR_VERSION 0
#define RAINBOW_MINOR_VERSION 2

#define rainbow_default_method (&bow_method_naivebayes)

/* Definitions for using argp command-line processing */

const char *argp_program_version =
"rainbow " STRINGIFY(RAINBOW_MAJOR_VERSION) "." STRINGIFY(RAINBOW_MINOR_VERSION);

const char *argp_program_bug_address = "<mccallum@cs.cmu.edu>";

static char rainbow_argp_doc[] =
"Rainbow -- a document classification front-end to libbow";

static char rainbow_argp_args_doc[] = "[ARG...]";

/* Keys for the options that have no single-character short form.
   Numbering starts at 10000, well away from the character keys
   ('i', 'q', 't', ...) that the option table below also uses. */
enum {
  PRINT_COUNTS_FOR_WORD_KEY = 10000,
  INFOGAIN_PAIR_VECTOR_KEY,
  USE_VOCAB_IN_FILE_KEY,
  NO_LISP_SCORE_TRUNCATION_KEY,
  SERVER_KEY,
  FORKING_SERVER_KEY,
  PRINT_DOC_NAMES_KEY,
  PRINT_LOG_ODDS_RATIO_KEY,
  WORD_PROBABILITIES_KEY,
  PRINT_WORD_PROBABILITIES_KEY,
  INDEX_MATRIX_KEY,
  HIDE_VOCAB_IN_FILE_KEY,
  HIDE_VOCAB_INDICES_IN_FILE_KEY,
  TEST_ON_TRAINING_KEY,
  VPC_ONLY_KEY,
  BUILD_AND_SAVE,
  TEST_FROM_SAVED,
  USE_SAVED_CLASSIFIER_KEY,
  PRINT_DOC_LENGTH_KEY,
  INDEX_LINES_KEY,
};

/* Command-line option table.  Entries whose name and key are both 0
   carry only a heading string and a group number; every other entry
   gives the long name, the key passed to rainbow_parse_opt(), the
   argument name (0 if the option takes none), flags, and help text. */
static struct argp_option rainbow_options[] =
{
  {0, 0, 0, 0,
   "For building data structures from text files:", 20},
  {"index", 'i', 0, 0,
   "Tokenize training documents found under directories ARG... "
   "(where each ARG directory contains documents of a different class), "
   "build token-document matrix, and save it to disk."},
  {"index-matrix", INDEX_MATRIX_KEY, "FORMAT", 0,
   "Read document/word statistics from a file in the format produced by "
   "--print-matrix=FORMAT.  See --print-matrix for details about FORMAT."},
  {"index-lines", INDEX_LINES_KEY, "FILENAME", 0,
   "Read documents' contents from the filename argument, one-per-line.  "
   "The first two "
   "space-delimited words on each line are the document name and class name "
   "respectively"},
#if VPC_ONLY
  {"vpc-only", VPC_ONLY_KEY, 0, 0,
   "Only create a vector-per-class barrel.  Do not create a document barrel.  "
   "Useful for creating barrels to be used with --query-server.  "
   "NOTE: This is a hack which assumes multinomial and a naive Bayes-like "
   "method.  Not meant for general purpose usage!"},
#endif

  {0, 0, 0, 0,
   "For doing document classification using the token-document matrix "
   "built with -i:", 21},
  {"query", 'q', "FILE", OPTION_ARG_OPTIONAL, 
   "Tokenize input from stdin [or FILE], then print classification scores."},
  {"output-text", 'o', "FILE", OPTION_HIDDEN,
   "Intead of outputing the classnames, output the contents of FILE in the "
   "data directory of the winning class, (for use as email auto-answer)."},
  {"repeat", 'r', 0, 0,
   "Prompt for repeated queries."},
  {"query-server", SERVER_KEY, "PORTNUM", 0,
   "Run rainbow in server mode, listening on socket number PORTNUM.  "
   "You can try it by executing this command, then in a different shell "
   "window on the same machine typing `telnet localhost PORTNUM'."},
  {"forking-query-server", FORKING_SERVER_KEY, "PORTNUM", 0,
   "Same as `--query-server', except allow multiple clients at once by "
   "forking for each client."},
  {"print-doc-length", PRINT_DOC_LENGTH_KEY, 0, 0,
   "When printing the classification scores for each test document, at the "
   "end also print the number of words in the document.  This only works "
   "with the --test option."},

  {0, 0, 0, 0,
   "Rainbow-specific vocabulary options:", 22},
  {"use-vocab-in-file", USE_VOCAB_IN_FILE_KEY, "FILE", 0,
   "Limit vocabulary to just those words read as space-separated strings "
   "from FILE.  Note that regular lexing is not done on these strings."},
  {"hide-vocab-in-file", HIDE_VOCAB_IN_FILE_KEY, "FILE", 0,
   "Hide from the vocabulary all words read as space-separated strings "
   "from FILE.  Note that regular lexing is not done on these strings."},
  {"hide-vocab-indices-in-file", HIDE_VOCAB_INDICES_IN_FILE_KEY, "FILE", 0,
   "Hide from the vocabulary all words read as space-separated word "
   "integer indices from FILE."},

  {0, 0, 0, 0,
   "Testing documents that were indexed with `-i':", 23},
  {"test", 't', "N", 0,
   "Perform N test/train splits of the indexed documents, and output "
   "classifications of all test documents each time.  The parameters of "
   "the test/train splits are determined by the option `--test-set' "
   "and its siblings"},
  {"test-on-training", TEST_ON_TRAINING_KEY, "N", 0,
   "Like `--test', but instead of classifing the held-out test documents "
   "classify the training data in leave-one-out fashion.  Perform N trials."},
#if 0
  {"no-lisp-score-truncation", NO_LISP_SCORE_TRUNCATION_KEY, 0, 0,
   "Normally scores that are lower than 1e-35 are printed as 0, "
   "because our LISP reader can't handle floating point numbers smaller "
   "than 1e-35.  This option turns off that truncation."},
#endif

  {0, 0, 0, 0,
   "Testing documents that are specified on the command line:", 5},
  {"test-files", 'x', 0, 0,
   "In same format as `-t', output classifications of documents in "
   "the directory ARG  The ARG must have the same subdir names as the "
   "ARG's specified when --index'ing."},
  {"test-files-loo", 'X', 0, 0,
   "Same as --test-files, but evaulate the files assuming that they "
   "were part of the training data, and doing leave-one-out "
   "cross-validation.  This only works with the classification methods "
   "that support leave-one-out evaluation"},

  {0, 0, 0, 0,
   "Diagnostics:", 24},
  {"print-word-infogain", 'I', "N", 0,
   "Print the N words with the highest information gain."},
  {"print-word-pair-infogain", INFOGAIN_PAIR_VECTOR_KEY, "N", 0,
   "Print the N word-pairs, which when co-occuring in a document, have "
   "the highest information gain.  (Unfinished; ignores N.)"},
  {"print-log-odds-ratio", PRINT_LOG_ODDS_RATIO_KEY, "N", OPTION_ARG_OPTIONAL,
   "For each class, print the N words with the highest log odds ratio score.  "
   "Default is N=10."},
  {"print-word-weights", 'W', "CLASSNAME", 0,
   "Print the word/weight vector for CLASSNAME, "
   "sorted with high weights first.  The meaning of `weight' is undefined."},
  {"print-word-probabilities", PRINT_WORD_PROBABILITIES_KEY, "CLASS", 0,
   "Print P(w|CLASS), the probability in class CLASS "
   "of each word in the vocabulary."},
  {"print-word-foilgain", 'F', "CLASSNAME", 0,
   "Print the word/foilgain vector for CLASSNAME.  See Mitchell's "
   "Machine Learning textbook for a description of foilgain."},
  {"print-matrix", 'B', "FORMAT", OPTION_ARG_OPTIONAL,
   "Print the word/document count matrix in an awk- or perl-accessible "
   "format.  Format is specified by the following letters:\n"
   "print all vocab or just words in document:\n"
   "  a=all OR s=sparse\n"
   "print counts as ints or binary:\n"
   "  b=binary OR i=integer\n"
   "print word as:\n  "
   "  n=integer index OR w=string OR e=empty OR c=combination\n"
   "The default is the last in each list"},
  {"print-barrel", 'B', "FORMAT", 
   OPTION_ARG_OPTIONAL | OPTION_ALIAS | OPTION_HIDDEN},
  {"print-word-counts", PRINT_COUNTS_FOR_WORD_KEY, "WORD", 0,
   "Print the number of times WORD occurs in each class."},
  {"print-counts-for-word", PRINT_COUNTS_FOR_WORD_KEY, "WORD", 
   OPTION_ALIAS | OPTION_HIDDEN},
  {"print-doc-names", PRINT_DOC_NAMES_KEY, "TAG", OPTION_ARG_OPTIONAL,
   "Print the filenames of documents contained in the model.  "
   "If the optional TAG argument is given, print only the documents "
   "that have the specified tag, where TAG might be `train', `test', etc."},
  {"build-and-save", BUILD_AND_SAVE, 0, 0,
   "Builds a class model and saves it to disk.  This option is unstable."},
  {"test-from-saved", TEST_FROM_SAVED, 0, 0,
   "Classify using the class model saved to disk.  This option is unstable."},
  {"use-saved-classifier", USE_SAVED_CLASSIFIER_KEY, 0, 0,
   "Don't ever re-train the classifier.  Use whatever class barrel was saved "
   "to disk.  This option designed for use with --query-server"},

  { 0 }
};

/* Settings gathered from the command line.  A single global instance,
   also named rainbow_arg_state, is declared at the end of the struct
   and is written by rainbow_parse_opt(). */
struct rainbow_arg_state
{
  /* Is this invocation of rainbow to do indexing or querying? */
  enum {
    rainbow_indexing, 
    rainbow_querying,
    rainbow_query_serving,
    rainbow_testing,		/* many queries, from train/test split */
    rainbow_file_testing,	/* many queries, from a directory */
    rainbow_infogain_printing,
    rainbow_infogain_pair_printing,
    rainbow_logodds_printing,
    rainbow_weight_vector_printing,
    rainbow_foilgain_printing,
    rainbow_barrel_printing,
    rainbow_word_count_printing,
    rainbow_doc_name_printing,
    rainbow_printing_word_probabilities,
    rainbow_building_and_saving,
    rainbow_testing_from_saved_model,
    rainbow_indexing_lines
  } what_doing;
  /* Where to find query text, or if NULL get query text from stdin */
  const char *query_filename;
  /* Name of file to find in each class directory; output the contents
     of this file instead of the classname. */
  const char *output_filename;
  /* If we are doing test, how many test are we doing? */
  int num_trials;
  /* If we are printing info gain stats, how many of the top words? */
  int infogain_words_to_print;
  /* If we are printing log odds ratio stats, how many of the top words? */
  int logodds_words_to_print;
  /* Used for selecting the class for which the weight-vector will be
     printed. */
  const char *printing_class;
  /* Index into argv of the non-option args at the end (i.e. for -i
     classnames or -x filenames, etc). */
  int non_option_argi;
  int repeat_query;
  bow_int4str *vocab_map;
  bow_int4str *hide_vocab_map;
  int use_lisp_score_truncation;
  int loo_cv;
  const char *server_port_num;
  const char *barrel_printing_format;
  const char *hide_vocab_indices_filename;
  int test_on_training;
  int use_saved_classifier;
  int forking_server;
#if VPC_ONLY
  /* Set if we only want to build a class barrel */
  int vpc_only;
#endif
  int print_doc_length;
  const char *indexing_lines_filename;
} rainbow_arg_state;

/* Option callback: record the option identified by KEY, together with
   its argument ARG where it has one, in the global rainbow_arg_state.
   Most cases set `what_doing' and store ARG (or atoi (ARG)) in the
   matching field.
   NOTE(review): ARG is presumably NULL when an OPTION_ARG_OPTIONAL
   argument is omitted; only the PRINT_LOG_ODDS_RATIO_KEY case checks
   for that -- confirm against the argp documentation.
   The remainder of this function lies beyond the visible text. */
static error_t
rainbow_parse_opt (int key, char *arg, struct argp_state *state)
{
  switch (key)
    {
    case 'q':
      rainbow_arg_state.what_doing = rainbow_querying;
      rainbow_arg_state.query_filename = arg;
      break;
    case FORKING_SERVER_KEY:
      rainbow_arg_state.forking_server = 1;
      /* fallthrough: the forking server shares the plain server setup */
    case SERVER_KEY:
      rainbow_arg_state.what_doing = rainbow_query_serving;
      rainbow_arg_state.server_port_num = arg;
      bow_lexer_document_end_pattern = "\n.\r\n";
      break;
    case 'i':
      rainbow_arg_state.what_doing = rainbow_indexing;
      break;
    case INDEX_MATRIX_KEY:
      rainbow_arg_state.what_doing = rainbow_indexing;
      rainbow_arg_state.barrel_printing_format = arg;
      break;
#if VPC_ONLY
    case VPC_ONLY_KEY:
      rainbow_arg_state.vpc_only = 1;
      break;
#endif
    case INDEX_LINES_KEY:
      rainbow_arg_state.what_doing = rainbow_indexing_lines;
      rainbow_arg_state.indexing_lines_filename = arg;
      break;
    case 'r':
      rainbow_arg_state.repeat_query = 1;
      break;
    case USE_VOCAB_IN_FILE_KEY:
      rainbow_arg_state.vocab_map = bow_int4str_new_from_string_file (arg);
      bow_verbosify (bow_progress,
		     "Using vocab with %d words from file `%s'\n",
		     rainbow_arg_state.vocab_map->str_array_length, arg);
      bow_word2int_do_not_add = 1;
      break;
    case HIDE_VOCAB_IN_FILE_KEY:
      rainbow_arg_state.hide_vocab_map = bow_int4str_new_from_string_file(arg);
      break;
    case HIDE_VOCAB_INDICES_IN_FILE_KEY:
      rainbow_arg_state.hide_vocab_indices_filename = arg;
      break;

    /* Switches for testing */
    case 't':
      rainbow_arg_state.what_doing = rainbow_testing;
      rainbow_arg_state.num_trials = atoi (arg);
      break;
    case TEST_ON_TRAINING_KEY:
      rainbow_arg_state.what_doing = rainbow_testing;
      rainbow_arg_state.test_on_training = 1;
      rainbow_arg_state.num_trials = atoi (arg);
      break;
    case NO_LISP_SCORE_TRUNCATION_KEY:
      rainbow_arg_state.use_lisp_score_truncation = 0;
      break;

      /* Switches for file testing */
    case 'X':
      rainbow_arg_state.loo_cv = 1;
      /* fallthrough: leave-one-out is file testing with loo_cv set */
    case 'x':
      rainbow_arg_state.what_doing = rainbow_file_testing;
      break;

      /* Switches for diagnostics */
    case 'I':
      /* Print out ARG number of vocab words ranked by infogain. */
      rainbow_arg_state.what_doing = rainbow_infogain_printing;
      rainbow_arg_state.infogain_words_to_print = atoi (arg);
      break;
    case PRINT_LOG_ODDS_RATIO_KEY:
      /* Print out ARG number of vocab words ranked by log odds ratio.
         ARG is optional; without it the count is left unchanged. */
      rainbow_arg_state.what_doing = rainbow_logodds_printing;
      if (arg)
	rainbow_arg_state.logodds_words_to_print = atoi (arg);
      break;
    case 'W':
      /* Print the weight-vector for the named class */
      rainbow_arg_state.what_doing = rainbow_weight_vector_printing;
      rainbow_arg_state.printing_class = arg;
      break;
    case 'F':
      /* Print the foil gain for the named class */
      rainbow_arg_state.what_doing = rainbow_foilgain_printing;
      rainbow_arg_state.printing_class = arg;
      break;
    case 'P':
      /* Print the contribution of each word to each class during 
	 scoring. */ 
      bow_print_word_scores = 1;
      break;
    case 'B':
      /* Print the barrel in awk-accessible form to stdout. */
      rainbow_arg_state.what_doing = rainbow_barrel_printing;
      rainbow_arg_state.barrel_printing_format = arg;
      break;
    case PRINT_COUNTS_FOR_WORD_KEY:
      rainbow_arg_state.what_doing = rainbow_word_count_printing;
      rainbow_arg_state.printing_class = arg;
      break;
    case INFOGAIN_PAIR_VECTOR_KEY:
      rainbow_arg_state.what_doing = rainbow_infogain_pair_printing;
      rainbow_arg_state.infogain_words_to_print = atoi (arg);
      break;
    case PRINT_DOC_NAMES_KEY:
      rainbow_arg_state.what_doing = rainbow_doc_name_printing;
      rainbow_arg_state.printing_class = arg;
      break;
    case PRINT_WORD_PROBABILITIES_KEY:
      rainbow_arg_state.what_doing = rainbow_printing_word_probabilities;
      rainbow_arg_state.printing_class = arg;
      break;

#if 0
    case ARGP_KEY_NO_ARGS:
      argp_usage (state);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -