📄 rainbow.c
字号:
#endif case ARGP_KEY_ARG: /* Now we consume all the rest of the arguments. STATE->next is the index in STATE->argv of the next argument to be parsed, which is the first STRING we're interested in, so we can just use `&state->argv[state->next]' as the value for RAINBOW_ARG_STATE->ARGS. IN ADDITION, by setting STATE->next to the end of the arguments, we can force argp to stop parsing here and return. */ rainbow_arg_state.non_option_argi = state->next - 1; if (rainbow_arg_state.what_doing == rainbow_indexing && rainbow_arg_state.barrel_printing_format == NULL && state->next == state->argc) { /* Only one classname is not enough. */ fprintf (stderr, "Need data from more than one class.\n"); argp_usage (state); } state->next = state->argc; break; case ARGP_KEY_END: /* Here we know that STATE->arg_num == 0, since we force argument parsing to end before any more arguments can get here. */ if (rainbow_arg_state.what_doing == rainbow_indexing || rainbow_arg_state.what_doing == rainbow_file_testing) { if (state->arg_num == 0) { /* Too few arguments. */ fprintf (stderr, "No non-option arguments needed.\n"); argp_usage (state); } } else if (state->arg_num != 0) { /* Too many arguments. */ fprintf (stderr, "No non-option arguments needed.\n"); argp_usage (state); } break; case BUILD_AND_SAVE: rainbow_arg_state.what_doing = rainbow_building_and_saving; break; case TEST_FROM_SAVED: rainbow_arg_state.what_doing = rainbow_testing_from_saved_model; break; case USE_SAVED_CLASSIFIER_KEY: rainbow_arg_state.use_saved_classifier = 1; break; case PRINT_DOC_LENGTH_KEY: rainbow_arg_state.print_doc_length = 1; break; default: return ARGP_ERR_UNKNOWN; } return 0;}static struct argp rainbow_argp = { rainbow_options, rainbow_parse_opt, rainbow_argp_args_doc, rainbow_argp_doc, bow_argp_children};/* The structures that hold the data necessary for answering a query. */bow_barrel *rainbow_doc_barrel; /* the stats about words and documents */bow_barrel *rainbow_class_barrel=NULL; /* the stats about words and classes *//* The static structure in bow/int4word.c is also used. *//* Given a fully-specified file path name (all the way from `/'), return just the last filename part of it. */static inline const char *filename_to_classname (const char *filename){#if 0 /* Don't bother stripping off the directory name from the classname. This way, even when we give rainbow multi-directory specifications for where to find the data for each class, and some of the lowest-level directories have the same name, we can still distinguish between them. */ return filename;#else const char *ret; ret = strrchr (filename, '/'); if (ret) return ret + 1; return filename;#endif}/* Writing and reading the word/document stats to disk. */#define VOCABULARY_FILENAME "vocabulary"#define DOC_BARREL_FILENAME "doc-barrel"#define CLASS_BARREL_FILENAME "class-barrel"#define OUTPUTNAME_FILENAME "outfile"#define FORMAT_VERSION_FILENAME "format-version"/* Write the stats in the directory DATA_DIRNAME. */voidrainbow_archive (){ char filename[BOW_MAX_WORD_LENGTH]; char *fnp; FILE *fp; strcpy (filename, bow_data_dirname); strcat (filename, "/"); fnp = filename + strlen (filename); strcpy (fnp, FORMAT_VERSION_FILENAME); bow_write_format_version_to_file (filename); strcpy (fnp, OUTPUTNAME_FILENAME); fp = bow_fopen (filename, "w"); if (rainbow_arg_state.output_filename) fprintf (fp, "%s\n", rainbow_arg_state.output_filename); fclose (fp); strcpy (fnp, VOCABULARY_FILENAME); fp = bow_fopen (filename, "wb"); bow_words_write (fp); fclose (fp); strcpy (fnp, CLASS_BARREL_FILENAME); fp = bow_fopen (filename, "wb"); bow_barrel_write (rainbow_class_barrel, fp); fclose (fp); strcpy (fnp, DOC_BARREL_FILENAME); fp = bow_fopen (filename, "wb"); bow_barrel_write (rainbow_doc_barrel, fp); fclose (fp);}/* Read the stats from the directory DATA_DIRNAME. */voidrainbow_unarchive (){ char filename[BOW_MAX_WORD_LENGTH]; char *fnp; FILE *fp; char buf[1024]; struct stat st; int e; if (rainbow_arg_state.what_doing != rainbow_query_serving) bow_verbosify (bow_progress, "Loading data files...\n"); strcpy (filename, bow_data_dirname); strcat (filename, "/"); fnp = filename + strlen (filename); strcpy (fnp, FORMAT_VERSION_FILENAME); e = stat (filename, &st); if (e != 0) { /* Assume this means the file doesn't exist, and this archive was created before BOW_DEFAULT_FORMAT_VERSION was added to the library. The version number before BOW_DEFAULT_FORMAT_VERSION was added to the library was 3. */ bow_file_format_version = 3; } else { bow_read_format_version_from_file (filename); } strcpy (fnp, OUTPUTNAME_FILENAME); fp = fopen (filename, "r"); if (fp) { buf[0] = '\0'; fscanf (fp, "%s", buf); rainbow_arg_state.output_filename = strdup (buf); fclose (fp); } else { rainbow_arg_state.output_filename = NULL; } strcpy (fnp, VOCABULARY_FILENAME); fp = bow_fopen (filename, "rb"); bow_words_read_from_fp (fp); fclose (fp); strcpy (fnp, CLASS_BARREL_FILENAME); fp = bow_fopen (filename, "rb"); rainbow_class_barrel = bow_barrel_new_from_data_fp (fp); /* Don't close it because bow_wi2dvf_dv will still need to read it. */ strcpy (fnp, DOC_BARREL_FILENAME); fp = bow_fopen (filename, "rb"); rainbow_doc_barrel = bow_barrel_new_from_data_fp (fp); /* Don't close it because bow_wi2dvf_dv will still need to read it. */ /* Only do this if the document barrel exists */ if (rainbow_doc_barrel && rainbow_doc_barrel->classnames == NULL) { int i; bow_cdoc *cdoc; rainbow_doc_barrel->classnames = bow_int4str_new (0); rainbow_class_barrel->classnames = bow_int4str_new (0); for (i = 0; i < rainbow_class_barrel->cdocs->length; i++) { cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, i); bow_str2int (rainbow_doc_barrel->classnames, filename_to_classname (cdoc->filename)); bow_str2int (rainbow_class_barrel->classnames, filename_to_classname (cdoc->filename)); } } /* Don't want doc priors set equal - want class priors set equal */ if (bow_uniform_class_priors) bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_class_barrel); /*bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel);*/}/* Building the word/document stats. *//* Traverse the directories CLASSDIR_NAMES, gathering word/document stats, and write the stats to disk in BOW_DATA_DIRNAME. */voidrainbow_index (int num_classes, const char *classdir_names[], const char *exception_name){ int class_index; void do_indexing () { if (rainbow_doc_barrel) bow_free_barrel (rainbow_doc_barrel); /* Index all the documents. */ rainbow_doc_barrel = bow_barrel_new (0, 0, sizeof (bow_cdoc), NULL);#if VPC_ONLY if (rainbow_arg_state.vpc_only) rainbow_doc_barrel->is_vpc = 1;#endif VPC_ONLY if (bow_argp_method) rainbow_doc_barrel->method = (rainbow_method*)bow_argp_method; else rainbow_doc_barrel->method = rainbow_default_method; for (class_index = 0; class_index < num_classes; class_index++) { bow_verbosify (bow_progress, "Class `%s'\n ", filename_to_classname (classdir_names[class_index]));#if HAVE_HDB if (bow_hdb) { /* Gathers stats on all documents in HDB database */ if (bow_barrel_add_from_hdb (rainbow_doc_barrel, classdir_names[class_index], exception_name, filename_to_classname (classdir_names[class_index])) == 0) bow_verbosify (bow_quiet, "No text files found in database `%s'\n", classdir_names[class_index]); } else#endif /* This function traverses the class directory gathering word/document stats. Return the number of documents indexed. This gathers stats on individual documents; we have yet to "sum together the word vectors of all documents for each particular class". */ if (bow_barrel_add_from_text_dir (rainbow_doc_barrel, classdir_names[class_index], exception_name, filename_to_classname (classdir_names[class_index])) == 0) bow_verbosify (bow_quiet, "No text files found in directory `%s'\n", classdir_names[class_index]); } if (bow_uniform_class_priors) bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel); } /* Do all the parsing to build a barrel with word counts. */ if (bow_prune_vocab_by_occur_count_n) { /* Parse all the documents to get word occurrence counts. */ for (class_index = 0; class_index < num_classes; class_index++) { bow_verbosify (bow_progress, "Class `%s'\n ", filename_to_classname (classdir_names[class_index]));#if HAVE_HDB if (bow_hdb) bow_words_add_occurrences_from_hdb (classdir_names[class_index], ""); else#endif bow_words_add_occurrences_from_text_dir (classdir_names[class_index], ""); } /* xxx This should be (bow_prune_vocab_by_occur_count_n+1) !! */ bow_words_remove_occurrences_less_than (bow_prune_vocab_by_occur_count_n); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; } do_indexing (); if (bow_prune_vocab_by_infogain_n || bow_prune_words_by_doc_count_n) { if (0) { /* Change barrel by removing words with small information gain. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, num_classes); } else { /* Change vocabulary to remove words with small information gain */ bow_wi2dvf_hide_words_by_doc_count (rainbow_doc_barrel->wi2dvf, bow_prune_words_by_doc_count_n); /* The doc count pruning must be before the infogain pruning, because this function below is the one that re-assigns the word-indices. */ bow_words_keep_top_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, num_classes); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; do_indexing (); } }#if VPC_ONLY /* Weights have been calculated - all that is left to do is to calculate * priors */ if (rainbow_arg_state.vpc_only) { bow_cdoc *cdocp; double prior_sum = 0.0; int num_classes = bow_barrel_num_classes (rainbow_doc_barrel); int ci; /* Normalize the class priors */ for (ci=0; ci < num_classes; ci++) { cdocp = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, ci); prior_sum += cdocp->prior; } if (prior_sum) for (ci=0; ci < num_classes; ci++) { cdocp = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, ci); cdocp->prior = cdocp->prior / prior_sum; } else bow_verbosify (bow_progress, "WARNING: All classes have zero prior\n"); /* Recalculate word counts for each class */ { bow_wv *wv = NULL; int wvi; bow_cdoc *cdoc; int di; bow_dv_heap *heap = bow_test_new_heap (rainbow_doc_barrel); while ((di = bow_heap_next_wv (heap, rainbow_doc_barrel, &wv, bow_cdoc_yes)) != -1) { cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, di); cdoc->word_count = 0; for (wvi = 0; wvi < wv->num_entries; wvi++) { if (bow_wi2dvf_dv (rainbow_doc_barrel->wi2dvf, wv->entry[wvi].wi)) cdoc->word_count += wv->entry[wvi].count; } } } /* We have been (secretly) using the doc_barrel as a class barrel * all along. Set doc_barrel to NULL so that an empty file is * written to disk. This will keep future executions from * attempting to recalculate the class_barrel */ rainbow_class_barrel = rainbow_doc_barrel; rainbow_doc_barrel = NULL; return; }#endif /* Combine the documents into class statistics. */ rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel);}voidrainbow_index_printed_barrel (const char *filename){ rainbow_doc_barrel =
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -