📄 rainbow.c
字号:
fprintf (out_fp, "#0\n"); for (ci = 0; ci < bow_barrel_num_classes (rainbow_doc_barrel); ci++) { /* Build a string containing the name of this directory. */ bow_cdoc *class_cdoc; strcpy (dir, test_dirname); class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); strcat (dir, "/"); strcat (dir, filename_to_classname (class_cdoc->filename)); assert (strlen (dir) < dirlen); /* Remember which classname this comes from, so, above, we know the correct class */ current_ci = class_cdoc->class; current_class = bow_barrel_classname_at_index (rainbow_doc_barrel, ci); /* Test each document in that diretory. */#if HAVE_HDB if (bow_hdb) bow_map_filenames_from_hdb (test_hdb_file, 0, dir, ""); else#endif bow_map_filenames_from_dir (test_file, 0, dir, ""); }}voidbow_print_log_odds_ratio (FILE *fp, bow_barrel *barrel, int num_to_print){ int ci; bow_cdoc *cdoc; int wi; /* a "word index" into WI2DVF */ int max_wi; /* the highest "word index" in WI2DVF. */ bow_dv *dv; /* the "document vector" at index WI */ int dvi; /* an index into the DV */ int weight_setting_num_words = 0; int total_num_words = 0; struct lorth { int wi; float lor; } lors[barrel->cdocs->length][num_to_print]; int wci; float *total_word_counts; /* bow_error("Can't use this while normalizer is being used for non-integral word_count"); */ /* We assume that we have already called BOW_BARREL_NEW_VPC() on BARREL, so BARREL already has one-document-per-class. */ /* This might be useful to have. However, some VPC barrels do not have this variable set, so we probably shouldn't enforce this - jrennie */ /* assert (barrel->is_vpc); */ max_wi = MIN (barrel->wi2dvf->size, bow_num_words()); total_word_counts = bow_malloc (sizeof (float) * max_wi); for (ci = 0; ci < barrel->cdocs->length; ci++) for (wci = 0; wci < num_to_print; wci++) { lors[ci][wci].lor = 0.0; lors[ci][wci].wi = -1; } /* assume that word_count, normalizer are already set */ /* Calculate the total number of occurrences of each word; store this int TOTAL_WORD_COUNTS. */ for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (barrel->wi2dvf, wi); if (dv == NULL) continue; total_word_counts[wi] = 0; for (dvi = 0; dvi < dv->length; dvi++) { /* Is cdoc used for anything? - jrennie */ cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di); if (cdoc->type == bow_doc_train) { total_num_words += dv->entry[dvi].weight; total_word_counts[wi] += dv->entry[dvi].weight; } } } /* Set the weights in the BARREL's WI2DVF so that they are equal to P(w|C), the probability of a word given a class. */ for (wi = 0; wi < max_wi; wi++) { double pr_w = 0.0; dv = bow_wi2dvf_dv (barrel->wi2dvf, wi); if (wi % 100 == 0) bow_verbosify(bow_progress, "\b\b\b\b\b\b%6d", wi); /* If the model doesn't know about this word, skip it. */ if (dv == NULL) continue; pr_w = total_word_counts[wi] / total_num_words; /* Now loop through all the elements, setting their weights */ for (dvi = 0; dvi < dv->length; dvi++) { double pr_w_c; double pr_w_not_c; double log_likelihood_ratio; cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di); /* Here CDOC->WORD_COUNT is the total number of words in the class */ /* We use Laplace Estimation. */ pr_w_c = ((double)dv->entry[dvi].weight / (cdoc->word_count + cdoc->normalizer)); pr_w_c = (((double)dv->entry[dvi].weight + 1) / (cdoc->word_count + barrel->wi2dvf->num_words)); pr_w_not_c = ((total_word_counts[wi] - dv->entry[dvi].weight + barrel->cdocs->length - 1) / (total_num_words - cdoc->word_count + (barrel->wi2dvf->num_words * (barrel->cdocs->length - 1)))); log_likelihood_ratio = log (pr_w_c / pr_w_not_c); wci = num_to_print - 1; while (wci >= 0 && (lors[dv->entry[dvi].di][wci].lor < pr_w_c * log_likelihood_ratio)) wci--; if (wci < num_to_print - 1) { int new_wci = wci + 1; for (wci = num_to_print-1; wci > new_wci; wci--) { lors[dv->entry[dvi].di][wci].lor = lors[dv->entry[dvi].di][wci - 1].lor; lors[dv->entry[dvi].di][wci].wi = lors[dv->entry[dvi].di][wci - 1].wi; } lors[dv->entry[dvi].di][new_wci].lor = pr_w_c * log_likelihood_ratio; lors[dv->entry[dvi].di][new_wci].wi = wi; } } weight_setting_num_words++; } bow_verbosify (bow_progress, "\n"); fprintf (fp, "Log Odds Ratio - top %d words\n\n", num_to_print); for (ci = 0; ci < barrel->cdocs->length; ci++) { bow_cdoc *cdoc = bow_array_entry_at_index(barrel->cdocs, ci); int i; fprintf (fp, "%s\n", filename_to_classname(cdoc->filename)); for (i=0; i < strlen (filename_to_classname(cdoc->filename)); i++) fprintf (fp, "-"); fprintf (fp, "\n"); for (wci = 0; wci < num_to_print; wci++) fprintf (fp, "%1.15f %s\n", lors[ci][wci].lor, lors[ci][wci].wi >= 0 ? bow_int2word (lors[ci][wci].wi) : "<nothing>"); /* Print feedline and newpage */ fprintf (fp, "%c\n",12); }}voidrainbow_print_weight_vector (const char *classname){ int ci; /* The `class index' of CLASSNAME */ bow_cdoc *cdoc; int wi, max_wi; /* a word index */ bow_dv *dv; /* a class vector */ int dvi; /* an index into DV */ /* Find the `class index' of the class with name CLASSNAME */ for (ci = 0; ci < bow_barrel_num_classes (rainbow_class_barrel); ci++) { cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); if (!strcmp (filename_to_classname (cdoc->filename), classname)) break; } if (ci == bow_barrel_num_classes (rainbow_class_barrel)) bow_error ("No class named `%s'\n", classname); /* Get the CDOC for this class, so we can use its NORMALIZER. */ cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); /* Print the `weight' for each word in the class */ max_wi = MIN (bow_num_words (), rainbow_class_barrel->wi2dvf->size); for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (rainbow_class_barrel->wi2dvf, wi); if (dv == NULL) continue; /* Find the DVI with the DI matching CI */ for (dvi = 0; dvi < dv->length && dv->entry[dvi].di < ci; dvi++); if (!(dv && dvi < dv->length && dv->entry[dvi].di == ci)) continue; /* This is an attempt for a test to see if the weights need to be "normalized" before being used. */ if (rainbow_class_barrel->method->normalize_weights) printf ("%20.10f %s\n", dv->entry[dvi].weight * cdoc->normalizer, bow_int2word (wi)); else printf ("%20.10f %s\n", dv->entry[dvi].weight, bow_int2word (wi)); }}voidrainbow_print_foilgain (const char *classname){ int ci; /* The `class index' of CLASSNAME */ int wi; bow_cdoc *cdoc; float **fig_per_wi_ci; int fig_num_wi; /* Find the `class index' of the class with name CLASSNAME */ for (ci = 0; ci < bow_barrel_num_classes (rainbow_class_barrel); ci++) { cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); if (!strcmp (filename_to_classname (cdoc->filename), classname)) break; } if (ci == bow_barrel_num_classes (rainbow_class_barrel)) bow_error ("No class named `%s'\n", classname); /* Get the foilgains. */ fig_per_wi_ci = bow_foilgain_per_wi_ci_new (rainbow_doc_barrel, bow_barrel_num_classes (rainbow_class_barrel), &fig_num_wi); /* Print the `foilgain' for each word in the class */ for (wi = 0; wi < fig_num_wi; wi++) { printf ("%20.6f %s\n", fig_per_wi_ci[wi][ci], bow_int2word (wi)); } bow_foilgain_free (fig_per_wi_ci, fig_num_wi);}/* The main() function. */extern int _bow_nextprime (unsigned n);#if !RAINBOW_LISPintmain (int argc, char *argv[]){ /* Default command-line argument values */ rainbow_arg_state.what_doing = rainbow_indexing; rainbow_arg_state.query_filename = NULL; rainbow_arg_state.output_filename = NULL; rainbow_arg_state.num_trials = 0; rainbow_arg_state.infogain_words_to_print = 10; rainbow_arg_state.logodds_words_to_print = 10; rainbow_arg_state.printing_class = 0; rainbow_arg_state.non_option_argi = 0; rainbow_arg_state.repeat_query = 0; rainbow_arg_state.vocab_map = NULL; rainbow_arg_state.hide_vocab_map = NULL; rainbow_arg_state.use_lisp_score_truncation = 1; rainbow_arg_state.loo_cv = 0; rainbow_arg_state.barrel_printing_format = NULL; rainbow_arg_state.hide_vocab_indices_filename = NULL; rainbow_arg_state.test_on_training = 0; rainbow_arg_state.use_saved_classifier = 0; rainbow_arg_state.forking_server = 0; rainbow_arg_state.print_doc_length = 0; rainbow_arg_state.indexing_lines_filename = NULL;#ifdef VPC_ONLY rainbow_arg_state.vpc_only = 0;#endif /* Parse the command-line arguments. */ argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state); if (rainbow_arg_state.what_doing == rainbow_indexing) { /* Strip any trailing `/'s from the classnames, so we can find the classname later using FILENAME_TO_CLASSNAME. */ int argi, len; const char **rainbow_classnames; /* if we've fixed the vocab from a file, then use it */ if (rainbow_arg_state.vocab_map) bow_words_set_map(rainbow_arg_state.vocab_map, 1); if (rainbow_arg_state.barrel_printing_format) { rainbow_index_printed_barrel (argv[rainbow_arg_state.non_option_argi]); } else { for (argi = rainbow_arg_state.non_option_argi; argi < argc; argi++) { len = strlen (argv[argi]); if (argv[argi][len-1] == '/') argv[argi][len-1] = '\0'; } rainbow_classnames = (const char **)(argv + rainbow_arg_state.non_option_argi); /* Index text in the directories. */ rainbow_index (argc - rainbow_arg_state.non_option_argi, rainbow_classnames, rainbow_arg_state.output_filename); } if (bow_num_words ()) rainbow_archive (); else bow_error ("No text documents found."); exit (0); } if (rainbow_arg_state.what_doing == rainbow_indexing_lines) { rainbow_index_lines (rainbow_arg_state.indexing_lines_filename); if (bow_num_words ()) rainbow_archive (); else bow_error ("No text documents found."); exit (0); } /* We are using an already built model. Get it from disk. */ rainbow_unarchive (); if (rainbow_arg_state.hide_vocab_indices_filename) { FILE *fp = bow_fopen (rainbow_arg_state.hide_vocab_indices_filename, "r"); int wi; while (fscanf (fp, "%d", &wi) == 1) bow_wi2dvf_hide_wi (rainbow_doc_barrel->wi2dvf, wi); fclose (fp); } /* (Re)set the weight-setting method, if requested with a `-m' on the command line. */ if (bow_argp_method) rainbow_doc_barrel->method = (rainbow_method*)bow_argp_method; /* Make the test/train split */ /* Don't touch anything if we don't have the document barrel */ if (rainbow_doc_barrel && rainbow_arg_state.what_doing != rainbow_testing) bow_set_doc_types_for_barrel (rainbow_doc_barrel); /* Do things that update their own class/word weights. */#if 0 /* Compute the number of word pairs that co-occur in documents more than 0 times. Did this for Jeff Schneider. */ if (1) { static const int max_vocab_size = 10000; int vocab_sizes[] = {max_vocab_size, max_vocab_size}; bow_bitvec *co_occurrences = bow_bitvec_new (2, vocab_sizes); int wi_pair[2]; int wvi1, wvi2; bow_dv_heap *heap; bow_wv *doc_wv; int di; int num_co_occurrences; /* Make vocabulary size manageable. */ bow_barrel_keep_top_words_by_infogain (max_vocab_size-1, rainbow_doc_barrel, bow_barrel_num_classes (rainbow_class_barrel)); /* Step through each document, setting bit for each word-pair co-occurrence. */ heap = bow_test_new_heap (rainbow_doc_barrel); doc_wv = NULL; while ((di = bow_model_next_wv (heap, rainbow_doc_barrel, &doc_wv)) != -1) { for (wvi1 = 0; wvi1 < doc_wv->num_entries; wvi1++) { for (wvi2 = 0; wvi2 < doc_wv->num_entries; wvi2++) { wi_pair[0] = doc_wv->entry[wvi1].wi; wi_pair[1] = doc_wv->entry[wvi2].wi; bow_bitvec_set (co_occurrences, wi_pair, 1); } } } /* Don't free the heap here because bow_model_next_wv() does it for us. */ /* Count the number of co-occurrences. */ num_co_occurrences = 0; for (wvi1 = 0; wvi1 < max_vocab_size; wvi1+
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -