📄 rainbow.c
字号:
{ doc_cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, di); class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, doc_cdoc->class); bow_wv_set_weights (query_wv, rainbow_class_barrel); bow_wv_normalize_weights (query_wv, rainbow_class_barrel); actual_num_hits = bow_barrel_score (rainbow_class_barrel, query_wv, hits, num_hits_to_retrieve, -1); assert (actual_num_hits == num_hits_to_retrieve);#if 0 printf ("%8.6f %d %8.6f %8.6f %d ", class_cdoc->normalizer, class_cdoc->word_count, class_cdoc->normalizer / class_cdoc->word_count, class_cdoc->prior, doc_cdoc->class); if (hits[0].di == doc_cdoc->class) printf ("1\n"); else printf ("0\n");#endif fprintf (test_fp, "%s %s ", doc_cdoc->filename, filename_to_classname(class_cdoc->filename)); for (hi = 0; hi < actual_num_hits; hi++) { class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, hits[hi].di); /* For the sake CommonLisp, don't print numbers smaller than 1e-35, because it can't `(read)' them. */ if (rainbow_arg_state.use_lisp_score_truncation && hits[hi].weight < 1e-35 && hits[hi].weight > 0) hits[hi].weight = 0; fprintf (test_fp, "%s:%g ", filename_to_classname (class_cdoc->filename), hits[hi].weight); } fprintf (test_fp, "\n"); } }}/* Run test trials, outputing results to TEST_FP. The results are indended to be read and processed by the Perl script ./rainbow-stats. The test documents come from files inside the directories that are named in argv[]. */voidrainbow_test_files (FILE *out_fp, const char *test_dirname){ bow_score *hits; /* int num_test_docs; */ int num_hits_to_retrieve = rainbow_class_barrel->cdocs->length; int actual_num_hits; int hi; /* hit index */ const char *current_class; int current_ci; int ci; unsigned int dirlen = 1024; char dir[dirlen]; /* This nested function is called once for each test document. */ int test_file (const char *filename, void *context) { bow_wv *query_wv; FILE *fp; bow_cdoc *class_cdoc; fp = bow_fopen (filename, "r"); query_wv = bow_wv_new_from_text_fp (fp); fclose (fp); if (!query_wv) { bow_verbosify (bow_progress, "%s found to be empty.\n", filename); return 0; } fprintf (out_fp, "%s %s ", filename, /* This test instance */ current_class); /* The name of the correct class */ bow_wv_set_weights (query_wv, rainbow_class_barrel); bow_wv_normalize_weights (query_wv, rainbow_class_barrel); actual_num_hits = bow_barrel_score (rainbow_class_barrel, query_wv, hits, num_hits_to_retrieve, (rainbow_arg_state.loo_cv ? current_ci : -1)); for (hi = 0; hi < actual_num_hits; hi++) { class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, hits[hi].di); /* For the sake CommonLisp, don't print numbers smaller than 1e-35, because it can't `(read)' them. */ if (rainbow_arg_state.use_lisp_score_truncation && hits[hi].weight < 1e-35 && hits[hi].weight > 0) hits[hi].weight = 0; fprintf (out_fp, "%s:%g ", filename_to_classname (class_cdoc->filename), hits[hi].weight); } fprintf (out_fp, "\n"); return 0; } hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);#if 0 /* Calculate the number of testing documents according to TEST_PERCENTAGE. The default TEST_PERCENTAGE is 0, use all training documents. Otherwise, we will use less training documents. Note that the documents marked for testing here will not actually be used for testing. We will test the documents in TEST_DIRNAME. */ num_test_docs = (rainbow_doc_barrel->cdocs->length * rainbow_arg_state.test_percentage) / 100; bow_test_split (rainbow_doc_barrel, num_test_docs); if (bow_prune_vocab_by_infogain_n) { /* Change barrel by removing words with small info gain, if requested. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, rainbow_class_barrel->cdocs->length); } /* Re-build the rainbow_class_barrel, if necessary */ if (rainbow_doc_barrel->method != rainbow_class_barrel->method || rainbow_arg_state.vocab_map || bow_prune_vocab_by_infogain_n || rainbow_arg_state.test_percentage) { int num_classes = rainbow_class_barrel->cdocs->length; bow_barrel_free (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, rainbow_classnames, num_classes); }#endif fprintf (out_fp, "#0\n"); for (ci = 0; ci < rainbow_class_barrel->cdocs->length; ci++) { /* Build a string containing the name of this directory. */ bow_cdoc *class_cdoc; strcpy (dir, test_dirname); class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); strcat (dir, "/"); strcat (dir, filename_to_classname (class_cdoc->filename)); assert (strlen (dir) < dirlen); /* Remember which classname this comes from, so, above, we know the correct class */ current_ci = class_cdoc->class; current_class = filename_to_classname (class_cdoc->filename); /* Test each document in that diretory. */ bow_map_filenames_from_dir (test_file, 0, dir, ""); }}voidrainbow_print_weight_vector (const char *classname){ int ci; /* The `class index' of CLASSNAME */ bow_cdoc *cdoc; int wi, max_wi; /* a word index */ bow_dv *dv; /* a class vector */ int dvi; /* an index into DV */ /* Find the `class index' of the class with name CLASSNAME */ for (ci = 0; ci < rainbow_class_barrel->cdocs->length; ci++) { cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); if (!strcmp (filename_to_classname (cdoc->filename), classname)) break; } if (ci == rainbow_class_barrel->cdocs->length) bow_error ("No class named `%s'\n", classname); /* Get the CDOC for this class, so we can use its NORMALIZER. */ cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); /* Print the `weight' for each word in the class */ max_wi = MIN (bow_num_words (), rainbow_class_barrel->wi2dvf->size); for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (rainbow_class_barrel->wi2dvf, wi); if (dv == NULL) continue; /* Find the DVI with the DI matching CI */ for (dvi = 0; dvi < dv->length && dv->entry[dvi].di < ci; dvi++); if (!(dv && dvi < dv->length && dv->entry[dvi].di == ci)) continue; /* This is an attempt for a test to see if the weights need to be "normalized" before being used. */ if (rainbow_class_barrel->method->normalize_weights) printf ("%20.10f %s\n", dv->entry[dvi].weight * cdoc->normalizer, bow_int2word (wi)); else printf ("%20.10f %s\n", dv->entry[dvi].weight, bow_int2word (wi)); }}voidrainbow_print_foilgain (const char *classname){ int ci; /* The `class index' of CLASSNAME */ int wi; bow_cdoc *cdoc; float **fig_per_wi_ci; int fig_num_wi; /* Find the `class index' of the class with name CLASSNAME */ for (ci = 0; ci < rainbow_class_barrel->cdocs->length; ci++) { cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, ci); if (!strcmp (filename_to_classname (cdoc->filename), classname)) break; } if (ci == rainbow_class_barrel->cdocs->length) bow_error ("No class named `%s'\n", classname); /* Get the foilgains. */ fig_per_wi_ci = bow_foilgain_per_wi_ci_new (rainbow_doc_barrel, rainbow_class_barrel->cdocs->length, &fig_num_wi); /* Print the `foilgain' for each word in the class */ for (wi = 0; wi < fig_num_wi; wi++) { printf ("%20.6f %s\n", fig_per_wi_ci[wi][ci], bow_int2word (wi)); } bow_foilgain_free (fig_per_wi_ci, fig_num_wi);}/* The main() function. */#if !RAINBOW_LISPintmain (int argc, char *argv[]){ /* Default command-line argument values */ rainbow_arg_state.what_doing = rainbow_indexing; rainbow_arg_state.query_filename = NULL; rainbow_arg_state.output_filename = NULL; rainbow_arg_state.num_trials = 0; rainbow_arg_state.test_percentage = 30; rainbow_arg_state.infogain_words_to_print = 10; rainbow_arg_state.printing_class = 0; rainbow_arg_state.non_option_argi = 0; rainbow_arg_state.repeat_query = 0; rainbow_arg_state.vocab_map = NULL; rainbow_arg_state.use_lisp_score_truncation = 1; rainbow_arg_state.loo_cv = 0; _register_method_kl (); _register_method_evi (); /* Parse the command-line arguments. */ argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state); if (rainbow_arg_state.what_doing == rainbow_indexing) { /* Strip any trailing `/'s from the classnames, so we can find the classname later using FILENAME_TO_CLASSNAME. */ int argi, len; for (argi = rainbow_arg_state.non_option_argi; argi < argc; argi++) { len = strlen (argv[argi]); if (argv[argi][len-1] == '/') argv[argi][len-1] = '\0'; } /* Initialize the global variable RAINBOW_CLASSNAMES */ rainbow_classnames = (const char **)(argv + rainbow_arg_state.non_option_argi); /* Index text in the directories. */ rainbow_index (argc - rainbow_arg_state.non_option_argi, rainbow_classnames, rainbow_arg_state.output_filename); if (bow_num_words ()) rainbow_archive (); else bow_error ("No text documents found."); exit (0); } /* We are using an already built model. Get it from disk. */ rainbow_unarchive (); /* (Re)set the weight-setting method, if requested with a `-m' on the command line. */ if (bow_argp_method) rainbow_doc_barrel->method = bow_argp_method; /* Do things that update their own class/word weights. */#if 0 /* Compute the number of word pairs that co-occur in documents more than 0 times. Did this for Jeff Schneider. */ if (1) { static const int max_vocab_size = 10000; int vocab_sizes[] = {max_vocab_size, max_vocab_size}; bow_bitvec *co_occurrences = bow_bitvec_new (2, vocab_sizes); int wi_pair[2]; int wvi1, wvi2; bow_dv_heap *heap; bow_wv *doc_wv; int di; int num_co_occurrences; /* Make vocabulary size manageable. */ bow_barrel_keep_top_words_by_infogain (max_vocab_size-1, rainbow_doc_barrel, rainbow_class_barrel->cdocs->length); /* Step through each document, setting bit for each word-pair co-occurrence. */ heap = bow_test_new_heap (rainbow_doc_barrel); doc_wv = NULL; while ((di = bow_model_next_wv (heap, rainbow_doc_barrel, &doc_wv)) != -1) { for (wvi1 = 0; wvi1 < doc_wv->num_entries; wvi1++) { for (wvi2 = 0; wvi2 < doc_wv->num_entries; wvi2++) { wi_pair[0] = doc_wv->entry[wvi1].wi; wi_pair[1] = doc_wv->entry[wvi2].wi; bow_bitvec_set (co_occurrences, wi_pair, 1); } } } /* Count the number of co-occurrences. */ num_co_occurrences = 0; for (wvi1 = 0; wvi1 < max_vocab_size; wvi1++) { for (wvi2 = 0; wvi2 < max_vocab_size; wvi2++) { wi_pair[0] = wvi1; wi_pair[1] = wvi2; if (bow_bitvec_value (co_occurrences, wi_pair)) num_co_occurrences++; } } printf ("Num co-occurrences = %d\n", num_co_occurrences); exit (0); }#endif if (rainbow_arg_state.what_doing == rainbow_query_serving) { rainbow_socket_init (rainbow_arg_state.server_port_num, 0); while (1) { rainbow_serve(); } } /* Do things that don't require the class/word weights to be updated. */ if (rainbow_arg_state.what_doing == rainbow_testing) { /* We are doing test trials, and making output for Perl. */ rainbow_test (stdout); exit (0); } if (rainbow_arg_state.what_doing == rainbow_infogain_printing) { bow_infogain_per_wi_print (stdout, rainbow_doc_barrel, rainbow_class_barrel->cdocs->length, rainbow_arg_state.infogain_words_to_print); exit (0); } if (rainbow_arg_state.what_doing == rainbow_foilgain_printing) { rainbow_print_foilgain (rainbow_arg_state.printing_class); exit (0); } if (rainbow_arg_state.what_doing == rainbow_barrel_printing) { bow_barrel_printf (rainbow_doc_barrel, stdout, ""); exit (0); } if (rainbow_arg_state.what_doing == rainbow_infogain_pair_printing) { int s; bow_infogain_per_wi_new_using_pairs (rainbow_doc_barrel, rainbow_class_barrel->cdocs->length, &s); exit (0); } if (rainbow_arg_state.what_doing == rainbow_word_count_printing) { bow_barrel_print_word_count (rainbow_class_barrel, rainbow_arg_state.printing_class); exit (0); } /* Do things necessary to update the class/word weights for the command-line options. */ /* Reduce vocabulary size by removing words not in a file listed on the command line. */ if (rainbow_arg_state.vocab_map) { bow_barrel_prune_words_not_in_map (rainbow_doc_barrel, rainbow_arg_state.vocab_map); } /* Reduce vocabulary size by low info-gain words, if requested. */ if (bow_prune_vocab_by_infogain_n) { /* Change barrel by removing words with small info gain. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, rainbow_class_barrel->cdocs->length); } /* Re-build the rainbow_class_barrel, if necessary */ if (rainbow_doc_barrel->method != rainbow_class_barrel->method || rainbow_arg_state.vocab_map || bow_prune_vocab_by_infogain_n || 1) { int num_classes = rainbow_class_barrel->cdocs->length; bow_barrel_free (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, rainbow_classnames, num_classes); } /* Do things that require the class/word weights to have been updated. */ if (rainbow_arg_state.what_doing == rainbow_file_testing) { int argi; assert (rainbow_arg_state.non_option_argi < argc); for (argi = rainbow_arg_state.non_option_argi; argi < argc; argi++) rainbow_test_files (stdout, argv[argi]); exit (0); } if (rainbow_arg_state.what_doing == rainbow_weight_vector_printing) { rainbow_print_weight_vector (rainbow_arg_state.printing_class); exit (0); } if (rainbow_arg_state.what_doing == rainbow_querying) { rainbow_query (stdin, stdout); exit (0); } exit (0);}#endif /* !RAINBOW_LISP */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -