📄 rainbow.c
字号:
"-b", "-m", "kl",/* "--lex-pipe-command", "/afs/cs/project/theo-9/webkb/univ4.rainbow/tag-digits.pl", */ "-d", 0, 0}; for (argc = 0; argv[argc]; argc++); strcpy (dirname, datadirname); argv[argc] = dirname; for (argc = 0; argv[argc]; argc++); /* Since this was dynamically loaded, the __attribute__((constructor)) functions weren't called. Call them now. */ _bow_default_lexer_init (); _register_method_crossentropy (); _register_method_naivebayes (); _register_method_tfidf_words (); _register_method_tfidf_log_words (); _register_method_tfidf_log_occur (); _register_method_prind (); _register_method_kl (); _register_method_evi (); _register_method_svm (); /* Default command-line argument values */ rainbow_arg_state.what_doing = rainbow_indexing; rainbow_arg_state.query_filename = NULL; rainbow_arg_state.output_filename = NULL; rainbow_arg_state.num_trials = 0; rainbow_arg_state.infogain_words_to_print = 10; rainbow_arg_state.logodds_words_to_print = 10; rainbow_arg_state.printing_class = 0; rainbow_arg_state.non_option_argi = 0; rainbow_arg_state.repeat_query = 0; rainbow_arg_state.vocab_map = NULL; rainbow_arg_state.hide_vocab_map = NULL; rainbow_arg_state.use_lisp_score_truncation = 1; rainbow_arg_state.loo_cv = 0; rainbow_arg_state.indexing_lines_filename = NULL; argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state); rainbow_unarchive (); if (bow_argp_method) rainbow_doc_barrel->method = bow_argp_method; else rainbow_doc_barrel->method = rainbow_default_method; /* if (rainbow_doc_barrel->method != rainbow_class_barrel->method) { */ bow_free_barrel (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel); /* } */}/* Classify the text in the file QUERY_FILE, and return the class scores (in sorted order) in SCORES. NUM_SCORES indicates the maximum number of slots for which space is allocated in SCORES. 
*/intrainbow_lisp_query (const char *query_file, bow_score *scores, int num_scores){ /* Show as many hits as there are classes. */ int actual_num_scores; bow_wv *query_wv; /* Get the query text, and create a "word vector" from the query text. */ if (query_file) { FILE *fp; fp = bow_fopen (query_file, "r"); query_wv = bow_wv_new_from_text_fp (fp, NULL); fclose (fp); } else { bow_verbosify (bow_quiet, "Type your query text now. End with a Control-D.\n"); query_wv = bow_wv_new_from_text_fp (stdin, NULL); } if (query_wv == NULL || query_wv->num_entries == 0) { return 0; } /* Get the best matching documents. */ bow_wv_set_weights (query_wv, rainbow_class_barrel); bow_wv_normalize_weights (query_wv, rainbow_class_barrel); actual_num_scores = bow_barrel_score (rainbow_class_barrel, query_wv, scores, num_scores, -1); bow_wv_free (query_wv); return actual_num_scores;}#endif /* RAINBOW_LISP */extern FILE *svml_test_file;/* Run test trials, outputing results to TEST_FP. The results are indended to be read and processed by the Perl script ./rainbow-stats. */voidrainbow_test (FILE *test_fp){ int tn; /* trial number */ bow_dv_heap *test_heap; /* we'll extract test WV's from here */ bow_wv *query_wv; int di; /* a document index */ bow_score *hits = NULL; int num_hits_to_retrieve=0; int actual_num_hits; int hi; /* hit index */ bow_cdoc *doc_cdoc; bow_cdoc *class_cdoc; int (*classify_cdoc_p)(bow_cdoc*); /* (Re)set the weight-setting method, if requested with `-m' argument. */ if (bow_argp_method) rainbow_doc_barrel->method = (rainbow_method*)bow_argp_method; hits = NULL; /* Loop once for each trial. */ for (tn = 0; tn < rainbow_arg_state.num_trials; tn++) { bow_set_doc_types_for_barrel (rainbow_doc_barrel); if (bow_uniform_class_priors) bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel); if (bow_prune_vocab_by_infogain_n) { /* Change barrel by removing words with small information gain. 
*/
          bow_barrel_keep_top_words_by_infogain
            (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel,
             bow_barrel_num_classes (rainbow_class_barrel));
        }
      if (bow_prune_vocab_by_occur_count_n)
        bow_error ("Sorry, `-O' implemented only for --index, not --test");
      if (bow_prune_words_by_doc_count_n)
        bow_error ("Sorry, `-D' implemented only for --index, not --test");
      /* Infogain pruning must be done before this vocab_map pruning,
         because infogain pruning first unhides all words! */
      if (rainbow_arg_state.vocab_map)
        {
          bow_barrel_prune_words_not_in_map (rainbow_doc_barrel,
                                             rainbow_arg_state.vocab_map);
        }
      if (rainbow_arg_state.hide_vocab_map)
        {
          bow_barrel_prune_words_in_map (rainbow_doc_barrel,
                                         rainbow_arg_state.hide_vocab_map);
        }
      /* Hide every word whose integer index appears in the named file,
         read as whitespace-separated decimal indices. */
      if (rainbow_arg_state.hide_vocab_indices_filename)
        {
          FILE *fp =
            bow_fopen (rainbow_arg_state.hide_vocab_indices_filename, "r");
          int wi;
          int num_hidden = 0;
          while (fscanf (fp, "%d", &wi) == 1)
            {
              bow_wi2dvf_hide_wi (rainbow_doc_barrel->wi2dvf, wi);
              num_hidden++;
            }
          fclose (fp);
          bow_verbosify (bow_progress, "%d words hidden by index\n",
                         num_hidden);
        }
      /* Re-create the vector-per-class barrel in accordance with the
         new train/test settings. */
      bow_free_barrel (rainbow_class_barrel);
      rainbow_class_barrel =
        bow_barrel_new_vpc_with_weights (rainbow_doc_barrel);
      if (rainbow_class_barrel->method->vpc_set_priors)
        (*rainbow_class_barrel->method->vpc_set_priors)
          (rainbow_class_barrel, rainbow_doc_barrel);
      /* do this late for --em-multi-hump-neg */
      /* Allocate one score slot per class; done on the first trial only
         and reused across trials (stack allocation via alloca). */
      if (!hits)
        {
          num_hits_to_retrieve = bow_barrel_num_classes (rainbow_class_barrel);
          assert (num_hits_to_retrieve);
          hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);
        }
      /* Trial header line, consumed by rainbow-stats. */
      fprintf (test_fp, "#%d\n", tn);
      /* Create the heap from which we'll get WV's.
NOTE: We are also including "hidden" words here---words that were
         previously "removed" by, for example, feature selection.*/
      test_heap = bow_make_dv_heap_from_wi2dvf_hidden
        (rainbow_doc_barrel->wi2dvf, 0);

      /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */
      query_wv = NULL;

      /* Determine if we are classifying the testing documents or the
         training documents. */
      if (rainbow_arg_state.test_on_training)
        {
          classify_cdoc_p = bow_cdoc_is_train;
          /* Testing on the training set only makes sense for one trial. */
          assert (rainbow_arg_state.num_trials == 1);
        }
      else
        {
          classify_cdoc_p = bow_cdoc_is_test;
        }

      /* Loop once for each test document.  NOTE: This will skip documents
         that don't have any words that are in the vocabulary. */
      while ((di = bow_heap_next_wv (test_heap, rainbow_doc_barrel,
                                     &query_wv, classify_cdoc_p)) != -1)
        {
          doc_cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs,
                                               di);
          class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs,
                                                 doc_cdoc->class);
          /* Remove words not in the class_barrel */
          bow_wv_prune_words_not_in_wi2dvf (query_wv,
                                            rainbow_class_barrel->wi2dvf);
          bow_wv_set_weights (query_wv, rainbow_class_barrel);
          bow_wv_normalize_weights (query_wv, rainbow_class_barrel);
          if (!strcmp(rainbow_class_barrel->method->name, "em"))
            {
              /* NOTE(review): class_probs (a pointer) is cast to int as
                 the final bow_barrel_score argument; presumably the EM
                 method decodes it — confirm, this is not 64-bit safe. */
              actual_num_hits =
                bow_barrel_score (rainbow_class_barrel,
                                  query_wv, hits,
                                  num_hits_to_retrieve,
                                  (rainbow_arg_state.test_on_training
                                   ? (int) doc_cdoc->class_probs
                                   : (int) NULL));
            }
          else
            {
              /* For SVM-Light output, emit the +/-1 two-class target
                 label before scoring. */
              if (svml_test_file)
                {
                  fprintf (svml_test_file,"%d ",-1*((doc_cdoc->class*2)-1));
                }
              actual_num_hits =
                bow_barrel_score (rainbow_class_barrel,
                                  query_wv, hits,
                                  num_hits_to_retrieve,
                                  (rainbow_arg_state.test_on_training ?
doc_cdoc->class : -1)); } //assert (actual_num_hits == num_hits_to_retrieve);#if 0 printf ("%8.6f %d %8.6f %8.6f %d ", class_cdoc->normalizer, class_cdoc->word_count, class_cdoc->normalizer / class_cdoc->word_count, class_cdoc->prior, doc_cdoc->class); if (hits[0].di == doc_cdoc->class) printf ("1\n"); else printf ("0\n");#endif fprintf (test_fp, "%s %s ", doc_cdoc->filename, bow_barrel_classname_at_index (rainbow_doc_barrel, doc_cdoc->class)); for (hi = 0; hi < actual_num_hits; hi++) { class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, hits[hi].di); /* For the sake CommonLisp, don't print numbers smaller than 1e-35, because it can't `(read)' them. */ if (rainbow_arg_state.use_lisp_score_truncation && hits[hi].weight < 1e-35 && hits[hi].weight > 0) hits[hi].weight = 0; fprintf (test_fp, "%s:%.*g ", bow_barrel_classname_at_index (rainbow_class_barrel, hits[hi].di), bow_score_print_precision, hits[hi].weight); } if (rainbow_arg_state.print_doc_length) fprintf (test_fp, "%d", bow_wv_word_count (query_wv)); fprintf (test_fp, "\n"); } /* Don't free the heap here because bow_test_next_wv() does it for us. */ }}/* Run test trials, outputing results to TEST_FP. The results are indended to be read and processed by the Perl script ./rainbow-stats. The test documents come from files inside the directories that are named in argv[]. */voidrainbow_test_files (FILE *out_fp, const char *test_dirname){ bow_score *hits; /* int num_test_docs; */ int num_hits_to_retrieve = bow_barrel_num_classes (rainbow_class_barrel); int actual_num_hits; int hi; /* hit index */ const char *current_class; int current_ci; int ci; unsigned int dirlen = 1024; char dir[dirlen]; /* Deals with the word vector once it has been taken from the file or HDB database. Called by test_file and test_hdb_file. 
(see below) */
  int process_wv (const char *filename, bow_wv *query_wv, void *context)
    {
      bow_cdoc *class_cdoc;

      /* An empty or non-text document produces no result line. */
      if (!query_wv)
        {
          bow_verbosify (bow_progress, "%s found to be empty.\n", filename);
          return 0;
        }

      fprintf (out_fp, "%s %s ",
               filename,        /* This test instance */
               current_class);  /* The name of the correct class */

      /* Remove words not in the class_barrel */
      bow_wv_prune_words_not_in_wi2dvf (query_wv,
                                        rainbow_class_barrel->wi2dvf);
      bow_wv_set_weights (query_wv, rainbow_class_barrel);
      bow_wv_normalize_weights (query_wv, rainbow_class_barrel);
      /* Score against every class; with --loo-cv, pass the true class
         index so the document's own counts can be subtracted. */
      actual_num_hits =
        bow_barrel_score (rainbow_class_barrel,
                          query_wv, hits, num_hits_to_retrieve,
                          (rainbow_arg_state.loo_cv ? current_ci : -1));
      for (hi = 0; hi < actual_num_hits; hi++)
        {
          class_cdoc =
            bow_array_entry_at_index (rainbow_class_barrel->cdocs,
                                      hits[hi].di);
          /* For the sake of CommonLisp, don't print numbers smaller
             than 1e-35, because it can't `(read)' them. */
          if (rainbow_arg_state.use_lisp_score_truncation
              && hits[hi].weight < 1e-35
              && hits[hi].weight > 0)
            hits[hi].weight = 0;
          fprintf (out_fp, "%s:%.*g ",
                   filename_to_classname (class_cdoc->filename),
                   bow_score_print_precision,
                   hits[hi].weight);
        }
      fprintf (out_fp, "\n");
      return 0;
    }

  /* This nested function is called once for each test document. */
  int test_file (const char *filename, void *context)
    {
      bow_wv *query_wv = NULL;
      FILE *fp;
      fp = fopen (filename, "r");
      if (!fp)
        {
          bow_verbosify (bow_progress, "test_file: Couldn't open file %s\n",
                         filename);
          return 0;
        }
      /* Must test to see if text here because this was done when the
         barrel was built in barrel.c:bow_barrel_add_from_text_dir().
         Otherwise we may read a document that was not included in the
         original barrel, and get negative word occurrence counts since
         we subtract to do leave-one-out processing here.
*/
      if (bow_fp_is_text (fp))
        query_wv = bow_wv_new_from_text_fp (fp, filename);
      fclose (fp);
      return process_wv (filename, query_wv, context);
    }

#if HAVE_HDB
  /* This is used for the case that we are dealing with HDB files.  At
     this point, the fulltext of the file has already been retrieved
     and is passed in as DATA. */
  int test_hdb_file (const char *filename, char *data, void *context)
    {
      bow_wv *query_wv = NULL;
      bow_lex lex;

      /* Lex directly from the in-memory document text instead of a
         FILE stream. */
      lex.document = data;
      lex.document_length = strlen (data);
      lex.document_position = 0;
      if (bow_str_is_text (data))
        query_wv = bow_wv_new_from_lex (&lex);
      return process_wv (filename, query_wv, context);
    }
#endif

  /* One score slot per class; stack allocation lives for the whole call. */
  hits = alloca (sizeof (bow_score) * num_hits_to_retrieve);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -