📄 rainbow.c
字号:
void do_indexing () { if (rainbow_doc_barrel) bow_barrel_free (rainbow_doc_barrel); /* Index all the documents. */ rainbow_doc_barrel = bow_barrel_new (0, 0, sizeof (bow_cdoc), NULL); if (bow_argp_method) rainbow_doc_barrel->method = bow_argp_method; else rainbow_doc_barrel->method = rainbow_default_method; for (class_index = 0; class_index < num_classes; class_index++) { bow_verbosify (bow_progress, "Class `%s'\n ", filename_to_classname (classdir_names[class_index])); /* This function traverses the directory class directory gathering word/document stats. Return the number of documents indexed. This gathers stats on individual documents; we have yet to "sum together the word vectors of all documents for each particular class". */ if (bow_barrel_add_from_text_dir (rainbow_doc_barrel, classdir_names[class_index], exception_name, class_index) == 0) bow_verbosify (bow_quiet, "No text files found in directory `%s'\n", classdir_names[class_index]); } if (bow_uniform_class_priors) bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel); } /* Do all the parsing to build a barrel with word counts. */ if (bow_prune_vocab_by_occur_count_n) { /* Parse all the documents to get word occurrence counts. */ for (class_index = 0; class_index < num_classes; class_index++) { bow_verbosify (bow_progress, "Class `%s'\n ", filename_to_classname (classdir_names[class_index])); bow_words_add_occurrences_from_text_dir (classdir_names[class_index], ""); } bow_words_remove_occurrences_less_than (bow_prune_vocab_by_occur_count_n); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; } do_indexing (); if (bow_prune_vocab_by_infogain_n) { if (0) { /* Change barrel by removing words with small information gain. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, num_classes); } else { /* Change vocabulary to remove words with small information gain */ bow_words_keep_top_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, num_classes); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; do_indexing (); } } /* Combine the documents into class statistics. */ rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, classdir_names, num_classes);}/* Perform a query. *//* Print the contents of file FILENAME to stdout. */static inline voidprint_file (const char *filename){ FILE *fp; int byte; if ((fp = fopen (filename, "r")) == NULL) bow_error ("Couldn't open file `%s' for reading", filename); while ((byte = fgetc (fp)) != EOF) fputc (byte, stdout); fclose (fp);}/* Get some query text, and print its best-matching documents among those previously indexed. The number of matching documents is NUM_HITS_TO_SHOW. If QUERY_FILENAME is non-null, the query text will be obtained from that file; otherwise it will be prompted for and read from stdin. */intrainbow_query (FILE *in, FILE *out){ /* Show as many hits as there are classes. */ int num_hits_to_show = rainbow_class_barrel->cdocs->length; bow_score *hits; int actual_num_hits; int i; bow_wv *query_wv; hits = alloca (sizeof (bow_score) * num_hits_to_show); /* Get the query text, and create a "word vector" from the query text. */ if (rainbow_arg_state.query_filename) { FILE *fp; fp = bow_fopen (rainbow_arg_state.query_filename, "r"); query_wv = bow_wv_new_from_text_fp (fp); fclose (fp); } else { query_again: if (rainbow_arg_state.what_doing != rainbow_query_serving) bow_verbosify (bow_quiet, "Type your query text now. End with a Control-D.\n"); if (feof (in)) clearerr (in); query_wv = bow_wv_new_from_text_fp (in); } if (query_wv == NULL || query_wv->num_entries == 0) { if (rainbow_arg_state.query_filename) bow_verbosify (bow_quiet, "No query text found in `%s'.\n", rainbow_arg_state.query_filename); else if (rainbow_arg_state.what_doing != rainbow_query_serving) bow_verbosify (bow_quiet, "No query text found."); else { fprintf(out, ".\n"); fflush(out); } if (rainbow_arg_state.repeat_query) bow_verbosify (bow_progress, " Stopping query repeat\n"); return 0; } /* (Re)set the weight-setting method, if requested with a `-m' on the command line. */ if (bow_argp_method) rainbow_doc_barrel->method = bow_argp_method; else rainbow_doc_barrel->method = rainbow_default_method; if (rainbow_arg_state.vocab_map) { /* Remove words not in the VOCAB_MAP. */ bow_barrel_prune_words_not_in_map (rainbow_doc_barrel, rainbow_arg_state.vocab_map); } else if (bow_prune_vocab_by_infogain_n) { /* Change barrel by removing words with small information gain. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, rainbow_class_barrel->cdocs->length); } /* Re-build the rainbow_class_barrel, if necessary */ if (rainbow_doc_barrel->method != rainbow_class_barrel->method || rainbow_arg_state.vocab_map || bow_prune_vocab_by_infogain_n) { int num_classes = rainbow_class_barrel->cdocs->length; bow_barrel_free (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, rainbow_classnames, num_classes); } /* Get the best matching documents. */ bow_wv_set_weights (query_wv, rainbow_doc_barrel); bow_wv_normalize_weights (query_wv, rainbow_doc_barrel); actual_num_hits = bow_barrel_score (rainbow_class_barrel, query_wv, hits, num_hits_to_show, -1); /* Print them. */ if (rainbow_arg_state.what_doing != rainbow_query_serving) fprintf (out, "\n"); for (i = 0; i < actual_num_hits; i++) { bow_cdoc *cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, hits[i].di); if (strlen (rainbow_arg_state.output_filename)) { char buf[1024]; strcpy (buf, cdoc->filename); strcat (buf, "/"); strcat (buf, rainbow_arg_state.output_filename); print_file (buf); } else { /* For the sake CommonLisp, don't print numbers smaller than 1e-35, because it can't `(read)' them. */ if (rainbow_arg_state.use_lisp_score_truncation && hits[i].weight < 1e-35 && hits[i].weight > 0) hits[i].weight = 0; fprintf (out, "%s %g\n", cdoc->filename, hits[i].weight); } } if (rainbow_arg_state.what_doing == rainbow_query_serving) fprintf(out, ".\n"); fflush(out); if (rainbow_arg_state.repeat_query) goto query_again; return actual_num_hits;}static void rainbow_socket_init(const char *socket_name, int use_unix_socket){ int servlen, type, bind_ret; struct sockaddr_un un_addr; struct sockaddr_in in_addr; struct sockaddr *sap; type = use_unix_socket ? AF_UNIX : AF_INET; rainbow_sockfd = socket(type, SOCK_STREAM, 0); assert(rainbow_sockfd >= 0); if (type == AF_UNIX) { sap = (struct sockaddr *)&un_addr; bzero((char *)sap, sizeof(un_addr)); strcpy(un_addr.sun_path, socket_name); servlen = strlen(un_addr.sun_path) + sizeof(un_addr.sun_family) + 1; } else { sap = (struct sockaddr *)&in_addr; bzero((char *)sap, sizeof(in_addr)); in_addr.sin_port = htons(atoi(socket_name)); in_addr.sin_addr.s_addr = htonl(INADDR_ANY); servlen = sizeof(in_addr); } sap->sa_family = type; bind_ret = bind(rainbow_sockfd, sap, servlen); assert(bind_ret >= 0); listen(rainbow_sockfd, 5);}static void rainbow_serve(void){ int newsockfd, clilen; struct sockaddr cli_addr; FILE *in, *out; clilen = sizeof(cli_addr); newsockfd = accept(rainbow_sockfd, &cli_addr, &clilen); assert(newsockfd >= 0); in = fdopen(newsockfd, "r"); out = fdopen(newsockfd, "w"); while (!feof(in)) rainbow_query(in, out); fclose(in); fclose(out); close(newsockfd);}#if RAINBOW_LISP/* Setup rainbow so that we can do our lisp interface. */voidrainbow_lisp_setup (char *datadirname){ /* Defined in deflexer.c */ extern void _bow_default_lexer_init (); /* Defined in naivebayes.c */ extern void _register_method_crossentropy (); extern void _register_method_naivebayes (); /* Defined in tfidf.c */ extern void _register_method_tfidf_words (); extern void _register_method_tfidf_log_words (); extern void _register_method_tfidf_log_occur (); /* Defined in prind.c */ extern void _register_method_prind (); char *dirname = bow_malloc (strlen (datadirname) + 1); int argc; static char *argv[] = { "rainbow-lisp-interface", "-q", "-H", "-h", "-s", "-b", "-m", "kl",/* "--lex-pipe-command", "/afs/cs/project/theo-9/webkb/univ4.rainbow/tag-digits.pl", */ "-d", 0, 0}; for (argc = 0; argv[argc]; argc++); strcpy (dirname, datadirname); argv[argc] = dirname; for (argc = 0; argv[argc]; argc++); /* Since this was dynamically loaded, the __attribute__((constructor)) functions weren't called. Call them now. */ _bow_default_lexer_init (); _register_method_crossentropy (); _register_method_naivebayes (); _register_method_tfidf_words (); _register_method_tfidf_log_words (); _register_method_tfidf_log_occur (); _register_method_prind (); _register_method_kl (); _register_method_evi (); /* Default command-line argument values */ rainbow_arg_state.what_doing = rainbow_indexing; rainbow_arg_state.query_filename = NULL; rainbow_arg_state.output_filename = NULL; rainbow_arg_state.num_trials = 0; rainbow_arg_state.test_percentage = 30; rainbow_arg_state.infogain_words_to_print = 10; rainbow_arg_state.printing_class = 0; rainbow_arg_state.non_option_argi = 0; rainbow_arg_state.repeat_query = 0; rainbow_arg_state.vocab_map = NULL; rainbow_arg_state.use_lisp_score_truncation = 1; rainbow_arg_state.loo_cv = 0; argp_parse (&rainbow_argp, argc, argv, 0, 0, &rainbow_arg_state); rainbow_unarchive (); if (bow_argp_method) rainbow_doc_barrel->method = bow_argp_method; else rainbow_doc_barrel->method = rainbow_default_method; /* if (rainbow_doc_barrel->method != rainbow_class_barrel->method) { */ bow_barrel_free (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, rainbow_classnames); /* } */}/* Classify the text in the file QUERY_FILE, and return the class scores (in sorted order) in SCORES. NUM_SCORES indicates the maximum number of slots for which space is allocated in SCORES. */intrainbow_lisp_query (const char *query_file, bow_score *scores, int num_scores){ /* Show as many hits as there are classes. */ int actual_num_scores; bow_wv *query_wv; /* Get the query text, and create a "word vector" from the query text. */ if (query_file) { FILE *fp; fp = bow_fopen (query_file, "r"); query_wv = bow_wv_new_from_text_fp (fp); fclose (fp); } else { bow_verbosify (bow_quiet, "Type your query text now. End with a Control-D.\n"); query_wv = bow_wv_new_from_text_fp (stdin); } if (query_wv == NULL || query_wv->num_entries == 0) { return 0; } /* Get the best matching documents. */ bow_wv_set_weights (query_wv, rainbow_class_barrel); bow_wv_normalize_weights (query_wv, rainbow_class_barrel); actual_num_scores = bow_barrel_score (rainbow_class_barrel, query_wv, scores, num_scores, -1); bow_wv_free (query_wv); return actual_num_scores;}#endif /* RAINBOW_LISP *//* Run test trials, outputing results to TEST_FP. The results are indended to be read and processed by the Perl script ./rainbow-stats. */voidrainbow_test (FILE *test_fp){ int tn; /* trial number */ int num_test_docs; /* how many doc's will be for testing */ bow_dv_heap *test_heap; /* we'll extract test WV's from here */ bow_wv *query_wv; int di; /* a document index */ bow_score *hits; int num_hits_to_retrieve = rainbow_class_barrel->cdocs->length; int actual_num_hits; int hi; /* hit index */ bow_cdoc *doc_cdoc; bow_cdoc *class_cdoc; hits = alloca (sizeof (bow_score) * num_hits_to_retrieve); /* Calculate the number of testing documents according to TEST_PERCENTAGE. */ num_test_docs = (rainbow_doc_barrel->cdocs->length * rainbow_arg_state.test_percentage) / 100; /* (Re)set the weight-setting method, if requested with `-m' argument. */ if (bow_argp_method) rainbow_doc_barrel->method = bow_argp_method; if (rainbow_arg_state.vocab_map) { bow_barrel_prune_words_not_in_map (rainbow_doc_barrel, rainbow_arg_state.vocab_map); } /* Loop once for each trial. */ for (tn = 0; tn < rainbow_arg_state.num_trials; tn++) { fprintf (test_fp, "#%d\n", tn); /* Randomly set which doc's are for training and which are testing. */ bow_test_split (rainbow_doc_barrel, num_test_docs); if (bow_uniform_class_priors) bow_barrel_set_cdoc_priors_to_class_uniform (rainbow_doc_barrel); if (bow_prune_vocab_by_infogain_n) { /* Change barrel by removing words with small information gain. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, rainbow_class_barrel->cdocs->length); } assert (rainbow_arg_state.test_percentage > 0 && rainbow_arg_state.test_percentage < 100); /* Re-create the vector-per-class barrel in accordance with the new train/test settings. */ { int num_classes = rainbow_class_barrel->cdocs->length; bow_barrel_free (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel, rainbow_classnames, num_classes); } /* Create the heap from which we'll get WV's. */ test_heap = bow_test_new_heap (rainbow_doc_barrel); /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */ query_wv = NULL; /* Loop once for each test document. */ while ((di = bow_test_next_wv (test_heap, rainbow_doc_barrel, &query_wv)) != -1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -