📄 rainbow.c
字号:
bow_barrel_new_from_printed_barrel_file (filename, rainbow_arg_state.barrel_printing_format); /* Combine the documents into class statistics. */ rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel);}/* Index each line of ARCHER_ARG_STATE.DIRNAME as if it were a separate file, named after the line number. Does not deal with labels.*/voidrainbow_index_lines (const char *filename){ static const int max_line_length = 40000; char *buf; int n, classindex, nchars; FILE *fp; bow_cdoc cdoc; int di; char docname[BOW_MAX_WORD_LENGTH]; char classname[BOW_MAX_WORD_LENGTH]; buf = alloca (max_line_length); rainbow_doc_barrel = bow_barrel_new (0, 0, sizeof (bow_cdoc), NULL); fp = bow_fopen (filename, "r"); bow_verbosify (bow_progress, "Indexing lines: "); di = 0; while (fgets (buf, max_line_length, fp)) { if (buf[0] == '%') continue; n = sscanf (buf, "%s %d %n", docname, &classindex, &nchars); assert (n >= 2); if (classindex < 0) classindex = 0; sprintf (classname, "class%d", classindex); if (!(rainbow_doc_barrel->classnames)) rainbow_doc_barrel->classnames = bow_int4str_new (0); classindex = bow_str2int (rainbow_doc_barrel->classnames, classname); cdoc.type = bow_doc_train; cdoc.class = classindex; /* Set to one so bow_infogain_per_wi_new() works correctly by default. */ cdoc.prior = 1.0f; assert (cdoc.class >= 0); cdoc.filename = strdup (docname); assert (cdoc.filename); cdoc.class_probs = NULL; /* Add the CDOC to CDOCS, and determine the "index" of this document. */ di = bow_array_append (rainbow_doc_barrel->cdocs, &cdoc); if (strlen (buf+nchars)) bow_wi2dvf_add_di_text_str (&(rainbow_doc_barrel->wi2dvf), di, buf+nchars, docname); di++; if (di % 100 == 0) bow_verbosify(bow_progress, "\b\b\b\b\b\b%6d", di); } fclose (fp); bow_verbosify (bow_progress, "\n"); /* Combine the documents into class statistics. */ rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel);}/* Perform a query. *//* Print the contents of file FILENAME to stdout. */static inline voidprint_file (const char *filename){ FILE *fp; int byte; if ((fp = fopen (filename, "r")) == NULL) bow_error ("Couldn't open file `%s' for reading", filename); while ((byte = fgetc (fp)) != EOF) fputc (byte, stdout); fclose (fp);}int iBrokenPipe = 0; /* drapp-2/10 */jmp_buf env; /* drapp-2/10 *//* Get some query text, and print its best-matching documents among those previously indexed. The number of matching documents is NUM_HITS_TO_SHOW. If QUERY_FILENAME is non-null, the query text will be obtained from that file; otherwise it will be prompted for and read from stdin. */intrainbow_query (FILE *in, FILE *out){ /* Show as many hits as there are classes. */ int num_hits_to_show; bow_score *hits; int actual_num_hits; int i; bow_wv *query_wv = NULL; num_hits_to_show = bow_barrel_num_classes (rainbow_class_barrel); hits = alloca (sizeof (bow_score) * num_hits_to_show); /* Commented out for WhizBang --query-server */#if 0 /* (Re)set the weight-setting method, if requested with a `-m' on the command line. */ /* If we don't have the document barrel, we can't do this... */ if (rainbow_doc_barrel) { if (bow_argp_method) rainbow_doc_barrel->method = (rainbow_method*)bow_argp_method; else rainbow_doc_barrel->method = rainbow_default_method; } if (bow_prune_vocab_by_infogain_n && rainbow_doc_barrel) { /* Change barrel by removing words with small information gain. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, rainbow_doc_barrel, bow_barrel_num_classes (rainbow_class_barrel)); } /* Infogain pruning must be done before this vocab_map pruning, because infogain pruning first unhides all words! */ if (rainbow_arg_state.vocab_map) { if (rainbow_doc_barrel) /* Remove words not in the VOCAB_MAP. */ bow_barrel_prune_words_not_in_map (rainbow_doc_barrel, rainbow_arg_state.vocab_map); if (rainbow_class_barrel) /* Remove words not in the VOCAB_MAP. */ bow_barrel_prune_words_not_in_map (rainbow_class_barrel, rainbow_arg_state.vocab_map); } if (rainbow_arg_state.hide_vocab_map && rainbow_doc_barrel) { bow_barrel_prune_words_in_map (rainbow_doc_barrel, rainbow_arg_state.hide_vocab_map); } /* Re-build the rainbow_class_barrel, if necessary */ /* Make sure that we have the document barrel */ if (rainbow_doc_barrel && (rainbow_doc_barrel->method != rainbow_class_barrel->method || rainbow_arg_state.vocab_map || rainbow_arg_state.hide_vocab_map || bow_prune_vocab_by_infogain_n)) { bow_free_barrel (rainbow_class_barrel); rainbow_class_barrel = bow_barrel_new_vpc_with_weights (rainbow_doc_barrel); }#endif /* Get the query text, and create a "word vector" from the query text. */ query_again: if (rainbow_arg_state.query_filename) { FILE *fp; fp = bow_fopen (rainbow_arg_state.query_filename, "r"); query_wv = bow_wv_new_from_text_fp (fp, rainbow_arg_state.query_filename); fclose (fp); } else { if (rainbow_arg_state.what_doing != rainbow_query_serving) bow_verbosify (bow_quiet, "Type your query text now. End with a Control-D.\n"); if (feof (in)) clearerr (in); query_wv = bow_wv_new_from_text_fp (in, NULL); } if (query_wv == NULL || query_wv->num_entries == 0) { if (rainbow_arg_state.query_filename) bow_verbosify (bow_quiet, "No query text found in `%s'.\n", rainbow_arg_state.query_filename); else if (rainbow_arg_state.what_doing != rainbow_query_serving) bow_verbosify (bow_quiet, "No query text found."); else { fprintf(out, ".\n"); if ( sigsetjmp(env, 0) == 0 ) /* drapp-2/10 */ { fflush(out); } else { iBrokenPipe = 1; /* drapp-2/10 */ } } if (rainbow_arg_state.repeat_query) bow_verbosify (bow_progress, " Stopping query repeat\n"); return 0; } /* Remove words not in the class_barrel */ bow_wv_prune_words_not_in_wi2dvf (query_wv, rainbow_class_barrel->wi2dvf);#if 0 /* Print the WV, just for debugging */ bow_wv_fprintf (stderr, query_wv); fflush (stderr);#endif /* Get the best matching documents. */ /* When using vpc-only, we should use a method that specifies weight * and normalization functions which do not use the doc barrel */#if 0 if (rainbow_doc_barrel) { bow_wv_set_weights (query_wv, rainbow_doc_barrel); bow_wv_normalize_weights (query_wv, rainbow_doc_barrel); } else#endif { bow_wv_set_weights (query_wv, rainbow_class_barrel); bow_wv_normalize_weights (query_wv, rainbow_class_barrel); } actual_num_hits = bow_barrel_score (rainbow_class_barrel, query_wv, hits, num_hits_to_show, -1); bow_free (query_wv); /* Print them. */ if (rainbow_arg_state.what_doing != rainbow_query_serving) fprintf (out, "\n"); for (i = 0; i < actual_num_hits; i++) { bow_cdoc *cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, hits[i].di); if (strlen (rainbow_arg_state.output_filename)) { char buf[1024]; strcpy (buf, cdoc->filename); strcat (buf, "/"); strcat (buf, rainbow_arg_state.output_filename); print_file (buf); } else { /* For the sake CommonLisp, don't print numbers smaller than 1e-35, because it can't `(read)' them. */ if (rainbow_arg_state.use_lisp_score_truncation && hits[i].weight < 1e-35 && hits[i].weight > 0) hits[i].weight = 0; fprintf (out, "%s %.*g\n", /* cdoc->filename,*/ /* When knn runs, CDOCS entries correspond to documents * rather than classes. We want to print class names. */ bow_int2str (rainbow_class_barrel->classnames, hits[i].di), bow_score_print_precision, hits[i].weight); } } if (rainbow_arg_state.what_doing == rainbow_query_serving) fprintf(out, ".\n"); if ( sigsetjmp(env, 0) == 0 ) /* drapp-2/10 */ { fflush(out); } else { iBrokenPipe = 1; /* drapp-2/10 */ } if (rainbow_arg_state.repeat_query) goto query_again; return actual_num_hits;}void SigPipeHandler( int iParm ) /* drapp-2/10 */{ bow_verbosify (bow_progress, "Broken Pipe.\n"); siglongjmp( env, 1 );}voidrainbow_socket_init (const char *socket_name, int use_unix_socket){ int servlen, type, bind_ret; struct sockaddr_in in_addr; struct sockaddr *sap; type = use_unix_socket ? AF_UNIX : AF_INET; rainbow_sockfd = socket(type, SOCK_STREAM, 0); assert(rainbow_sockfd >= 0); if (type == AF_UNIX) {#ifdef WINNT servlen = 0; /* so that the compiler is happy */ sap = 0; assert(WINNT == 0);#else /* !WINNT */ struct sockaddr_un un_addr; sap = (struct sockaddr *)&un_addr; bzero((char *)sap, sizeof(un_addr)); strcpy(un_addr.sun_path, socket_name); servlen = strlen(un_addr.sun_path) + sizeof(un_addr.sun_family) + 1;#endif /* WINNT */ } else { sap = (struct sockaddr *)&in_addr; bzero((char *)sap, sizeof(in_addr)); in_addr.sin_port = htons(atoi(socket_name)); in_addr.sin_addr.s_addr = htonl(INADDR_ANY); servlen = sizeof(in_addr); } sap->sa_family = type; bind_ret = bind(rainbow_sockfd, sap, servlen); assert(bind_ret >= 0); listen(rainbow_sockfd, 5);}voidrainbow_serve (){ int newsockfd, clilen; struct sockaddr cli_addr; FILE *in, *out; pid_t pid; iBrokenPipe = 0; /* drapp-2/10 */ clilen = sizeof(cli_addr); bow_verbosify (bow_progress, "Waiting for connection...\n"); newsockfd = accept(rainbow_sockfd, &cli_addr, &clilen); assert(newsockfd >= 0); in = fdopen(newsockfd, "r"); out = fdopen(newsockfd, "w"); if (rainbow_arg_state.forking_server) { if ((pid = fork ()) != 0) { /* Parent - return to server mode */ fclose (in); fclose (out); close (newsockfd); return; } } bow_verbosify (bow_progress, "Got connection.\n"); while (!feof(in) && !iBrokenPipe) /* drapp-2/10 */ rainbow_query(in, out); fclose(in); fclose(out); close(newsockfd); bow_verbosify (bow_progress, "Closed connection.\n"); /* Kill the child - don't want it hanging around, sucking up memory */ if (rainbow_arg_state.forking_server) exit (0);}#if RAINBOW_LISP/* Setup rainbow so that we can do our lisp interface. */voidrainbow_lisp_setup (char *datadirname){ /* Defined in deflexer.c */ extern void _bow_default_lexer_init (); /* Defined in naivebayes.c */ extern void _register_method_crossentropy (); extern void _register_method_naivebayes (); /* Defined in tfidf.c */ extern void _register_method_tfidf_words (); extern void _register_method_tfidf_log_words (); extern void _register_method_tfidf_log_occur (); /* Defined in prind.c */ extern void _register_method_prind (); extern void _register_method_svm (); char *dirname = bow_malloc (strlen (datadirname) + 1); int argc; static char *argv[] = { "rainbow-lisp-interface", "-q", "-H", "-h", "-s",
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -