📄 crossbow.c
字号:
else fprintf (out, "%s", leaves[wa->entry[0].wi]->name); fprintf (out, "\n"); } bow_wa_free (wa); return (ret);}voidcrossbow_classify_tagged_docs (int tag, int verbose, FILE *out){ int di; int doc_count = 0; int correct_count = 0; crossbow_doc *doc; for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); if (tag != -1 && doc->tag != tag) continue; doc_count++; if ((((crossbow_method*)bow_argp_method)->classify_doc) (doc, verbose, out)) correct_count++; } if (!verbose) { fprintf (out, "Fraction correct %f (%d/%d)\n", ((double)correct_count) / doc_count, correct_count, doc_count);#if 0 fprintf (out, "Average Inverse Rank %f\n", inverse_rank_sum / doc_count); fprintf (out, "Average Score Difference %f\n", score_diff_sum / doc_count);#endif }}voidcrossbow_classify_docs_in_dirname (const char *dirname, int verbose){ int classify_filename (const char *filename, void *context) { crossbow_doc doc; bow_wv *wv; FILE *fp; fp = bow_fopen (filename, "r"); if (!bow_fp_is_text (fp)) return 0; wv = bow_wv_new_from_text_fp (fp, filename); fclose (fp); if (!wv) return 0; doc.tag = bow_doc_test; doc.ci = -1; doc.filename = filename; doc.word_count = bow_wv_word_count (wv); doc.wv_seek_pos = -1; doc.di = -1; doc.wv = wv; doc.cis_size = -1; doc.cis = NULL; ((((crossbow_method*)bow_argp_method)->classify_doc) (&doc, verbose, stdout)); return 0; } bow_map_filenames_from_dir (classify_filename, NULL, dirname, "");}voidcrossbow_cluster (){ bow_verbosify (bow_progress, "Starting clustering\n"); assert (((crossbow_method*)bow_argp_method)->cluster); ((crossbow_method*)bow_argp_method)->cluster ();}voidcrossbow_classify (){ bow_verbosify (bow_progress, "Starting classification\n"); /* Train the vertical mixture model with EM. */ if (!crossbow_hem_restricted_horizontal) crossbow_hem_deterministic_horizontal = 1; ((crossbow_method*)bow_argp_method)->train_classifier (); /* Classify the test documents and output results */ if (crossbow_arg_state.classify_files_dirname) crossbow_classify_docs_in_dirname (crossbow_arg_state.classify_files_dirname, 1); else crossbow_classify_tagged_docs (bow_doc_test, 2, stdout);}/* Code for query serving */static int crossbow_sockfd;voidcrossbow_socket_init (const char *socket_name, int use_unix_socket){ int servlen, type, bind_ret; struct sockaddr_in in_addr; struct sockaddr *sap; type = use_unix_socket ? AF_UNIX : AF_INET; crossbow_sockfd = socket(type, SOCK_STREAM, 0); assert(crossbow_sockfd >= 0); if (type == AF_UNIX) {#ifdef WINNT servlen = 0; /* so that the compiler is happy */ sap = 0; assert(WINNT == 0);#else /* !WINNT */ struct sockaddr_un un_addr; sap = (struct sockaddr *)&un_addr; bzero((char *)sap, sizeof(un_addr)); strcpy(un_addr.sun_path, socket_name); servlen = strlen(un_addr.sun_path) + sizeof(un_addr.sun_family) + 1;#endif /* WINNT */ } else { sap = (struct sockaddr *)&in_addr; bzero((char *)sap, sizeof(in_addr)); in_addr.sin_port = htons(atoi(socket_name)); in_addr.sin_addr.s_addr = htonl(INADDR_ANY); servlen = sizeof(in_addr); } sap->sa_family = type; bind_ret = bind (crossbow_sockfd, sap, servlen); assert(bind_ret >= 0); listen (crossbow_sockfd, 5);}/* Read a single document from the socket, classify it and return classification */voidcrossbow_serve (){ int newsockfd, clilen; struct sockaddr cli_addr; FILE *in, *out; int ci; clilen = sizeof(cli_addr); newsockfd = accept(crossbow_sockfd, &cli_addr, &clilen); bow_verbosify (bow_progress, "Accepted connection\n"); assert (newsockfd >= 0); in = fdopen(newsockfd, "r"); out = fdopen(newsockfd, "w"); while (!feof(in)) { bow_wv *wv = bow_wv_new_from_text_fp (in, NULL); bow_wa *wa; if (!wv) { fprintf (out, ".\n"); fflush (out); break; } wa = crossbow_classify_doc_new_wa (wv); bow_wa_sort (wa); for (ci = 0; ci < wa->length; ci++) fprintf (out, "%s %g\n", bow_int2str (crossbow_classnames, wa->entry[ci].wi), wa->entry[ci].weight); fprintf (out, ".\n"); fflush (out); bow_wa_free (wa); bow_wv_free (wv); } fclose(in); fclose(out); close(newsockfd); bow_verbosify (bow_progress, "Closed connection\n");}voidcrossbow_query_serving (){ bow_verbosify (bow_progress, "Starting query server\n"); /* Don't add any new words from the queries to the vocabulary */ bow_word2int_do_not_add = 1; /* Train the vertical mixture model with EM. */ if (!crossbow_hem_restricted_horizontal) crossbow_hem_deterministic_horizontal = 1; ((crossbow_method*)bow_argp_method)->train_classifier (); bow_verbosify (bow_progress, "Ready to serve!\n"); crossbow_socket_init (crossbow_arg_state.server_port_num, 0); while (1) { bow_verbosify (bow_progress, "Waiting for connection\n"); crossbow_serve(); }}voidcrossbow_print_word_probabilities (){ bow_error ("Not implemented");}voidcrossbow_print_doc_names (){ int di; int tag = -1; crossbow_doc *doc; if (crossbow_arg_state.printing_tag) { tag = bow_str2type (crossbow_arg_state.printing_tag); if (tag == -1) bow_error ("Argument to --print-doc-names, `%s', is not a tag\n" "Try `train', `test', `unlabeled', etc"); } for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); if (crossbow_arg_state.printing_tag == NULL || (tag >= 0 && doc->tag == tag)) printf ("%s\n", doc->filename); }}voidcrossbow_print_matrix (){ int di, wvi; crossbow_doc *doc; bow_wv *wv; for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); printf ("%s ", doc->filename); wv = crossbow_wv_at_di (di); for (wvi = 0; wvi < wv->num_entries; wvi++) printf ("%s %d ", bow_int2word (wv->entry[wvi].wi), wv->entry[wvi].count); printf ("\n"); }}/* Definitions for using argp command-line processing */const char *argp_program_version ="crossbow " STRINGIFY(CROSSBOW_MAJOR_VERSION) "." STRINGIFY(CROSSBOW_MINOR_VERSION);const char *argp_program_bug_address = "<mccallum@cs.cmu.edu>";static char crossbow_argp_doc[] ="Crossbow -- a document clustering front-end to libbow";static char crossbow_argp_args_doc[] = "[ARG...]";enum { PRINT_IDF_KEY = 13000, QUERY_SERVER_KEY, QUERY_FORK_SERVER_KEY, CLUSTER_OUTPUT_DIR_KEY, BUILD_HIER_FROM_DIR_KEY, CLASSIFY_KEY, CLASSIFY_FILES_KEY, PRINT_WORD_PROBABILITIES_KEY, PRINT_DOC_NAMES_KEY, INDEX_MULTICLASS_LIST_KEY, PRINT_MATRIX_KEY, USE_VOCAB_IN_FILE_KEY,};static struct argp_option crossbow_options[] ={ {0, 0, 0, 0, "For building data structures from text files:", 1}, {"index", 'i', 0, 0, "tokenize training documents found under ARG..., build weight vectors, " "and save them to disk"}, {"index-multiclass-list", INDEX_MULTICLASS_LIST_KEY, "FILE", 0, "Index the files listed in FILE. Each line of FILE should contain " "a filenames followed by a list of classnames to which that file belongs."}, {"cluster", 'c', 0, 0, "cluster the documents, and write the results to disk"}, {"cluster-output-dir", CLUSTER_OUTPUT_DIR_KEY, "DIR", 0, "After clustering is finished, write the cluster to directory DIR"}, {"build-hier-from-dir", BUILD_HIER_FROM_DIR_KEY, 0, 0, "When indexing a single directory, use the directory structure to build " "a class hierarchy"}, {"classify", CLASSIFY_KEY, 0, 0, "Split the data into train/test, and classify the test data, outputing " "results in rainbow format"}, {"classify-files", CLASSIFY_FILES_KEY, "DIRNAME", 0, "Classify documents in DIRNAME, outputing `filename classname' pairs " "on each line."}, {"query-server", QUERY_SERVER_KEY, "PORTNUM", 0, "Run crossbow in server mode, listening on socket number PORTNUM. " "You can try it by executing this command, then in a different shell " "window on the same machine typing `telnet localhost PORTNUM'."}, {"print-word-probabilities", PRINT_WORD_PROBABILITIES_KEY, "FILEPREFIX", 0, "Print the word probability distribution in each leaf to files named " "FILEPREFIX-classname"}, {"print-doc-names", PRINT_DOC_NAMES_KEY, "TAG", OPTION_ARG_OPTIONAL, "Print the filenames of documents contained in the model. " "If the optional TAG argument is given, print only the documents " "that have the specified tag."}, {"print-matrix", PRINT_MATRIX_KEY, 0, 0, "Print the word/document count matrix in an awk- or perl-accessible " "format. Format is sparse and includes the words and the counts."}, {"use-vocab-in-file", USE_VOCAB_IN_FILE_KEY, "FILENAME", 0, "Limit vocabulary to just those words read as space-separated strings " "from FILE."}, { 0 }};static error_tcrossbow_parse_opt (int key, char *arg, struct argp_state *state){ switch (key) { case 'i': crossbow_arg_state.what_doing = crossbow_index; break; case INDEX_MULTICLASS_LIST_KEY: crossbow_arg_state.what_doing = crossbow_index_multiclass_list; crossbow_arg_state.multiclass_list_filename = arg; break; case 'c': crossbow_arg_state.what_doing = crossbow_cluster; break; case CLUSTER_OUTPUT_DIR_KEY: crossbow_arg_state.cluster_output_dir = arg; break; case BUILD_HIER_FROM_DIR_KEY: crossbow_arg_state.build_hier_from_dir = 1; break; case CLASSIFY_FILES_KEY: crossbow_arg_state.classify_files_dirname = arg; case CLASSIFY_KEY: crossbow_arg_state.what_doing = crossbow_classify; break; case QUERY_SERVER_KEY: crossbow_arg_state.what_doing = crossbow_query_serving; crossbow_arg_state.server_port_num = arg; bow_lexer_document_end_pattern = "\n.\r\n"; break; case PRINT_WORD_PROBABILITIES_KEY: crossbow_arg_state.what_doing = crossbow_print_word_probabilities; break; case PRINT_DOC_NAMES_KEY: crossbow_arg_state.what_doing = crossbow_print_doc_names; crossbow_arg_state.printing_tag = arg; break; case PRINT_MATRIX_KEY: crossbow_arg_state.what_doing = crossbow_print_matrix; break; case USE_VOCAB_IN_FILE_KEY: crossbow_arg_state.vocab_map = bow_int4str_new_from_text_file (arg); bow_verbosify (bow_progress, "Using vocab with %d words from file `%s'\n", crossbow_arg_state.vocab_map->str_array_length, arg); break; case ARGP_KEY_ARG: /* Now we consume all the rest of the arguments. STATE->next is the index in STATE->argv of the next argument to be parsed, which is the first STRING we're interested in, so we can just use `&state->argv[state->next]' as the value for RAINBOW_ARG_STATE->ARGS. IN ADDITION, by setting STATE->next to the end of the arguments, we can force argp to stop parsing here and return. */ crossbow_arg_state.non_option_argi = state->next - 1; if (crossbow_arg_state.what_doing == crossbow_index && state->next > state->argc) { /* Zero directory names is not enough. */ fprintf (stderr, "Need at least one directory to index.\n"); argp_usage (state); } state->next = state->argc; break; default: return ARGP_ERR_UNKNOWN; } return 0;}static struct argp crossbow_argp = { crossbow_options, crossbow_parse_opt, crossbow_argp_args_doc, crossbow_argp_doc, bow_argp_children};/* This method structure is defined in hem.c, and is the default bow_argp_method */extern crossbow_method hem_cluster_method;intmain (int argc, char *argv[]){ /* Default command-line argument values */ crossbow_arg_state.what_doing = crossbow_cluster; crossbow_arg_state.cluster_output_dir = NULL; crossbow_arg_state.build_hier_from_dir = 0; crossbow_arg_state.print_file_prefix = NULL; crossbow_arg_state.printing_tag = NULL; crossbow_arg_state.classify_files_dirname = NULL; crossbow_arg_state.vocab_map = NULL; bow_argp_method = (bow_method*)&hem_cluster_method; /* bow_lexer_toss_words_longer_than = 20; */ /* Parse the command-line arguments. */ argp_parse (&crossbow_argp, argc, argv, 0, 0, &crossbow_arg_state); crossbow_argv = argv; crossbow_argc = argc; if (*crossbow_arg_state.what_doing != crossbow_index && *crossbow_arg_state.what_doing != crossbow_index_multiclass_list) { crossbow_unarchive (bow_data_dirname); /* Do test/train splits. */ bow_set_doc_types (crossbow_docs, crossbow_classes_count, crossbow_classnames); } (*crossbow_arg_state.what_doing) (); if (crossbow_arg_state.cluster_output_dir && *crossbow_arg_state.what_doing != crossbow_index) crossbow_archive (bow_data_dirname); exit (0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -