📄 crossbow.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
      else	fprintf (out, "%s", leaves[wa->entry[0].wi]->name);      fprintf (out, "\n");    }  bow_wa_free (wa);  return (ret);}voidcrossbow_classify_tagged_docs (int tag, int verbose, FILE *out){  int di;  int doc_count = 0;  int correct_count = 0;  crossbow_doc *doc;    for (di = 0; di < crossbow_docs->length; di++)    {      doc = bow_array_entry_at_index (crossbow_docs, di);      if (tag != -1 && doc->tag != tag)	continue;      doc_count++;      if ((((crossbow_method*)bow_argp_method)->classify_doc)	  (doc, verbose, out))	correct_count++;    }  if (!verbose)    {      fprintf (out, "Fraction correct %f (%d/%d)\n",	       ((double)correct_count) / doc_count,	       correct_count, doc_count);#if 0      fprintf (out, "Average Inverse Rank %f\n",	       inverse_rank_sum / doc_count);      fprintf (out, "Average Score Difference %f\n",	       score_diff_sum / doc_count);#endif    }}voidcrossbow_classify_docs_in_dirname (const char *dirname, int verbose){  int classify_filename (const char *filename, void *context)    {      crossbow_doc doc;      bow_wv *wv;      FILE *fp;      fp = bow_fopen (filename, "r");      if (!bow_fp_is_text (fp))	return 0;      wv = bow_wv_new_from_text_fp (fp, filename);      fclose (fp);      if (!wv) 	return 0;      doc.tag = bow_doc_test;      doc.ci = -1;      doc.filename = filename;      doc.word_count = bow_wv_word_count (wv);      doc.wv_seek_pos = -1;      doc.di = -1;      doc.wv = wv;      doc.cis_size = -1;      doc.cis = NULL;      ((((crossbow_method*)bow_argp_method)->classify_doc)       (&doc, verbose, stdout));      return 0;    }  bow_map_filenames_from_dir (classify_filename, NULL, dirname, "");}voidcrossbow_cluster (){  bow_verbosify (bow_progress, "Starting clustering\n");  assert (((crossbow_method*)bow_argp_method)->cluster);  ((crossbow_method*)bow_argp_method)->cluster ();}voidcrossbow_classify (){  bow_verbosify (bow_progress, "Starting classification\n");  /* Train the vertical mixture model with EM. */  if (!crossbow_hem_restricted_horizontal)    crossbow_hem_deterministic_horizontal = 1;  ((crossbow_method*)bow_argp_method)->train_classifier ();  /* Classify the test documents and output results */  if (crossbow_arg_state.classify_files_dirname)    crossbow_classify_docs_in_dirname      (crossbow_arg_state.classify_files_dirname, 1);  else    crossbow_classify_tagged_docs (bow_doc_test, 2, stdout);}/* Code for query serving */static int crossbow_sockfd;voidcrossbow_socket_init (const char *socket_name, int use_unix_socket){  int servlen, type, bind_ret;  struct sockaddr_in in_addr;  struct sockaddr *sap;  type = use_unix_socket ? AF_UNIX : AF_INET;     crossbow_sockfd = socket(type, SOCK_STREAM, 0);  assert(crossbow_sockfd >= 0);  if (type == AF_UNIX)    {#ifdef WINNT      servlen = 0;  /* so that the compiler is happy */      sap = 0;      assert(WINNT == 0);#else /* !WINNT */      struct sockaddr_un un_addr;      sap = (struct sockaddr *)&un_addr;      bzero((char *)sap, sizeof(un_addr));      strcpy(un_addr.sun_path, socket_name);      servlen = strlen(un_addr.sun_path) + sizeof(un_addr.sun_family) + 1;#endif /* WINNT */    }  else    {      sap = (struct sockaddr *)&in_addr;      bzero((char *)sap, sizeof(in_addr));      in_addr.sin_port = htons(atoi(socket_name));      in_addr.sin_addr.s_addr = htonl(INADDR_ANY);      servlen = sizeof(in_addr);    }  sap->sa_family = type;       bind_ret = bind (crossbow_sockfd, sap, servlen);  assert(bind_ret >= 0);  listen (crossbow_sockfd, 5);}/* Read a single document from the socket, classify it and return   classification */voidcrossbow_serve (){  int newsockfd, clilen;  struct sockaddr cli_addr;  FILE *in, *out;  int ci;  clilen = sizeof(cli_addr);  newsockfd = accept(crossbow_sockfd, &cli_addr, &clilen);  bow_verbosify (bow_progress, "Accepted connection\n");  assert (newsockfd >= 0);  in = fdopen(newsockfd, "r");  out = fdopen(newsockfd, "w");  while (!feof(in))    {      bow_wv *wv = bow_wv_new_from_text_fp (in, NULL);      bow_wa *wa;      if (!wv)	{	  fprintf (out, ".\n");	  fflush (out);	  break;	}      wa = crossbow_classify_doc_new_wa (wv);      bow_wa_sort (wa);      for (ci = 0; ci < wa->length; ci++)	fprintf (out, "%s %g\n", 		 bow_int2str (crossbow_classnames, wa->entry[ci].wi),		 wa->entry[ci].weight);      fprintf (out, ".\n");      fflush (out);      bow_wa_free (wa);      bow_wv_free (wv);    }  fclose(in);  fclose(out);  close(newsockfd);  bow_verbosify (bow_progress, "Closed connection\n");}voidcrossbow_query_serving (){  bow_verbosify (bow_progress, "Starting query server\n");  /* Don't add any new words from the queries to the vocabulary */  bow_word2int_do_not_add = 1;  /* Train the vertical mixture model with EM. */  if (!crossbow_hem_restricted_horizontal)    crossbow_hem_deterministic_horizontal = 1;  ((crossbow_method*)bow_argp_method)->train_classifier ();  bow_verbosify (bow_progress, "Ready to serve!\n");  crossbow_socket_init (crossbow_arg_state.server_port_num, 0);  while (1)    {      bow_verbosify (bow_progress, "Waiting for connection\n");      crossbow_serve();    }}voidcrossbow_print_word_probabilities (){  bow_error ("Not implemented");}voidcrossbow_print_doc_names (){  int di;  int tag = -1;  crossbow_doc *doc;  if (crossbow_arg_state.printing_tag)    {      tag = bow_str2type (crossbow_arg_state.printing_tag);      if (tag == -1)	bow_error ("Argument to --print-doc-names, `%s', is not a tag\n"		   "Try `train', `test', `unlabeled', etc");    }  for (di = 0; di < crossbow_docs->length; di++)    {      doc = bow_array_entry_at_index (crossbow_docs, di);      if (crossbow_arg_state.printing_tag == NULL	  || (tag >= 0 && doc->tag == tag))	printf ("%s\n", doc->filename);    }}voidcrossbow_print_matrix (){  int di, wvi;  crossbow_doc *doc;  bow_wv *wv;  for (di = 0; di < crossbow_docs->length; di++)    {      doc = bow_array_entry_at_index (crossbow_docs, di);      printf ("%s ", doc->filename);      wv = crossbow_wv_at_di (di);      for (wvi = 0; wvi < wv->num_entries; wvi++)	printf ("%s %d ", 		bow_int2word (wv->entry[wvi].wi), wv->entry[wvi].count);      printf ("\n");    }}/* Definitions for using argp command-line processing */const char *argp_program_version ="crossbow " STRINGIFY(CROSSBOW_MAJOR_VERSION) "." STRINGIFY(CROSSBOW_MINOR_VERSION);const char *argp_program_bug_address = "<mccallum@cs.cmu.edu>";static char crossbow_argp_doc[] ="Crossbow -- a document clustering front-end to libbow";static char crossbow_argp_args_doc[] = "[ARG...]";enum {  PRINT_IDF_KEY = 13000,  QUERY_SERVER_KEY,  QUERY_FORK_SERVER_KEY,  CLUSTER_OUTPUT_DIR_KEY,  BUILD_HIER_FROM_DIR_KEY,  CLASSIFY_KEY,  CLASSIFY_FILES_KEY,  PRINT_WORD_PROBABILITIES_KEY,  PRINT_DOC_NAMES_KEY,  INDEX_MULTICLASS_LIST_KEY,  PRINT_MATRIX_KEY,  USE_VOCAB_IN_FILE_KEY,};static struct argp_option crossbow_options[] ={  {0, 0, 0, 0,   "For building data structures from text files:", 1},  {"index", 'i', 0, 0,   "tokenize training documents found under ARG..., build weight vectors, "   "and save them to disk"},  {"index-multiclass-list", INDEX_MULTICLASS_LIST_KEY, "FILE", 0,   "Index the files listed in FILE.  Each line of FILE should contain "   "a filenames followed by a list of classnames to which that file belongs."},  {"cluster", 'c', 0, 0,   "cluster the documents, and write the results to disk"},  {"cluster-output-dir", CLUSTER_OUTPUT_DIR_KEY, "DIR", 0,   "After clustering is finished, write the cluster to directory DIR"},  {"build-hier-from-dir", BUILD_HIER_FROM_DIR_KEY, 0, 0,   "When indexing a single directory, use the directory structure to build "   "a class hierarchy"},  {"classify", CLASSIFY_KEY, 0, 0,   "Split the data into train/test, and classify the test data, outputing "   "results in rainbow format"},  {"classify-files", CLASSIFY_FILES_KEY, "DIRNAME", 0,   "Classify documents in DIRNAME, outputing `filename classname' pairs "   "on each line."},  {"query-server", QUERY_SERVER_KEY, "PORTNUM", 0,   "Run crossbow in server mode, listening on socket number PORTNUM.  "   "You can try it by executing this command, then in a different shell "   "window on the same machine typing `telnet localhost PORTNUM'."},  {"print-word-probabilities", PRINT_WORD_PROBABILITIES_KEY, "FILEPREFIX", 0,   "Print the word probability distribution in each leaf to files named "   "FILEPREFIX-classname"},  {"print-doc-names", PRINT_DOC_NAMES_KEY, "TAG", OPTION_ARG_OPTIONAL,   "Print the filenames of documents contained in the model.  "   "If the optional TAG argument is given, print only the documents "   "that have the specified tag."},  {"print-matrix", PRINT_MATRIX_KEY, 0, 0,   "Print the word/document count matrix in an awk- or perl-accessible "   "format.  Format is sparse and includes the words and the counts."},  {"use-vocab-in-file", USE_VOCAB_IN_FILE_KEY, "FILENAME", 0,   "Limit vocabulary to just those words read as space-separated strings "   "from FILE."},  { 0 }};static error_tcrossbow_parse_opt (int key, char *arg, struct argp_state *state){  switch (key)    {    case 'i':      crossbow_arg_state.what_doing = crossbow_index;      break;    case INDEX_MULTICLASS_LIST_KEY:      crossbow_arg_state.what_doing = crossbow_index_multiclass_list;      crossbow_arg_state.multiclass_list_filename = arg;      break;    case 'c':      crossbow_arg_state.what_doing = crossbow_cluster;      break;    case CLUSTER_OUTPUT_DIR_KEY:      crossbow_arg_state.cluster_output_dir = arg;      break;    case BUILD_HIER_FROM_DIR_KEY:      crossbow_arg_state.build_hier_from_dir = 1;      break;    case CLASSIFY_FILES_KEY:      crossbow_arg_state.classify_files_dirname = arg;    case CLASSIFY_KEY:      crossbow_arg_state.what_doing = crossbow_classify;      break;    case QUERY_SERVER_KEY:      crossbow_arg_state.what_doing = crossbow_query_serving;      crossbow_arg_state.server_port_num = arg;      bow_lexer_document_end_pattern = "\n.\r\n";      break;    case PRINT_WORD_PROBABILITIES_KEY:      crossbow_arg_state.what_doing = crossbow_print_word_probabilities;      break;    case PRINT_DOC_NAMES_KEY:      crossbow_arg_state.what_doing = crossbow_print_doc_names;      crossbow_arg_state.printing_tag = arg;      break;    case PRINT_MATRIX_KEY:      crossbow_arg_state.what_doing = crossbow_print_matrix;      break;    case USE_VOCAB_IN_FILE_KEY:      crossbow_arg_state.vocab_map = bow_int4str_new_from_text_file (arg);      bow_verbosify (bow_progress,		     "Using vocab with %d words from file `%s'\n",		     crossbow_arg_state.vocab_map->str_array_length, arg);      break;    case ARGP_KEY_ARG:      /* Now we consume all the rest of the arguments.  STATE->next is the	 index in STATE->argv of the next argument to be parsed, which is the	 first STRING we're interested in, so we can just use	 `&state->argv[state->next]' as the value for RAINBOW_ARG_STATE->ARGS.	 IN ADDITION, by setting STATE->next to the end of the arguments, we	 can force argp to stop parsing here and return.  */      crossbow_arg_state.non_option_argi = state->next - 1;      if (crossbow_arg_state.what_doing == crossbow_index	  && state->next > state->argc)	{	  /* Zero directory names is not enough. */	  fprintf (stderr, "Need at least one directory to index.\n");	  argp_usage (state);	}      state->next = state->argc;      break;    default:      return ARGP_ERR_UNKNOWN;    }  return 0;}static struct argp crossbow_argp = { crossbow_options, crossbow_parse_opt, crossbow_argp_args_doc,  crossbow_argp_doc, bow_argp_children};/* This method structure is defined in hem.c,    and is the default bow_argp_method */extern crossbow_method hem_cluster_method;intmain (int argc, char *argv[]){  /* Default command-line argument values */  crossbow_arg_state.what_doing = crossbow_cluster;  crossbow_arg_state.cluster_output_dir = NULL;  crossbow_arg_state.build_hier_from_dir = 0;  crossbow_arg_state.print_file_prefix = NULL;  crossbow_arg_state.printing_tag = NULL;  crossbow_arg_state.classify_files_dirname = NULL;  crossbow_arg_state.vocab_map = NULL;  bow_argp_method = (bow_method*)&hem_cluster_method;  /* bow_lexer_toss_words_longer_than = 20; */  /* Parse the command-line arguments. */  argp_parse (&crossbow_argp, argc, argv, 0, 0, &crossbow_arg_state);  crossbow_argv = argv;  crossbow_argc = argc;  if (*crossbow_arg_state.what_doing != crossbow_index      && *crossbow_arg_state.what_doing != crossbow_index_multiclass_list)    {      crossbow_unarchive (bow_data_dirname);      /* Do test/train splits. */      bow_set_doc_types (crossbow_docs, crossbow_classes_count,			 crossbow_classnames);    }  (*crossbow_arg_state.what_doing) ();  if (crossbow_arg_state.cluster_output_dir      && *crossbow_arg_state.what_doing != crossbow_index)    crossbow_archive (bow_data_dirname);  exit (0);}
上一页 1 23
💿 文件大小 12 K
👤 上传用户 yjpynnpl
📂 所属分类 Linux/Unix编程
📄 代码行数 1,396 行
💻 语言类型 C语言
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -