⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 archer.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 3 页
字号:
  /* Add this CURRENT_DI to the combinted list of hits in DOC_HITS */  assert (current_di < archer_docs->array->length);  doc_hits[doc_hits_count].di = current_di;  doc_hits[doc_hits_count].score = 0;  for (h = 0; h < word_hits_count[pos]; h++)    doc_hits[doc_hits_count].score +=       word_hits[pos][h].wa->entry[current_wai[pos][h]].weight;  doc_hits[doc_hits_count].terms_count = 0;  doc_hits[doc_hits_count].terms = bow_malloc (MAX_QUERY_WORDS*sizeof (char*));  /* Add score value from the regular terms, if CURRENT_DI appears there */  for (h = 0; h < word_hits_count[reg]; h++)    {      if (word_hits_count[pos] != 0)	{	  while (current_wai[reg][h] != -1		 && (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi		     < current_di))	    {	      if (current_wai[reg][h] < word_hits[reg][h].wa->length - 1)		current_wai[reg][h]++;	      else		current_wai[reg][h] = -1;	    }	}      if (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi	  == current_di)	{	  doc_hits[doc_hits_count].score += 	    word_hits[reg][h].wa->entry[current_wai[reg][h]].weight;	  doc_hits[doc_hits_count].	    terms[doc_hits[doc_hits_count].terms_count]	    = word_hits[reg][h].term;	  doc_hits[doc_hits_count].terms_count++;	}    }  doc_hits_count++;  if (doc_hits_count >= doc_hits_size)    {      doc_hits_size *= 2;      doc_hits = bow_realloc (doc_hits, (doc_hits_size					 * sizeof (struct _doc_hit)));    }  current_di++;  goto next_current_di; hit_combination_done:  if (doc_hits_count)    {      fprintf (archer_arg_state.query_out_fp, ",HITCOUNT %d\n", 	       doc_hits_count);      num_hits_to_print = MIN (doc_hits_count, 			       archer_arg_state.num_hits_to_print);      /* Sort the DOC_HITS list */#if 1      archer_sort_hits (doc_hits, doc_hits_count, num_hits_to_print);#else      qsort (doc_hits, doc_hits_count, sizeof (struct _doc_hit), 	     (int(*)(const void*,const void*))compare_doc_hits);#endif      for (i = 0; i < num_hits_to_print; i++)	{	  fprintf (archer_arg_state.query_out_fp,		   "%s %f ", bow_sarray_keystr_at_index (archer_docs, doc_hits[i].di), 		   doc_hits[i].score);	  for (h = 0; h < word_hits_count[pos]; h++)	    fprintf (archer_arg_state.query_out_fp, 		     "%s, ", word_hits[pos][h].term);	  for (h = 0; h < doc_hits[i].terms_count-1; h++)	    fprintf (archer_arg_state.query_out_fp, 		     "%s, ", doc_hits[i].terms[h]);	  h = doc_hits[i].terms_count - 1;	  if (h >= 0)	    fprintf (archer_arg_state.query_out_fp, 		     "%s", doc_hits[i].terms[h]);	  fprintf (archer_arg_state.query_out_fp, "\n");	}    }  fprintf (archer_arg_state.query_out_fp, ".\n");  fflush (archer_arg_state.query_out_fp);  /* Free all the junk we malloc'ed */  for (f = 0; f < num_flags; f++)    for (h = 0; h < word_hits_count[f]; h++)      bow_free ((char*)word_hits[f][h].term);  for (h = 0; h < doc_hits_count; h++)    bow_free (doc_hits[h].terms);  bow_free (doc_hits);  bow_free (query_copy);}/* Set up to listen for queries on a socket */voidarcher_query_socket_init (const char *socket_name, int use_unix_socket){  int servlen, type, bind_ret;  struct sockaddr_un un_addr;  struct sockaddr_in in_addr;  struct sockaddr *sap;  type = use_unix_socket ? AF_UNIX : AF_INET;  archer_sockfd = socket (type, SOCK_STREAM, 0);  assert (archer_sockfd >= 0);  if (type == AF_UNIX)    {      sap = (struct sockaddr *)&un_addr;      bzero ((char *)sap, sizeof (un_addr));      strcpy (un_addr.sun_path, socket_name);      servlen = strlen (un_addr.sun_path) + sizeof(un_addr.sun_family) + 1;    }  else    {      sap = (struct sockaddr *)&in_addr;      bzero ((char *)sap, sizeof (in_addr));      in_addr.sin_port = htons (atoi (socket_name));      in_addr.sin_addr.s_addr = htonl (INADDR_ANY);      servlen = sizeof (in_addr);    }  sap->sa_family = type;       bind_ret = bind (archer_sockfd, sap, servlen);  assert (bind_ret >= 0);  bow_verbosify (bow_progress, "Listening on port %d\n", atoi (socket_name));  listen (archer_sockfd, 5);}/* We assume that commands are no longer than 1024 characters in length *//* At the moment, we assume that the only possible command is ",HITS <num>" */voidarcher_query_server_process_commands (FILE *fp, int doing_pre_fork_commands){  int first;  char buf[1024];  int i;  char s[1024];  /* See if the first character of the line is the special char ','      which indicates that this is a command line. */  while ((first = fgetc (fp)))    {      if ((doing_pre_fork_commands && first != ';')	  || (!doing_pre_fork_commands && first != ','))	{	  ungetc (first, fp);	  return;	}      /* Retrieve the rest of the line, and process the command. */      fgets ((char *) buf, 1024, fp);      if (doing_pre_fork_commands) 	{	  if (sscanf (buf, "INDEX %1023s", s) == 1)	    archer_index_filename (s, NULL);	  else if (sscanf (buf, "DELETE %1023s", s) == 1)	    archer_delete_filename (s);	  else if (strstr (buf, "ARCHIVE") == buf)	    archer_archive ();	  else if (strstr (buf, "QUIT") == buf)	    {	      archer_archive ();	      exit (0);	    }	  else	    bow_verbosify (bow_progress,			   "Unknown pre-fork command `%s'\n", buf);	}      else	{	  if (sscanf (buf, "HITS %d", &i) == 1)	    archer_arg_state.num_hits_to_print = i;	  else	    bow_verbosify (bow_progress,			   "Unknown post-fork command `%s'\n", buf);	}    }}voidarcher_query_serve_one_query (){  int newsockfd, clilen;  struct sockaddr cli_addr;  FILE *in, *out;  int pid;  char query_buf[BOW_MAX_WORD_LENGTH];   clilen = sizeof (cli_addr);  newsockfd = accept (archer_sockfd, &cli_addr, &clilen);  if (newsockfd == -1)    bow_error ("Not able to accept connections!\n");  bow_verbosify (bow_progress, "Accepted connection\n");  assert (newsockfd >= 0);  in = fdopen (newsockfd, "r");  out = fdopen (newsockfd, "w");  archer_arg_state.query_out_fp = out;  archer_arg_state.query_string = query_buf;  archer_query_server_process_commands (in, 1);  if (archer_arg_state.serve_with_forking)    {      if ((pid = fork()) != 0)	{	  /* parent - return to server mode */	  fclose (in);	  fclose (out);	  close (newsockfd);	  return;	}      else	{	  /* child - reopen the PV file so we get our own lseek() position */	  bow_wi2pv_reopen_pv (archer_wi2pv);	}    }  bow_verbosify (bow_progress, "Processing query...\n");  while (!feof(in))    {      /* Strips any special commands from the beginning of the stream */      archer_query_server_process_commands	(in, archer_arg_state.serve_with_forking ? 0 : 1);      fgets (query_buf, BOW_MAX_WORD_LENGTH, in);      archer_query ();    }  fclose (in);  fclose (out);  close (newsockfd);  bow_verbosify (bow_progress, "Closed connection.\n");   /* Kill the child - don't want it hanging around, sucking up memory :) */  if (archer_arg_state.serve_with_forking)    exit (0);}voidarcher_query_serve (){  archer_query_socket_init (archer_arg_state.server_port_num, 0);  for (;;)    archer_query_serve_one_query ();}voidarcher_print_all (){  int wi;  int di;  int pi;  bow_wi2pv_rewind (archer_wi2pv);  for (wi = 0; wi < bow_num_words (); wi++)    {      for (;;)	{	  bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di, &pi);	  if (di == -1)	    break;	  printf ("%010d %010d %s\n", di, pi, bow_int2word (wi));	}    }}voidarcher_print_word_stats (){  bow_wi2pv_print_stats (archer_wi2pv);}/* Definitions for using argp command-line processing */const char *argp_program_version ="archer " STRINGIFY(ARCHER_MAJOR_VERSION) "." STRINGIFY(ARCHER_MINOR_VERSION);const char *argp_program_bug_address = "<mccallum@cs.cmu.edu>";static char archer_argp_doc[] ="Archer -- a document retrieval front-end to libbow";static char archer_argp_args_doc[] = "[ARG...]";enum {  QUERY_SERVER_KEY = 3000,  QUERY_FORK_SERVER_KEY,  INDEX_LINES_KEY,  SCORE_IS_RAW_COUNT_KEY,};static struct argp_option archer_options[] ={  {0, 0, 0, 0,   "For building data structures from text files:", 1},  {"index", 'i', "DIRNAME", 0,   "Tokenize training documents found under DIRNAME, "   "and save them to disk"},  {"index-lines", INDEX_LINES_KEY, "FILENAME", 0,   "Like --index, except index each line of FILENAME as if it were a "   "separate document.  Documents are named after sequential line numbers."},  {0, 0, 0, 0,   "For doing document retreival using the data structures built with -i:", 2},  {"query", 'q', "WORDS", 0,    "tokenize input from stdin [or FILE], then print document most like it"},  {"query-server", QUERY_SERVER_KEY, "PORTNUM", 0,   "Run archer in socket server mode."},  {"query-forking-server", QUERY_FORK_SERVER_KEY, "PORTNUM", 0,   "Run archer in socket server mode, forking a new process with every "   "connection.  Allows multiple simultaneous connections."},  {"num-hits-to-show", 'n', "N", 0,   "Show the N documents that are most similar to the query text "   "(default N=1)"},  {"score-is-raw-count", SCORE_IS_RAW_COUNT_KEY, 0, 0,   "Instead of using a weighted sum of logs, the score of a document "   "will be simply the number of terms in both the query and the document."},  {0, 0, 0, 0,   "Diagnostics", 3},  {"print-all", 'p', 0, 0,   "Print, in unsorted order, all the document indices, positions and words"},  {"print-word-stats", 's', 0, 0,   "Print the number of times each word occurs."},  { 0 }};static error_tarcher_parse_opt (int key, char *arg, struct argp_state *state){  switch (key)    {    case 'q':      archer_arg_state.what_doing = archer_query;      archer_arg_state.query_string = arg;      break;    case 'i':      archer_arg_state.what_doing = archer_index;      archer_arg_state.dirname = arg;      break;    case INDEX_LINES_KEY:      archer_arg_state.what_doing = archer_index_lines;      archer_arg_state.dirname = arg;      break;    case 'p':      archer_arg_state.what_doing = archer_print_all;      break;    case 'n':      archer_arg_state.num_hits_to_print = atoi (arg);      break;    case 's':      archer_arg_state.what_doing = archer_print_word_stats;      break;    case SCORE_IS_RAW_COUNT_KEY:      archer_arg_state.score_is_raw_count = 1;      break;    case QUERY_FORK_SERVER_KEY:      archer_arg_state.serve_with_forking = 1;    case QUERY_SERVER_KEY:      archer_arg_state.what_doing = archer_query_serve;      archer_arg_state.server_port_num = arg;      break;    case ARGP_KEY_ARG:      /* Now we consume all the rest of the arguments.  STATE->next is the	 index in STATE->argv of the next argument to be parsed, which is the	 first STRING we're interested in, so we can just use	 `&state->argv[state->next]' as the value for ARCHER_ARG_STATE->ARGS.	 IN ADDITION, by setting STATE->next to the end of the arguments, we	 can force argp to stop parsing here and return.  */      archer_arg_state.non_option_argi = state->next - 1;      if (archer_arg_state.what_doing == archer_index	  && state->next > state->argc)	{	  /* Zero directory names is not enough. */	  fprintf (stderr, "Need at least one directory to index.\n");	  argp_usage (state);	}      state->next = state->argc;      break;    default:      return ARGP_ERR_UNKNOWN;    }  return 0;}static struct argp archer_argp = { archer_options, archer_parse_opt, archer_argp_args_doc,  archer_argp_doc, bow_argp_children};/* The main() function. */intmain (int argc, char *argv[]){  /* Prevents zombie children in System V environments */  signal (SIGCHLD, SIG_IGN);  /* Default command-line argument values */  archer_arg_state.what_doing = NULL;  archer_arg_state.num_hits_to_print = 10;  archer_arg_state.dirname = NULL;  archer_arg_state.query_string = NULL;  archer_arg_state.serve_with_forking = 0;  archer_arg_state.query_out_fp = stdout;  archer_arg_state.score_is_raw_count = 0;  /* Parse the command-line arguments. */  argp_parse (&archer_argp, argc, argv, 0, 0, &archer_arg_state);  if (archer_arg_state.what_doing == NULL)    bow_error ("No action specified on command-line.");  if (*archer_arg_state.what_doing != archer_index      && *archer_arg_state.what_doing != archer_index_lines)    archer_unarchive ();  (*archer_arg_state.what_doing) ();  exit (0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -