📄 archer.c
字号:
/* Add this CURRENT_DI to the combinted list of hits in DOC_HITS */ assert (current_di < archer_docs->array->length); doc_hits[doc_hits_count].di = current_di; doc_hits[doc_hits_count].score = 0; for (h = 0; h < word_hits_count[pos]; h++) doc_hits[doc_hits_count].score += word_hits[pos][h].wa->entry[current_wai[pos][h]].weight; doc_hits[doc_hits_count].terms_count = 0; doc_hits[doc_hits_count].terms = bow_malloc (MAX_QUERY_WORDS*sizeof (char*)); /* Add score value from the regular terms, if CURRENT_DI appears there */ for (h = 0; h < word_hits_count[reg]; h++) { if (word_hits_count[pos] != 0) { while (current_wai[reg][h] != -1 && (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi < current_di)) { if (current_wai[reg][h] < word_hits[reg][h].wa->length - 1) current_wai[reg][h]++; else current_wai[reg][h] = -1; } } if (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi == current_di) { doc_hits[doc_hits_count].score += word_hits[reg][h].wa->entry[current_wai[reg][h]].weight; doc_hits[doc_hits_count]. terms[doc_hits[doc_hits_count].terms_count] = word_hits[reg][h].term; doc_hits[doc_hits_count].terms_count++; } } doc_hits_count++; if (doc_hits_count >= doc_hits_size) { doc_hits_size *= 2; doc_hits = bow_realloc (doc_hits, (doc_hits_size * sizeof (struct _doc_hit))); } current_di++; goto next_current_di; hit_combination_done: if (doc_hits_count) { fprintf (archer_arg_state.query_out_fp, ",HITCOUNT %d\n", doc_hits_count); num_hits_to_print = MIN (doc_hits_count, archer_arg_state.num_hits_to_print); /* Sort the DOC_HITS list */#if 1 archer_sort_hits (doc_hits, doc_hits_count, num_hits_to_print);#else qsort (doc_hits, doc_hits_count, sizeof (struct _doc_hit), (int(*)(const void*,const void*))compare_doc_hits);#endif for (i = 0; i < num_hits_to_print; i++) { fprintf (archer_arg_state.query_out_fp, "%s %f ", bow_sarray_keystr_at_index (archer_docs, doc_hits[i].di), doc_hits[i].score); for (h = 0; h < word_hits_count[pos]; h++) fprintf (archer_arg_state.query_out_fp, "%s, ", word_hits[pos][h].term); for (h = 0; h < doc_hits[i].terms_count-1; h++) fprintf (archer_arg_state.query_out_fp, "%s, ", doc_hits[i].terms[h]); h = doc_hits[i].terms_count - 1; if (h >= 0) fprintf (archer_arg_state.query_out_fp, "%s", doc_hits[i].terms[h]); fprintf (archer_arg_state.query_out_fp, "\n"); } } fprintf (archer_arg_state.query_out_fp, ".\n"); fflush (archer_arg_state.query_out_fp); /* Free all the junk we malloc'ed */ for (f = 0; f < num_flags; f++) for (h = 0; h < word_hits_count[f]; h++) bow_free ((char*)word_hits[f][h].term); for (h = 0; h < doc_hits_count; h++) bow_free (doc_hits[h].terms); bow_free (doc_hits); bow_free (query_copy);}/* Set up to listen for queries on a socket */voidarcher_query_socket_init (const char *socket_name, int use_unix_socket){ int servlen, type, bind_ret; struct sockaddr_un un_addr; struct sockaddr_in in_addr; struct sockaddr *sap; type = use_unix_socket ? AF_UNIX : AF_INET; archer_sockfd = socket (type, SOCK_STREAM, 0); assert (archer_sockfd >= 0); if (type == AF_UNIX) { sap = (struct sockaddr *)&un_addr; bzero ((char *)sap, sizeof (un_addr)); strcpy (un_addr.sun_path, socket_name); servlen = strlen (un_addr.sun_path) + sizeof(un_addr.sun_family) + 1; } else { sap = (struct sockaddr *)&in_addr; bzero ((char *)sap, sizeof (in_addr)); in_addr.sin_port = htons (atoi (socket_name)); in_addr.sin_addr.s_addr = htonl (INADDR_ANY); servlen = sizeof (in_addr); } sap->sa_family = type; bind_ret = bind (archer_sockfd, sap, servlen); assert (bind_ret >= 0); bow_verbosify (bow_progress, "Listening on port %d\n", atoi (socket_name)); listen (archer_sockfd, 5);}/* We assume that commands are no longer than 1024 characters in length *//* At the moment, we assume that the only possible command is ",HITS <num>" */voidarcher_query_server_process_commands (FILE *fp, int doing_pre_fork_commands){ int first; char buf[1024]; int i; char s[1024]; /* See if the first character of the line is the special char ',' which indicates that this is a command line. */ while ((first = fgetc (fp))) { if ((doing_pre_fork_commands && first != ';') || (!doing_pre_fork_commands && first != ',')) { ungetc (first, fp); return; } /* Retrieve the rest of the line, and process the command. */ fgets ((char *) buf, 1024, fp); if (doing_pre_fork_commands) { if (sscanf (buf, "INDEX %1023s", s) == 1) archer_index_filename (s, NULL); else if (sscanf (buf, "DELETE %1023s", s) == 1) archer_delete_filename (s); else if (strstr (buf, "ARCHIVE") == buf) archer_archive (); else if (strstr (buf, "QUIT") == buf) { archer_archive (); exit (0); } else bow_verbosify (bow_progress, "Unknown pre-fork command `%s'\n", buf); } else { if (sscanf (buf, "HITS %d", &i) == 1) archer_arg_state.num_hits_to_print = i; else bow_verbosify (bow_progress, "Unknown post-fork command `%s'\n", buf); } }}voidarcher_query_serve_one_query (){ int newsockfd, clilen; struct sockaddr cli_addr; FILE *in, *out; int pid; char query_buf[BOW_MAX_WORD_LENGTH]; clilen = sizeof (cli_addr); newsockfd = accept (archer_sockfd, &cli_addr, &clilen); if (newsockfd == -1) bow_error ("Not able to accept connections!\n"); bow_verbosify (bow_progress, "Accepted connection\n"); assert (newsockfd >= 0); in = fdopen (newsockfd, "r"); out = fdopen (newsockfd, "w"); archer_arg_state.query_out_fp = out; archer_arg_state.query_string = query_buf; archer_query_server_process_commands (in, 1); if (archer_arg_state.serve_with_forking) { if ((pid = fork()) != 0) { /* parent - return to server mode */ fclose (in); fclose (out); close (newsockfd); return; } else { /* child - reopen the PV file so we get our own lseek() position */ bow_wi2pv_reopen_pv (archer_wi2pv); } } bow_verbosify (bow_progress, "Processing query...\n"); while (!feof(in)) { /* Strips any special commands from the beginning of the stream */ archer_query_server_process_commands (in, archer_arg_state.serve_with_forking ? 0 : 1); fgets (query_buf, BOW_MAX_WORD_LENGTH, in); archer_query (); } fclose (in); fclose (out); close (newsockfd); bow_verbosify (bow_progress, "Closed connection.\n"); /* Kill the child - don't want it hanging around, sucking up memory :) */ if (archer_arg_state.serve_with_forking) exit (0);}voidarcher_query_serve (){ archer_query_socket_init (archer_arg_state.server_port_num, 0); for (;;) archer_query_serve_one_query ();}voidarcher_print_all (){ int wi; int di; int pi; bow_wi2pv_rewind (archer_wi2pv); for (wi = 0; wi < bow_num_words (); wi++) { for (;;) { bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di, &pi); if (di == -1) break; printf ("%010d %010d %s\n", di, pi, bow_int2word (wi)); } }}voidarcher_print_word_stats (){ bow_wi2pv_print_stats (archer_wi2pv);}/* Definitions for using argp command-line processing */const char *argp_program_version ="archer " STRINGIFY(ARCHER_MAJOR_VERSION) "." STRINGIFY(ARCHER_MINOR_VERSION);const char *argp_program_bug_address = "<mccallum@cs.cmu.edu>";static char archer_argp_doc[] ="Archer -- a document retrieval front-end to libbow";static char archer_argp_args_doc[] = "[ARG...]";enum { QUERY_SERVER_KEY = 3000, QUERY_FORK_SERVER_KEY, INDEX_LINES_KEY, SCORE_IS_RAW_COUNT_KEY,};static struct argp_option archer_options[] ={ {0, 0, 0, 0, "For building data structures from text files:", 1}, {"index", 'i', "DIRNAME", 0, "Tokenize training documents found under DIRNAME, " "and save them to disk"}, {"index-lines", INDEX_LINES_KEY, "FILENAME", 0, "Like --index, except index each line of FILENAME as if it were a " "separate document. Documents are named after sequential line numbers."}, {0, 0, 0, 0, "For doing document retreival using the data structures built with -i:", 2}, {"query", 'q', "WORDS", 0, "tokenize input from stdin [or FILE], then print document most like it"}, {"query-server", QUERY_SERVER_KEY, "PORTNUM", 0, "Run archer in socket server mode."}, {"query-forking-server", QUERY_FORK_SERVER_KEY, "PORTNUM", 0, "Run archer in socket server mode, forking a new process with every " "connection. Allows multiple simultaneous connections."}, {"num-hits-to-show", 'n', "N", 0, "Show the N documents that are most similar to the query text " "(default N=1)"}, {"score-is-raw-count", SCORE_IS_RAW_COUNT_KEY, 0, 0, "Instead of using a weighted sum of logs, the score of a document " "will be simply the number of terms in both the query and the document."}, {0, 0, 0, 0, "Diagnostics", 3}, {"print-all", 'p', 0, 0, "Print, in unsorted order, all the document indices, positions and words"}, {"print-word-stats", 's', 0, 0, "Print the number of times each word occurs."}, { 0 }};static error_tarcher_parse_opt (int key, char *arg, struct argp_state *state){ switch (key) { case 'q': archer_arg_state.what_doing = archer_query; archer_arg_state.query_string = arg; break; case 'i': archer_arg_state.what_doing = archer_index; archer_arg_state.dirname = arg; break; case INDEX_LINES_KEY: archer_arg_state.what_doing = archer_index_lines; archer_arg_state.dirname = arg; break; case 'p': archer_arg_state.what_doing = archer_print_all; break; case 'n': archer_arg_state.num_hits_to_print = atoi (arg); break; case 's': archer_arg_state.what_doing = archer_print_word_stats; break; case SCORE_IS_RAW_COUNT_KEY: archer_arg_state.score_is_raw_count = 1; break; case QUERY_FORK_SERVER_KEY: archer_arg_state.serve_with_forking = 1; case QUERY_SERVER_KEY: archer_arg_state.what_doing = archer_query_serve; archer_arg_state.server_port_num = arg; break; case ARGP_KEY_ARG: /* Now we consume all the rest of the arguments. STATE->next is the index in STATE->argv of the next argument to be parsed, which is the first STRING we're interested in, so we can just use `&state->argv[state->next]' as the value for ARCHER_ARG_STATE->ARGS. IN ADDITION, by setting STATE->next to the end of the arguments, we can force argp to stop parsing here and return. */ archer_arg_state.non_option_argi = state->next - 1; if (archer_arg_state.what_doing == archer_index && state->next > state->argc) { /* Zero directory names is not enough. */ fprintf (stderr, "Need at least one directory to index.\n"); argp_usage (state); } state->next = state->argc; break; default: return ARGP_ERR_UNKNOWN; } return 0;}static struct argp archer_argp = { archer_options, archer_parse_opt, archer_argp_args_doc, archer_argp_doc, bow_argp_children};/* The main() function. */intmain (int argc, char *argv[]){ /* Prevents zombie children in System V environments */ signal (SIGCHLD, SIG_IGN); /* Default command-line argument values */ archer_arg_state.what_doing = NULL; archer_arg_state.num_hits_to_print = 10; archer_arg_state.dirname = NULL; archer_arg_state.query_string = NULL; archer_arg_state.serve_with_forking = 0; archer_arg_state.query_out_fp = stdout; archer_arg_state.score_is_raw_count = 0; /* Parse the command-line arguments. */ argp_parse (&archer_argp, argc, argv, 0, 0, &archer_arg_state); if (archer_arg_state.what_doing == NULL) bow_error ("No action specified on command-line."); if (*archer_arg_state.what_doing != archer_index && *archer_arg_state.what_doing != archer_index_lines) archer_unarchive (); (*archer_arg_state.what_doing) (); exit (0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -