📄 archer.c
字号:
/* archer - a document retreival front-end to libbow. *//* Copyright (C) 1998, 1999 Andrew McCallum Written by: Andrew Kachites McCallum <mccallum@cs.cmu.edu> This file is part of the Bag-Of-Words Library, `libbow'. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation, version 2. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#define _FILE_OFFSET_BITS 64#include <bow/libbow.h>#include <argp.h>#include <bow/archer.h>#include <errno.h> /* needed on DEC Alpha's */#include <sys/types.h>#include <sys/socket.h>#include <sys/un.h>#include <netinet/in.h>#include <netdb.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <strings.h>#include <signal.h>#include <unistd.h>#include <fcntl.h>#include <ctype.h>#include <limits.h>#include <sys/mman.h>/* The version number of this program. */#define ARCHER_MAJOR_VERSION 0#define ARCHER_MINOR_VERSION 0/* Global variables *//* The document/word/position matrix */bow_wi2pv *archer_wi2pv;/* The list of documents. */bow_sarray *archer_docs;/* The file descriptor of the socket on which we can act as a query-server. */int archer_sockfd;/* The variables that are set by command-line options. */struct archer_arg_state{ /* What this invocation of archer to do? 
*/ void (*what_doing)(); int non_option_argi; int num_hits_to_print; FILE *query_out_fp; const char *dirname; const char *query_string; const char *server_port_num; int serve_with_forking; int score_is_raw_count;} archer_arg_state;/* Functions for creating, reading, writing a archer_doc */intarcher_doc_write (archer_doc *doc, FILE *fp){ int ret; ret = bow_fwrite_int (doc->tag, fp); ret += bow_fwrite_int (doc->word_count, fp); ret += bow_fwrite_int (doc->di, fp); return ret;}intarcher_doc_read (archer_doc *doc, FILE *fp){ int ret; int tag; ret = bow_fread_int (&tag, fp); doc->tag = tag; ret += bow_fread_int (&(doc->word_count), fp); ret += bow_fread_int (&(doc->di), fp); return ret;}voidarcher_doc_free (archer_doc *doc){}/* Writing and reading the word/document stats to disk. *//* Write the stats in the directory DATA_DIRNAME. */voidarcher_archive (){ char filename[BOW_MAX_WORD_LENGTH]; FILE *fp; sprintf (filename, "%s/vocabulary", bow_data_dirname); fp = bow_fopen (filename, "wb"); bow_words_write (fp); fclose (fp); sprintf (filename, "%s/wi2pv", bow_data_dirname); bow_wi2pv_write_to_filename (archer_wi2pv, filename); sprintf (filename, "%s/docs", bow_data_dirname); fp = bow_fopen (filename, "wb"); bow_sarray_write (archer_docs, (int(*)(void*,FILE*))archer_doc_write, fp); fclose (fp); fflush (archer_wi2pv->fp);}/* Read the stats from the directory DATA_DIRNAME. 
*/voidarcher_unarchive (){ char filename[BOW_MAX_WORD_LENGTH]; FILE *fp; bow_verbosify (bow_progress, "Loading data files..."); sprintf (filename, "%s/vocabulary", bow_data_dirname); bow_words_read_from_file (filename); sprintf (filename, "%s/wi2pv", bow_data_dirname); archer_wi2pv = bow_wi2pv_new_from_filename (filename); sprintf (filename, "%s/docs", bow_data_dirname); fp = bow_fopen (filename, "rb"); archer_docs = bow_sarray_new_from_data_fp ((int(*)(void*,FILE*))archer_doc_read, archer_doc_free, fp); fclose (fp); bow_verbosify (bow_progress, "\n");}intarcher_index_filename (const char *filename, void *unused){ int di; archer_doc doc, *doc_ptr; int wi; int pi = 0; char word[BOW_MAX_WORD_LENGTH];#define USE_FAST_LEXER 1#if !USE_FAST_LEXER bow_lex *lex; FILE *fp;#endif /* Make sure this file isn't already in the index. If it is just return (after undeleting it, if necessary. */ doc_ptr = bow_sarray_entry_at_keystr (archer_docs, filename); if (doc_ptr) { if (doc_ptr->word_count < 0) doc_ptr->word_count = -(doc_ptr->word_count); return 1; } /* The index of this new document is the next available index in the array of documents. */ di = archer_docs->array->length;#if !USE_FAST_LEXER fp = fopen (filename, "r"); if (fp == NULL) { perror ("bow_fopen"); return 0; } /* NOTE: This will read just the first document from the file. 
*/ lex = bow_default_lexer->open_text_fp (bow_default_lexer, fp, filename); if (lex == NULL) { fclose (fp); return 0; } while (bow_default_lexer->get_word (bow_default_lexer, lex, word, BOW_MAX_WORD_LENGTH)) { wi = bow_word2int_add_occurrence (word); if (wi < 0) continue; bow_wi2pv_add_wi_di_pi (archer_wi2pv, wi, di, pi);#if 0 /* Debugging */ { int di_read, pi_read; bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di_read, &pi_read); assert (di_read == di); assert (pi_read == pi); if (di == 0) printf ("%010d %010d %s\n", di, pi, bow_int2word (wi)); }#endif pi++; } bow_default_lexer->close (bow_default_lexer, lex); fclose (fp);#else /* USE_FAST_LEXER */ { int fd, c, wordlen; //bow_strtrie *strie; //int strtrie_index; unsigned hashid; char *docbuf; char *docbufptr; char *docbufptr_end; //size_t page_size = (size_t) sysconf (_SC_PAGESIZE); struct stat statbuf; if (!word_map) bow_words_set_map (NULL, 0); fd = open (filename, O_RDONLY); if (fd == -1) { perror ("archer index_filename open"); return 0; } fstat (fd, &statbuf); //statbuf.st_size = 20 * 1024; docbuf = mmap (NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); close (fd); if (docbuf == (void*)-1) { fprintf (stderr, "\narcher index_filename(%s)\n", filename); perror (" mmap"); return 0; } docbufptr_end = docbuf + statbuf.st_size; /* One time through this loop for each word */ for (docbufptr = docbuf;;) { hashid = wordlen = 0; /* Ignore characters until we get a beginning character. */ while (!isalpha((unsigned)*docbufptr)) if (++docbufptr >= docbufptr_end) goto done_with_file; /* Add alphabetics to the word */ do { c = tolower((unsigned)*docbufptr); word[wordlen++] = c; /* The following must exactly match the behavior of int4str.c:_str2id */ hashid = 131 * hashid + c; docbufptr++; } while (wordlen < BOW_MAX_WORD_LENGTH && isalnum((unsigned)*docbufptr)); if (wordlen == BOW_MAX_WORD_LENGTH) { /* Word is longer than MAX, consume it and skip it. 
*/ while (isalpha(*docbufptr++)) ; continue; } word[wordlen] = '\0'; /* Token is now in WORD; next see if it's too short or a stopword */ if (wordlen < 2 || wordlen > 30 || bow_stoplist_present_hash (word, hashid)) continue; /* Get the integer index of the word in WORD */ wi = _bow_str2int (word_map, word, hashid); bow_wi2pv_add_wi_di_pi (archer_wi2pv, wi, di, pi); if (docbufptr >= docbufptr_end) break; pi++; } done_with_file: munmap (docbuf, statbuf.st_size); }#endif /* USE_FAST_LEXER */ doc.tag = bow_doc_train; doc.word_count = pi; doc.di = di; bow_sarray_add_entry_with_keystr (archer_docs, &doc, filename); if (di % 200 == 0) bow_verbosify (bow_progress, "\r%8d |V|=%10d", di, bow_num_words()); di++; return pi;}voidarcher_index (){ archer_docs = bow_sarray_new (0, sizeof (archer_doc), archer_doc_free); archer_wi2pv = bow_wi2pv_new (0, "pv"); bow_verbosify (bow_progress, "Indexing files: "); bow_map_filenames_from_dir (archer_index_filename, NULL, archer_arg_state.dirname, ""); bow_verbosify (bow_progress, "\n"); archer_archive (); /* To close the FP for FILENAME_PV */ bow_wi2pv_free (archer_wi2pv);}/* Index each line of ARCHER_ARG_STATE.DIRNAME as if it were a separate file, named after the line number. 
*/
/* Despite its name, ARCHER_ARG_STATE.DIRNAME is opened here as a
   single text file.  Each line read with fgets that the default lexer
   accepts becomes one document, keyed by its zero-padded document
   index ("%08d").  A line longer than the buffer is returned by fgets
   in several pieces, and each piece is indexed as its own document.
   When the file is exhausted the index is written out with
   archer_archive and ARCHER_WI2PV is freed. */
void
archer_index_lines ()
{
  /* An enum constant, unlike a `static const int', is a constant
     expression in C, so BUF is an ordinary array rather than a VLA. */
  enum { max_line_length = 2048 };
  char buf[max_line_length];
  FILE *fp;
  archer_doc doc;
  bow_lex *lex;
  char word[BOW_MAX_WORD_LENGTH];
  int wi, di, pi;
  char filename[1024];

  archer_docs = bow_sarray_new (0, sizeof (archer_doc), archer_doc_free);
  archer_wi2pv = bow_wi2pv_new (0, "pv");
  fp = bow_fopen (archer_arg_state.dirname, "r");
  bow_verbosify (bow_progress, "Indexing lines:              ");
  while (fgets (buf, max_line_length, fp))
    {
      lex = bow_default_lexer->open_str (bow_default_lexer, buf);
      if (lex == NULL)
        continue;
      di = archer_docs->array->length;
      snprintf (filename, sizeof (filename), "%08d", di);
      pi = 0;
      while (bow_default_lexer->get_word (bow_default_lexer,
                                          lex, word, BOW_MAX_WORD_LENGTH))
        {
          wi = bow_word2int_add_occurrence (word);
          if (wi < 0)
            continue;
          bow_wi2pv_add_wi_di_pi (archer_wi2pv, wi, di, pi);
          /* Advance the position for every recorded word, as
             archer_index_filename does.  Incrementing only after the
             document was stored left every word at position 0 and
             every document with a word count of 0. */
          pi++;
        }
      bow_default_lexer->close (bow_default_lexer, lex);
      doc.tag = bow_doc_train;
      doc.word_count = pi;
      doc.di = di;
      bow_sarray_add_entry_with_keystr (archer_docs, &doc, filename);
    }
  fclose (fp);
  bow_verbosify (bow_progress, "\n");

  archer_archive ();
  /* To close the FP for FILENAME_PV */
  bow_wi2pv_free (archer_wi2pv);
}

/* Set the special flag in FILENAME's doc structure indicating that this document has been removed from the index. Return zero on success, non-zero on failure. 
*/intarcher_delete_filename (const char *filename){ archer_doc *doc; doc = bow_sarray_entry_at_keystr (archer_docs, filename); if (doc) { doc->word_count = -(doc->word_count); return 0; } return 1;}bow_wa *archer_query_hits_matching_wi (int wi, int *occurrence_count){ int count = 0; int di, pi; bow_wa *wa; if (wi >= archer_wi2pv->entry_count && archer_wi2pv->entry[wi].word_count <= 0) return NULL; wa = bow_wa_new (0); bow_pv_rewind (&(archer_wi2pv->entry[wi]), archer_wi2pv->fp); bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di, &pi); while (di != -1) { bow_wa_add_to_end (wa, di, 1); count++; bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di, &pi); } *occurrence_count = count; return wa;}/* Temporary constant. Fix this soon! */#define MAX_QUERY_WORDS 50bow_wa *archer_query_hits_matching_sequence (const char *query_string, const char *suffix_string){ int query[MAX_QUERY_WORDS]; /* WI's in the query */ int di[MAX_QUERY_WORDS]; int pi[MAX_QUERY_WORDS]; int query_len; int max_di, max_pi; int wi, i; bow_lex *lex; char word[BOW_MAX_WORD_LENGTH];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -