📄 archer.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* archer - a document retreival front-end to libbow. *//* Copyright (C) 1998, 1999 Andrew McCallum   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>   This file is part of the Bag-Of-Words Library, `libbow'.   This library is free software; you can redistribute it and/or   modify it under the terms of the GNU Library General Public License   as published by the Free Software Foundation, version 2.      This library is distributed in the hope that it will be useful,   but WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   Library General Public License for more details.   You should have received a copy of the GNU Library General Public   License along with this library; if not, write to the Free Software   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#define _FILE_OFFSET_BITS 64#include <bow/libbow.h>#include <argp.h>#include <bow/archer.h>#include <errno.h>		/* needed on DEC Alpha's */#include <sys/types.h>#include <sys/socket.h>#include <sys/un.h>#include <netinet/in.h>#include <netdb.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <strings.h>#include <signal.h>#include <unistd.h>#include <fcntl.h>#include <ctype.h>#include <limits.h>#include <sys/mman.h>/* The version number of this program. */#define ARCHER_MAJOR_VERSION 0#define ARCHER_MINOR_VERSION 0/* Global variables *//* The document/word/position matrix */bow_wi2pv *archer_wi2pv;/* The list of documents. */bow_sarray *archer_docs;/* The file descriptor of the socket on which we can act as a query-server. */int archer_sockfd;/* The variables that are set by command-line options. */struct archer_arg_state{  /* What this invocation of archer to do? */  void (*what_doing)();  int non_option_argi;  int num_hits_to_print;  FILE *query_out_fp;  const char *dirname;  const char *query_string;  const char *server_port_num;  int serve_with_forking;  int score_is_raw_count;} archer_arg_state;/* Functions for creating, reading, writing a archer_doc */intarcher_doc_write (archer_doc *doc, FILE *fp){  int ret;  ret = bow_fwrite_int (doc->tag, fp);  ret += bow_fwrite_int (doc->word_count, fp);  ret += bow_fwrite_int (doc->di, fp);  return ret;}intarcher_doc_read (archer_doc *doc, FILE *fp){  int ret;  int tag;  ret = bow_fread_int (&tag, fp);  doc->tag = tag;  ret += bow_fread_int (&(doc->word_count), fp);  ret += bow_fread_int (&(doc->di), fp);  return ret;}voidarcher_doc_free (archer_doc *doc){}/* Writing and reading the word/document stats to disk. *//* Write the stats in the directory DATA_DIRNAME. */voidarcher_archive (){  char filename[BOW_MAX_WORD_LENGTH];  FILE *fp;  sprintf (filename, "%s/vocabulary", bow_data_dirname);  fp = bow_fopen (filename, "wb");  bow_words_write (fp);  fclose (fp);  sprintf (filename, "%s/wi2pv", bow_data_dirname);  bow_wi2pv_write_to_filename (archer_wi2pv, filename);  sprintf (filename, "%s/docs", bow_data_dirname);  fp = bow_fopen (filename, "wb");  bow_sarray_write (archer_docs, (int(*)(void*,FILE*))archer_doc_write, fp);  fclose (fp);  fflush (archer_wi2pv->fp);}/* Read the stats from the directory DATA_DIRNAME. */voidarcher_unarchive (){  char filename[BOW_MAX_WORD_LENGTH];  FILE *fp;  bow_verbosify (bow_progress, "Loading data files...");  sprintf (filename, "%s/vocabulary", bow_data_dirname);  bow_words_read_from_file (filename);  sprintf (filename, "%s/wi2pv", bow_data_dirname);  archer_wi2pv = bow_wi2pv_new_from_filename (filename);  sprintf (filename, "%s/docs", bow_data_dirname);  fp = bow_fopen (filename, "rb");  archer_docs =     bow_sarray_new_from_data_fp ((int(*)(void*,FILE*))archer_doc_read, 				archer_doc_free, fp);  fclose (fp);  bow_verbosify (bow_progress, "\n");}intarcher_index_filename (const char *filename, void *unused){  int di;  archer_doc doc, *doc_ptr;  int wi;  int pi = 0;  char word[BOW_MAX_WORD_LENGTH];#define USE_FAST_LEXER 1#if !USE_FAST_LEXER  bow_lex *lex;  FILE *fp;#endif  /* Make sure this file isn't already in the index.  If it is just     return (after undeleting it, if necessary. */  doc_ptr = bow_sarray_entry_at_keystr (archer_docs, filename);  if (doc_ptr)    {      if (doc_ptr->word_count < 0)	doc_ptr->word_count = -(doc_ptr->word_count);      return 1;    }  /* The index of this new document is the next available index in the     array of documents. */  di = archer_docs->array->length;#if !USE_FAST_LEXER  fp = fopen (filename, "r");  if (fp == NULL)    {      perror ("bow_fopen");      return 0;    }  /* NOTE: This will read just the first document from the file. */  lex = bow_default_lexer->open_text_fp (bow_default_lexer, fp, filename);  if (lex == NULL)    {      fclose (fp);      return 0;    }  while (bow_default_lexer->get_word (bow_default_lexer,				      lex, word, BOW_MAX_WORD_LENGTH))    {      wi = bow_word2int_add_occurrence (word);      if (wi < 0)	continue;      bow_wi2pv_add_wi_di_pi (archer_wi2pv, wi, di, pi);#if 0      /* Debugging */      {	int di_read, pi_read;	bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di_read, &pi_read);	assert (di_read == di);	assert (pi_read == pi);	if (di == 0)	  printf ("%010d %010d %s\n", di, pi, bow_int2word (wi));      }#endif      pi++;    }  bow_default_lexer->close (bow_default_lexer, lex);  fclose (fp);#else /* USE_FAST_LEXER */  {    int fd, c, wordlen;    //bow_strtrie *strie;    //int strtrie_index;    unsigned hashid;    char *docbuf;    char *docbufptr;    char *docbufptr_end;    //size_t page_size = (size_t) sysconf (_SC_PAGESIZE);    struct stat statbuf;    if (!word_map)      bow_words_set_map (NULL, 0);    fd = open (filename, O_RDONLY);    if (fd == -1)      {				perror ("archer index_filename open");				return 0;      }    fstat (fd, &statbuf);    //statbuf.st_size = 20 * 1024;    docbuf = mmap (NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);    close (fd);    if (docbuf == (void*)-1)      {				fprintf (stderr, "\narcher index_filename(%s)\n", filename);				perror (" mmap");				return 0;      }    docbufptr_end = docbuf + statbuf.st_size;    /* One time through this loop for each word */    for (docbufptr = docbuf;;)      {	hashid = wordlen = 0;	/* Ignore characters until we get a beginning character. */	while (!isalpha((unsigned)*docbufptr))	  if (++docbufptr >= docbufptr_end)	    goto done_with_file;	/* Add alphabetics to the word */	do	  {	    c = tolower((unsigned)*docbufptr);	    word[wordlen++] = c;	    /* The following must exactly match the behavior of               int4str.c:_str2id */	    hashid = 131 * hashid + c;	    docbufptr++;	  }	while (wordlen < BOW_MAX_WORD_LENGTH	       && isalnum((unsigned)*docbufptr));	if (wordlen == BOW_MAX_WORD_LENGTH)	  {	    /* Word is longer than MAX, consume it and skip it. */	    while (isalpha(*docbufptr++))	      ;	    continue;	  }	word[wordlen] = '\0';	/* Token is now in WORD; next see if it's too short or a stopword */	if (wordlen < 2 	    || wordlen > 30	    || bow_stoplist_present_hash (word, hashid))	  continue;	/* Get the integer index of the word in WORD */	wi = _bow_str2int (word_map, word, hashid);	bow_wi2pv_add_wi_di_pi (archer_wi2pv, wi, di, pi);	if (docbufptr >= docbufptr_end)	  break;	pi++;      }  done_with_file:    munmap (docbuf, statbuf.st_size);  }#endif /* USE_FAST_LEXER */  doc.tag = bow_doc_train;  doc.word_count = pi;  doc.di = di;  bow_sarray_add_entry_with_keystr (archer_docs, &doc, filename);        if (di % 200 == 0)    bow_verbosify (bow_progress, "\r%8d |V|=%10d", di, bow_num_words());  di++;  return pi;}voidarcher_index (){  archer_docs = bow_sarray_new (0, sizeof (archer_doc), archer_doc_free);  archer_wi2pv = bow_wi2pv_new (0, "pv");  bow_verbosify (bow_progress, "Indexing files:              ");  bow_map_filenames_from_dir (archer_index_filename, NULL,			      archer_arg_state.dirname, "");  bow_verbosify (bow_progress, "\n");  archer_archive ();  /* To close the FP for FILENAME_PV */  bow_wi2pv_free (archer_wi2pv);}/* Index each line of ARCHER_ARG_STATE.DIRNAME as if it were a   separate file, named after the line number. */voidarcher_index_lines (){  static const int max_line_length = 2048;  char buf[max_line_length];  FILE *fp;  archer_doc doc;  bow_lex *lex;  char word[BOW_MAX_WORD_LENGTH];  int wi, di, pi;  char filename[1024];  archer_docs = bow_sarray_new (0, sizeof (archer_doc), archer_doc_free);  archer_wi2pv = bow_wi2pv_new (0, "pv");  fp = bow_fopen (archer_arg_state.dirname, "r");  bow_verbosify (bow_progress, "Indexing lines:              ");  while (fgets (buf, max_line_length, fp))    {      lex = bow_default_lexer->open_str (bow_default_lexer, buf);      if (lex == NULL)	continue;      di = archer_docs->array->length;      sprintf (filename, "%08d", di);      pi = 0;      while (bow_default_lexer->get_word (bow_default_lexer,					  lex, word, BOW_MAX_WORD_LENGTH))	{	  wi = bow_word2int_add_occurrence (word);	  if (wi < 0)	    continue;	  bow_wi2pv_add_wi_di_pi (archer_wi2pv, wi, di, pi);	}      bow_default_lexer->close (bow_default_lexer, lex);      doc.tag = bow_doc_train;      doc.word_count = pi;      doc.di = di;      bow_sarray_add_entry_with_keystr (archer_docs, &doc, filename);      pi++;    }  fclose (fp);  bow_verbosify (bow_progress, "\n");  archer_archive ();  /* To close the FP for FILENAME_PV */  bow_wi2pv_free (archer_wi2pv);}/* Set the special flag in FILENAME's doc structure indicating that   this document has been removed from the index.  Return zero on   success, non-zero on failure. */intarcher_delete_filename (const char *filename){  archer_doc *doc;  doc = bow_sarray_entry_at_keystr (archer_docs, filename);  if (doc)    {      doc->word_count = -(doc->word_count);      return 0;    }  return 1;}bow_wa *archer_query_hits_matching_wi (int wi, int *occurrence_count){  int count = 0;  int di, pi;  bow_wa *wa;  if (wi >= archer_wi2pv->entry_count && archer_wi2pv->entry[wi].word_count <= 0)    return NULL;  wa = bow_wa_new (0);  bow_pv_rewind (&(archer_wi2pv->entry[wi]), archer_wi2pv->fp);  bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di, &pi);  while (di != -1)    {      bow_wa_add_to_end (wa, di, 1);      count++;      bow_wi2pv_wi_next_di_pi (archer_wi2pv, wi, &di, &pi);    }  *occurrence_count = count;  return wa;}/* Temporary constant.  Fix this soon! */#define MAX_QUERY_WORDS 50bow_wa *archer_query_hits_matching_sequence (const char *query_string,				     const char *suffix_string){  int query[MAX_QUERY_WORDS];		/* WI's in the query */  int di[MAX_QUERY_WORDS];  int pi[MAX_QUERY_WORDS];  int query_len;  int max_di, max_pi;  int wi, i;  bow_lex *lex;  char word[BOW_MAX_WORD_LENGTH];
12 3 下一页
💿 文件大小 11 K
👤 上传用户 c_word
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -