📄 wi2dvf.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Word-index to document-vector-file */

/* Copyright (C) 1997, 1998, 1999, 2000 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <netinet/in.h>         /* for machine-independent byte-order */
#include <assert.h>
#include <string.h>

/* Put the entry DVF into the "no document vector yet" state: a
   SEEK_START of -1 and a NULL DV.  The expansion is a brace block, so
   use it as a statement of its own. */
#define INIT_BOW_DVF(DVF) { (DVF).seek_start = -1; (DVF).dv = NULL; }

/* Number of entries allocated by BOW_WI2DVF_NEW() when it is called
   with a CAPACITY of zero. */
unsigned int bow_wi2dvf_default_capacity = 1024;

/* Allocate and return a new map with room for CAPACITY "word
   indices".  If CAPACITY is zero, BOW_WI2DVF_DEFAULT_CAPACITY is used
   instead.  The returned map has NUM_WORDS of zero, no file pointer,
   and every entry initialized with INIT_BOW_DVF(). */
bow_wi2dvf *
bow_wi2dvf_new (int capacity)
{
  bow_wi2dvf *ret;
  int i;

  if (capacity == 0)
    capacity = bow_wi2dvf_default_capacity;
  /* One allocation holds the header followed by CAPACITY entries. */
  ret = bow_malloc (sizeof (bow_wi2dvf) + (sizeof (bow_dvf) * capacity));
  ret->size = capacity;
  ret->num_words = 0;
  ret->fp = NULL;
  for (i = 0; i < capacity; i++)
    INIT_BOW_DVF (ret->entry[i]);
  return ret;
}

/* xxx We should think about a scheme that doesn't require keeping all
   the "document vectors" in core at the same time.  We could write
   them to disk, read them back in when we needed to add to them, then
   write them back out again.  We would need a nice caching scheme, as
   well as a nice way to deal with "document vectors" that grow. */

/* Add a "word vector" WV, associated with "document index" DI, to
   the map WI2DVF.
*/ voidbow_wi2dvf_add_di_wv (bow_wi2dvf **wi2dvf, int di, bow_wv *wv){  int i, wi;  int max_wi = bow_num_words ();  if (max_wi > (*wi2dvf)->size)    {      /* There are so many unique words, we need to grow the array	 that maps WI's to DVF's. */      int wi = (*wi2dvf)->size;	        /* a "word index" */      (*wi2dvf)->size = MAX (max_wi, (*wi2dvf)->size * 2);      (*wi2dvf) = bow_realloc (*wi2dvf, 			       (sizeof (bow_wi2dvf)				+ (sizeof (bow_dvf) * (*wi2dvf)->size)));      /* Initialize the new part of the realloc'ed space. */      for ( ; wi < (*wi2dvf)->size; wi++)	INIT_BOW_DVF((*wi2dvf)->entry[wi]);    }  /* Run down the "word vector", depositing each entry in the WI2DVF. */  for (i = 0; i < wv->num_entries; i++)    {      wi = wv->entry[i].wi;      assert ((*wi2dvf)->size > wi);      if ((*wi2dvf)->entry[wi].dv == NULL)	{	  /* There is not yet a "document vector" for "word index" WI,	     so create one. */	  (*wi2dvf)->entry[wi].dv = bow_dv_new (0);	  /* This 2 is a flag to the hide/unhide code that this DV exists. */	  (*wi2dvf)->entry[wi].seek_start = 2;	  ((*wi2dvf)->num_words)++;	}      /* Add the "document index" DI and the count associated with         word index WI to the WI'th "document vector". */      bow_dv_add_di_count_weight (&((*wi2dvf)->entry[wi].dv), di,				  wv->entry[i].count, 				  wv->entry[i].weight);    }}/* Read all the words from character array DATA, and add them to WI2DVF,   associated with document index DI. */intbow_wi2dvf_add_di_text_str (bow_wi2dvf **wi2dvf, int di, char *data,			    const char *filename){  char word[BOW_MAX_WORD_LENGTH]; /* buffer for reading and stemming words */  int wi;			/* a word index */  bow_lex *lex;  int num_words = 0;  lex = bow_default_lexer->open_str (bow_default_lexer, data);  assert (lex);  /* Loop once for each lexical token in this document. */  while (bow_default_lexer->get_word (bow_default_lexer,				      lex, word, BOW_MAX_WORD_LENGTH))    {      /* Find out the word's "index". 
*/      wi = bow_word2int_add_occurrence (word);      if (wi < 0)	continue;      /* Increment our stats about this word/document pair. */      bow_wi2dvf_add_wi_di_count_weight (wi2dvf, wi, di, 1, 1);      /* Increment our count of the number of words in this document. */      num_words++;    }  bow_default_lexer->close (bow_default_lexer, lex);  return num_words;}/* Read all the words from file pointer FP, and add them to WI2DVF,   associated with document index DI. */intbow_wi2dvf_add_di_text_fp (bow_wi2dvf **wi2dvf, int di, FILE *fp,			   const char *filename){  char word[BOW_MAX_WORD_LENGTH]; /* buffer for reading and stemming words */  int wi;			/* a word index */  bow_lex *lex;  int num_words = 0;  /* Loop once for each document in this file. */  while ((lex = bow_default_lexer->open_text_fp (bow_default_lexer, fp,						 filename)))    {      /* Loop once for each lexical token in this document. */      while (bow_default_lexer->get_word (bow_default_lexer,					  lex, word, BOW_MAX_WORD_LENGTH))	{	  /* Find out the word's "index". */	  wi = bow_word2int_add_occurrence (word);	  if (wi < 0)	    continue;	  /* Increment our stats about this word/document pair. */	  bow_wi2dvf_add_wi_di_count_weight (wi2dvf, wi, di, 1, 1);	  /* Increment our count of the number of words in this document. */	  num_words++;	}      bow_default_lexer->close (bow_default_lexer, lex);    }  return num_words;}/* In the map WI2DVF, increase by COUNT and WEIGHT our record of the   number times and weight that the document with "document index" DI   contains the word with "word index" WI. */voidbow_wi2dvf_add_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi,				   int di, int count, float weight){  if (wi >= (*wi2dvf)->size)    {      /* There are so many unique words, we need to grow the array	 that maps WI's to DVF's. 
*/      int old_size = (*wi2dvf)->size; /* a "word vector" */      (*wi2dvf)->size = MAX (wi+1, (*wi2dvf)->size * 2);      (*wi2dvf) = bow_realloc (*wi2dvf, 			       (sizeof (bow_wi2dvf)				+ (sizeof (bow_dvf) * (*wi2dvf)->size)));      /* Initialize the new part of the realloc'ed space. */      for ( ; old_size < (*wi2dvf)->size; old_size++)	INIT_BOW_DVF((*wi2dvf)->entry[old_size]);    }   /* Increment the stats for the WI/DI pair. */  if ((*wi2dvf)->entry[wi].dv == NULL)    {      /* There is not yet a "document vector" for "word index" WI,	 so create one. */      (*wi2dvf)->entry[wi].dv = bow_dv_new (0);      /* This 2 is a flag to the hide/unhide code that this DV exists. */      (*wi2dvf)->entry[wi].seek_start = 2;      ((*wi2dvf)->num_words)++;    }  /* Add the "document index" DI and the count associated with     word index WI to the WI'th "document vector". */  bow_dv_add_di_count_weight (&((*wi2dvf)->entry[wi].dv), di, count, weight);}/* In the map WI2DVF, set to COUNT and WEIGHT our record of the   number times and weight that the document with "document index" DI   contains the word with "word index" WI. */voidbow_wi2dvf_set_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi,				   int di, int count, float weight){  if (wi >= (*wi2dvf)->size)    {      /* There are so many unique words, we need to grow the array	 that maps WI's to DVF's. */      int old_size = (*wi2dvf)->size; /* a "word vector" */      (*wi2dvf)->size = MAX (wi+1, (*wi2dvf)->size * 2);      (*wi2dvf) = bow_realloc (*wi2dvf, 			       (sizeof (bow_wi2dvf)				+ (sizeof (bow_dvf) * (*wi2dvf)->size)));      /* Initialize the new part of the realloc'ed space. */      for ( ; old_size < (*wi2dvf)->size; old_size++)	INIT_BOW_DVF((*wi2dvf)->entry[old_size]);    }   /* Increment the stats for the WI/DI pair. */  if ((*wi2dvf)->entry[wi].dv == NULL)    {      /* There is not yet a "document vector" for "word index" WI,	 so create one. 
*/      (*wi2dvf)->entry[wi].dv = bow_dv_new (0);      /* This 2 is a flag to the hide/unhide code that this DV exists. */      (*wi2dvf)->entry[wi].seek_start = 2;      ((*wi2dvf)->num_words)++;    }  /* Add the "document index" DI and the count associated with     word index WI to the WI'th "document vector". */  bow_dv_set_di_count_weight (&((*wi2dvf)->entry[wi].dv), di, count, weight);}/* Return a pointer to the BOW_DE for a particular word/document pair,    or return NULL if there is no entry for that pair. */bow_de *bow_wi2dvf_entry_at_wi_di (bow_wi2dvf *wi2dvf, int wi, int di){  bow_dv *dv = bow_wi2dvf_dv (wi2dvf, wi);  if (!dv)    return NULL;  return bow_dv_entry_at_di (dv, di);}/* Remove the word with index WI from the vocabulary of the map WI2DVF */voidbow_wi2dvf_remove_wi (bow_wi2dvf *wi2dvf, int wi){  bow_dv *dv;  assert (wi < wi2dvf->size);  bow_error ("Don't call this function.  It's broken.");  /* xxx This could be more efficient.  Avoid reading it in, just to free it */  dv = bow_wi2dvf_dv (wi2dvf, wi);  if (dv)    {      bow_dv_free (wi2dvf->entry[wi].dv);      (wi2dvf->num_words)--;    }  INIT_BOW_DVF (wi2dvf->entry[wi]);}#define FREE_WHEN_HIDING_WI 0/* Temporarily hide the word with index WI from the vocabulary of the   map WI2DVF. The function BOW_WI2DVF_DV() will no longer see the entry   for this WI, but */voidbow_wi2dvf_hide_wi (bow_wi2dvf *wi2dvf, int wi){  assert (wi < wi2dvf->size);#if FREE_WHEN_HIDING_WI  if (wi2dvf->entry[wi].dv)    {      bow_dv_free (wi2dvf->entry[wi].dv);      /* (wi2dvf->num_words)--; */    }  wi2dvf->entry[wi].dv = NULL;#endif  /* The token -1 is reserved to mean that the DV is uninitialized. */  assert (!(wi2dvf->entry[wi].dv && wi2dvf->entry[wi].seek_start == -1));  /* Make the SEEK_START negative so we won't use it in normal situations,     but will be able to remember it and get it back when we need it. 
*/  if (wi2dvf->entry[wi].seek_start > 0)    {      wi2dvf->entry[wi].seek_start = - (wi2dvf->entry[wi].seek_start);      (wi2dvf->num_words)--;    }}/* hide all the words that exist */voidbow_wi2dvf_hide_all_wi (bow_wi2dvf *wi2dvf){  int wi;  for (wi = 0; wi < wi2dvf->size; wi++)    {      bow_dv *dv = bow_wi2dvf_dv (wi2dvf, wi);      if (dv)	bow_wi2dvf_hide_wi (wi2dvf, wi);    }}/* unhide a specific word index */voidbow_wi2dvf_unhide_wi (bow_wi2dvf *wi2dvf, int wi){  assert (wi < wi2dvf->size);  assert (wi2dvf->entry[wi].seek_start < -1);  wi2dvf->entry[wi].seek_start = - (wi2dvf->entry[wi].seek_start);  (wi2dvf->num_words)++;}/* Hide all words occuring in only COUNT or fewer number of documents.   Return the number of words hidden. */intbow_wi2dvf_hide_words_by_doc_count (bow_wi2dvf *wi2dvf, int count){  int wi;  bow_dv *dv;  int num_hides = 0;  if (count == 0)    return 0;  for (wi = 0; wi < wi2dvf->size; wi++)    {      dv = bow_wi2dvf_dv (wi2dvf, wi);      if (dv && dv->length <= count)	{	  bow_wi2dvf_hide_wi (wi2dvf, wi);	  num_hides++;	}    }  return num_hides;}/* Hide all words occuring in only COUNT or fewer times.   Return the number of words hidden. 
*/
int
bow_wi2dvf_hide_words_by_occur_count (bow_wi2dvf *wi2dvf, int count)
{
  int wi;
  bow_dv *dv;
  int num_hides = 0;

  if (count == 0)
    return 0;

  for (wi = 0; wi < wi2dvf->size; wi++)
    {
      dv = bow_wi2dvf_dv (wi2dvf, wi);
      /* Only words that currently have a visible "document vector"
         are candidates; the occurrence count comes from
         BOW_WORDS_OCCURRENCES_FOR_WI(). */
      if (dv && bow_words_occurrences_for_wi (wi) <= count)
        {
          bow_wi2dvf_hide_wi (wi2dvf, wi);
          num_hides++;
        }
    }
  return num_hides;
}

/* Hide all words where the prefix of the word matches the given
   prefix.  Return the number of words hidden. */
int
bow_wi2dvf_hide_words_with_prefix (bow_wi2dvf *wi2dvf, char *prefix)
{
  int wi;
  int num_hides = 0;
  int prefix_len = strlen (prefix);
  bow_dv *dv;

  /* Compare the first PREFIX_LEN characters of each visible word
     against PREFIX, and hide the ones that match. */
  for (wi = 0; wi < wi2dvf->size; wi++)
    {
      dv = bow_wi2dvf_dv (wi2dvf, wi);
      if (dv && 0 == strncmp (prefix, bow_int2word (wi), prefix_len))
        {
          bow_wi2dvf_hide_wi (wi2dvf, wi);
          num_hides++;
        }
    }
  return num_hides;
12 下一页
💿 文件大小 7 K
👤 上传用户 jxyw163
📂 所属分类 Linux/Unix编程
📄 代码行数 779 行
💻 语言类型 C语言
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -