📄 wi2dvf.c
字号:
/* Word-index to document-vector-file */

/* Copyright (C) 1997, 1998, 1999, 2000 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <netinet/in.h>		/* for machine-independent byte-order */
#include <assert.h>
#include <string.h>

/* Mark the bow_dvf DVF as uninitialized: SEEK_START gets the reserved
   token -1 and there is no in-core "document vector".  The argument is
   parenthesized so that any lvalue expression can be passed safely. */
#define INIT_BOW_DVF(DVF) { (DVF).seek_start = -1; (DVF).dv = NULL; }

/* Number of entries allocated by bow_wi2dvf_new() when the caller does
   not ask for a specific capacity. */
unsigned int bow_wi2dvf_default_capacity = 1024;

/* Allocate and return a new, empty WI2DVF map with room for CAPACITY
   "word indices".  A CAPACITY of zero (or any non-positive value)
   selects BOW_WI2DVF_DEFAULT_CAPACITY.  Every entry starts out
   uninitialized, NUM_WORDS is zero and no file pointer is attached. */
bow_wi2dvf *
bow_wi2dvf_new (int capacity)
{
  bow_wi2dvf *ret;
  int i;

  /* A negative capacity would wrap around in the size computation
     below, so treat it like "use the default". */
  if (capacity <= 0)
    capacity = (int) bow_wi2dvf_default_capacity;
  ret = bow_malloc (sizeof (bow_wi2dvf) + (sizeof (bow_dvf) * capacity));
  ret->size = capacity;
  ret->num_words = 0;
  ret->fp = NULL;
  for (i = 0; i < capacity; i++)
    INIT_BOW_DVF (ret->entry[i]);
  return ret;
}

/* Make sure the map *WI2DVF has room for at least MIN_SIZE entries.
   When it must grow, the new size is the larger of MIN_SIZE and twice
   the old size; the map is realloc'ed (so *WI2DVF may change) and the
   newly added entries are marked uninitialized. */
static void
wi2dvf_grow_to (bow_wi2dvf **wi2dvf, int min_size)
{
  int old_size = (*wi2dvf)->size;
  int new_size;
  int wi;

  if (min_size <= old_size)
    return;
  /* There are so many unique words, we need to grow the array that
     maps WI's to DVF's. */
  new_size = MAX (min_size, old_size * 2);
  *wi2dvf = bow_realloc (*wi2dvf, (sizeof (bow_wi2dvf)
				   + (sizeof (bow_dvf) * new_size)));
  (*wi2dvf)->size = new_size;
  /* Initialize the new part of the realloc'ed space. */
  for (wi = old_size; wi < new_size; wi++)
    INIT_BOW_DVF ((*wi2dvf)->entry[wi]);
}

/* Return the address of the "document vector" pointer for "word index"
   WI in WI2DVF, creating an empty "document vector" first if there is
   none yet.  WI must lie inside the map. */
static bow_dv **
wi2dvf_ensure_dv (bow_wi2dvf *wi2dvf, int wi)
{
  assert (wi >= 0 && wi < wi2dvf->size);
  if (wi2dvf->entry[wi].dv == NULL)
    {
      /* There is not yet a "document vector" for "word index" WI,
	 so create one. */
      wi2dvf->entry[wi].dv = bow_dv_new (0);
      /* This 2 is a flag to the hide/unhide code that this DV exists. */
      wi2dvf->entry[wi].seek_start = 2;
      (wi2dvf->num_words)++;
    }
  return &(wi2dvf->entry[wi].dv);
}

/* xxx We should think about a scheme that doesn't require keeping all
   the "document vectors" in core at the same time.  We could write
   them to disk, read them back in when we needed to add to them, then
   write them back out again.  We would need a nice caching scheme, as
   well as nice way to deal with "document vectors" that grow. */

/* Add a "word vector" WV, associated with "document index" DI, to the
   map WI2DVF.  The map is first grown, if necessary, to hold as many
   entries as bow_num_words() reports; *WI2DVF may therefore change. */
void
bow_wi2dvf_add_di_wv (bow_wi2dvf **wi2dvf, int di, bow_wv *wv)
{
  int i;

  wi2dvf_grow_to (wi2dvf, bow_num_words ());

  /* Run down the "word vector", depositing each entry in the WI2DVF. */
  for (i = 0; i < wv->num_entries; i++)
    {
      /* Add the "document index" DI and the count associated with
	 word index WI to the WI'th "document vector". */
      bow_dv_add_di_count_weight (wi2dvf_ensure_dv (*wi2dvf,
						    wv->entry[i].wi),
				  di,
				  wv->entry[i].count,
				  wv->entry[i].weight);
    }
}

/* Pull every remaining word out of the open lexer state LEX and record
   one occurrence (count 1, weight 1) of it for "document index" DI in
   WI2DVF.  Words for which bow_word2int_add_occurrence() returns a
   negative index are skipped.  Return the number of words recorded.
   The caller remains responsible for closing LEX. */
static int
wi2dvf_add_lex_words (bow_wi2dvf **wi2dvf, int di, bow_lex *lex)
{
  char word[BOW_MAX_WORD_LENGTH]; /* buffer for reading and stemming words */
  int wi;			  /* a word index */
  int num_words = 0;

  /* Loop once for each lexical token in this document. */
  while (bow_default_lexer->get_word (bow_default_lexer, lex,
				      word, BOW_MAX_WORD_LENGTH))
    {
      /* Find out the word's "index". */
      wi = bow_word2int_add_occurrence (word);
      if (wi < 0)
	continue;
      /* Increment our stats about this word/document pair. */
      bow_wi2dvf_add_wi_di_count_weight (wi2dvf, wi, di, 1, 1);
      /* Increment our count of the number of words in this document. */
      num_words++;
    }
  return num_words;
}

/* Read all the words from character array DATA, and add them to
   WI2DVF, associated with document index DI.  Return the number of
   words added.  FILENAME is accepted for symmetry with
   bow_wi2dvf_add_di_text_fp() but is not used here. */
int
bow_wi2dvf_add_di_text_str (bow_wi2dvf **wi2dvf, int di, char *data,
			    const char *filename)
{
  bow_lex *lex;
  int num_words;

  (void) filename;
  lex = bow_default_lexer->open_str (bow_default_lexer, data);
  assert (lex);
  num_words = wi2dvf_add_lex_words (wi2dvf, di, lex);
  bow_default_lexer->close (bow_default_lexer, lex);
  return num_words;
}

/* Read all the words from file pointer FP, and add them to WI2DVF,
   associated with document index DI.  The lexer is re-opened on FP
   until it reports no further document.  Return the total number of
   words added. */
int
bow_wi2dvf_add_di_text_fp (bow_wi2dvf **wi2dvf, int di, FILE *fp,
			   const char *filename)
{
  bow_lex *lex;
  int num_words = 0;

  /* Loop once for each document in this file. */
  while ((lex = bow_default_lexer->open_text_fp (bow_default_lexer,
						 fp, filename)))
    {
      num_words += wi2dvf_add_lex_words (wi2dvf, di, lex);
      bow_default_lexer->close (bow_default_lexer, lex);
    }
  return num_words;
}

/* In the map WI2DVF, increase by COUNT and WEIGHT our record of the
   number times and weight that the document with "document index" DI
   contains the word with "word index" WI.  The map grows as needed to
   hold WI, so *WI2DVF may change. */
void
bow_wi2dvf_add_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi, int di,
				   int count, float weight)
{
  wi2dvf_grow_to (wi2dvf, wi + 1);
  /* Add the "document index" DI and the count associated with word
     index WI to the WI'th "document vector". */
  bow_dv_add_di_count_weight (wi2dvf_ensure_dv (*wi2dvf, wi),
			      di, count, weight);
}

/* In the map WI2DVF, set to COUNT and WEIGHT our record of the number
   times and weight that the document with "document index" DI
   contains the word with "word index" WI.  The map grows as needed to
   hold WI, so *WI2DVF may change. */
void
bow_wi2dvf_set_wi_di_count_weight (bow_wi2dvf **wi2dvf, int wi, int di,
				   int count, float weight)
{
  wi2dvf_grow_to (wi2dvf, wi + 1);
  /* Store the "document index" DI and the count associated with word
     index WI in the WI'th "document vector". */
  bow_dv_set_di_count_weight (wi2dvf_ensure_dv (*wi2dvf, wi),
			      di, count, weight);
}

/* Return a pointer to the BOW_DE for a particular word/document pair,
   or return NULL if there is no entry for that pair. */
bow_de *
bow_wi2dvf_entry_at_wi_di (bow_wi2dvf *wi2dvf, int wi, int di)
{
  bow_dv *dv = bow_wi2dvf_dv (wi2dvf, wi);

  if (!dv)
    return NULL;
  return bow_dv_entry_at_di (dv, di);
}

/* Remove the word with index WI from the vocabulary of the map WI2DVF.
   The function reports itself as broken through bow_error() before
   doing anything else. */
void
bow_wi2dvf_remove_wi (bow_wi2dvf *wi2dvf, int wi)
{
  bow_dv *dv;

  assert (wi >= 0 && wi < wi2dvf->size);
  bow_error ("Don't call this function.  It's broken.");
  /* xxx This could be more efficient.  Avoid reading it in, just to
     free it */
  dv = bow_wi2dvf_dv (wi2dvf, wi);
  if (dv)
    {
      bow_dv_free (wi2dvf->entry[wi].dv);
      (wi2dvf->num_words)--;
    }
  INIT_BOW_DVF (wi2dvf->entry[wi]);
}

#define FREE_WHEN_HIDING_WI 0

/* Temporarily hide the word with index WI from the vocabulary of the
   map WI2DVF.  The function BOW_WI2DVF_DV() is meant to no longer see
   the entry for this WI, but the entry's SEEK_START is only negated,
   so it can be restored later with bow_wi2dvf_unhide_wi().  NUM_WORDS
   is decremented when a visible entry becomes hidden. */
void
bow_wi2dvf_hide_wi (bow_wi2dvf *wi2dvf, int wi)
{
  assert (wi >= 0 && wi < wi2dvf->size);
#if FREE_WHEN_HIDING_WI
  if (wi2dvf->entry[wi].dv)
    {
      bow_dv_free (wi2dvf->entry[wi].dv);
      /* (wi2dvf->num_words)--; */
    }
  wi2dvf->entry[wi].dv = NULL;
#endif
  /* The token -1 is reserved to mean that the DV is uninitialized. */
  assert (!(wi2dvf->entry[wi].dv && wi2dvf->entry[wi].seek_start == -1));
  /* Make the SEEK_START negative so we won't use it in normal
     situations, but will be able to remember it and get it back when
     we need it. */
  if (wi2dvf->entry[wi].seek_start > 0)
    {
      wi2dvf->entry[wi].seek_start = - (wi2dvf->entry[wi].seek_start);
      (wi2dvf->num_words)--;
    }
}

/* Hide all the words that exist, i.e. every word index for which
   bow_wi2dvf_dv() returns a "document vector". */
void
bow_wi2dvf_hide_all_wi (bow_wi2dvf *wi2dvf)
{
  int wi;

  for (wi = 0; wi < wi2dvf->size; wi++)
    {
      bow_dv *dv = bow_wi2dvf_dv (wi2dvf, wi);
      if (dv)
	bow_wi2dvf_hide_wi (wi2dvf, wi);
    }
}

/* Unhide a specific word index.  The entry must currently be hidden
   (its SEEK_START must be less than -1); its SEEK_START is made
   positive again and NUM_WORDS is incremented. */
void
bow_wi2dvf_unhide_wi (bow_wi2dvf *wi2dvf, int wi)
{
  assert (wi >= 0 && wi < wi2dvf->size);
  assert (wi2dvf->entry[wi].seek_start < -1);
  wi2dvf->entry[wi].seek_start = - (wi2dvf->entry[wi].seek_start);
  (wi2dvf->num_words)++;
}

/* Hide all words occurring in only COUNT or fewer number of documents.
   A COUNT of zero hides nothing.  Return the number of words hidden. */
int
bow_wi2dvf_hide_words_by_doc_count (bow_wi2dvf *wi2dvf, int count)
{
  int wi;
  bow_dv *dv;
  int num_hides = 0;

  if (count == 0)
    return 0;

  for (wi = 0; wi < wi2dvf->size; wi++)
    {
      dv = bow_wi2dvf_dv (wi2dvf, wi);
      if (dv && dv->length <= count)
	{
	  bow_wi2dvf_hide_wi (wi2dvf, wi);
	  num_hides++;
	}
    }
  return num_hides;
}

/* Hide all words occurring only COUNT or fewer times, as reported by
   bow_words_occurrences_for_wi().  A COUNT of zero hides nothing.
   Return the number of words hidden. */
int
bow_wi2dvf_hide_words_by_occur_count (bow_wi2dvf *wi2dvf, int count)
{
  int wi;
  bow_dv *dv;
  int num_hides = 0;

  if (count == 0)
    return 0;

  for (wi = 0; wi < wi2dvf->size; wi++)
    {
      dv = bow_wi2dvf_dv (wi2dvf, wi);
      if (dv && bow_words_occurrences_for_wi (wi) <= count)
	{
	  bow_wi2dvf_hide_wi (wi2dvf, wi);
	  num_hides++;
	}
    }
  return num_hides;
}

/* Hide all words where the prefix of the word matches the given
   PREFIX.  Return the number of words hidden. */
int
bow_wi2dvf_hide_words_with_prefix (bow_wi2dvf *wi2dvf, char *prefix)
{
  int wi;
  int num_hides = 0;
  size_t prefix_len = strlen (prefix);
  bow_dv *dv;

  /* Compare only the first PREFIX_LEN characters of each word that
     currently has a visible "document vector". */
  for (wi = 0; wi < wi2dvf->size; wi++)
    {
      dv = bow_wi2dvf_dv (wi2dvf, wi);
      if (dv && 0 == strncmp (prefix, bow_int2word (wi), prefix_len))
	{
	  bow_wi2dvf_hide_wi (wi2dvf, wi);
	  num_hides++;
	}
    }
  return num_hides;
  /* NOTE(review): the visible source ends here; this function's
     closing brace lies outside the excerpt and is deliberately not
     supplied. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -