📄 barrel.c
字号:
bow_verbosify (bow_progress, "Removing words by information gain: "); /* Hide words from the BARREL. */ for (i = num_words_to_keep; i < wi2ig_size; i++) { /* Hide the WI from BARREL. */ bow_wi2dvf_hide_wi (barrel->wi2dvf, wiig_list[i].wi); if (i % 100 == 0) bow_verbosify (bow_progress, "\b\b\b\b\b\b\b\b\b%9d", wi2ig_size - i); } /* Now that we have reduce vocabulary size, don't add more words to the vocabulary. For example, when doing --test-files, don't include in the QUERY_WV words that aren't in the current reduced vocabulary, the total number of words in the QUERY_WV will be too large! */ bow_word2int_do_not_add = 1; bow_verbosify (bow_progress, "\n");}/* Set the BARREL->WI2DVF->ENTRY[WI].IDF to the sum of the COUNTS for the given WI among those documents in the training set. */voidbow_barrel_set_idf_to_count_in_train (bow_barrel *barrel){ bow_wi2dvf *wi2dvf = barrel->wi2dvf; int wi, nwi, dvi; bow_dv *dv; bow_cdoc *cdoc; nwi = MIN (wi2dvf->size, bow_num_words()); for (wi = 0; wi < nwi; wi++) { dv = bow_wi2dvf_dv (wi2dvf, wi); if (!dv) continue; dv->idf = 0; for (dvi = 0; dvi < dv->length; dvi++) { cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di); if (cdoc->type == bow_doc_train) dv->idf += dv->entry[dvi].count; } }}/* Return the number of unique words among those documents with TYPE tag (train, test, unlabeled, etc) equal to TYPE. */int bow_barrel_num_unique_words_of_type (bow_barrel *doc_barrel, int type){ int wi, max_wi, dvi; int num_unique = 0; bow_dv *dv; bow_cdoc *cdoc; max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words()); for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi); for (dvi = 0; dv && dvi < dv->length; dvi++) { cdoc = bow_array_entry_at_index (doc_barrel->cdocs, dv->entry[dvi].di); if (cdoc->type == type) { num_unique++; break; } } } return num_unique;}int_bow_barrel_cdoc_write (bow_cdoc *cdoc, FILE *fp){ int ret; ret = bow_fwrite_int (cdoc->type, fp); ret += bow_fwrite_float (cdoc->normalizer, fp); ret += bow_fwrite_float (cdoc->prior, fp); ret += bow_fwrite_int (cdoc->word_count, fp); ret += bow_fwrite_string (cdoc->filename, fp); if (bow_file_format_version < 5) ret += bow_fwrite_short (cdoc->class, fp); else ret += bow_fwrite_int (cdoc->class, fp); return ret;}int_bow_barrel_cdoc_read (bow_cdoc *cdoc, FILE *fp){ int ret; int type; ret = bow_fread_int (&type, fp); cdoc->type = type; cdoc->class_probs = NULL; ret += bow_fread_float (&(cdoc->normalizer), fp); ret += bow_fread_float (&(cdoc->prior), fp); ret += bow_fread_int (&(cdoc->word_count), fp); ret += bow_fread_string ((char**)&(cdoc->filename), fp); if (bow_file_format_version < 5) { short s; ret += bow_fread_short (&s, fp); cdoc->class = s; } else ret += bow_fread_int (&(cdoc->class), fp); return ret;}/* Create and return a `barrel' by reading data from the file-pointer FP. */bow_barrel *bow_barrel_new_from_data_fp (FILE *fp){ bow_barrel *ret; int version_tag; int method_id; version_tag = fgetc (fp); /* xxx assert (version_tag >= 0); */ if (version_tag <= 0) return NULL; if (_bow_barrel_version != -1 && _bow_barrel_version != version_tag) bow_error ("Trying to read bow_barrel's with different version numbers"); _bow_barrel_version = version_tag; ret = bow_malloc (sizeof (bow_barrel)); if (_bow_barrel_version < 3) { bow_fread_int (&method_id, fp); bow_error ("Can no longer read barrels earlier than version 3"); /* ret->method = _old_bow_methods[method_id]; */ } else { char *method_string; bow_fread_string (&method_string, fp); ret->method = (rainbow_method*) bow_method_at_name (method_string); bow_free (method_string); } ret->cdocs = bow_array_new_from_data_fp ((int(*)(void*,FILE*))_bow_barrel_cdoc_read, _bow_barrel_cdoc_free, fp); assert (ret->cdocs->length); if (bow_file_format_version > 5) ret->classnames = bow_int4str_new_from_fp (fp); else ret->classnames = NULL; ret->wi2dvf = bow_wi2dvf_new_from_data_fp (fp); assert (ret->wi2dvf->num_words); return ret;}/* Decide whether to keep this or not. Currently it it used by rainbow-h.c. */bow_barrel *bow_barrel_new_from_data_file (const char *filename){ FILE *fp; bow_barrel *ret_barrel; int wi; bow_dv *dv; int dv_count = 0; fp = bow_fopen (filename, "rb"); ret_barrel = bow_barrel_new_from_data_fp (fp); if (ret_barrel) { /* Read in all the dvf's so that we can close the FP. */ for (wi = 0; wi < ret_barrel->wi2dvf->size; wi++) { dv = bow_wi2dvf_dv (ret_barrel->wi2dvf, wi); if (dv) dv_count++; } ret_barrel->wi2dvf->fp = NULL; assert (dv_count); } fclose (fp); return ret_barrel;}/* Read a line from FP until a newline, and return a newly malloc'ed buffer containing the line read. */char *getline (FILE *fp){ int bufsize = 1024; int buflen = 0; char *buf = bow_malloc (bufsize); int byte; while ((byte = fgetc (fp)) != EOF && byte != '\n') { buf[buflen++] = byte; if (buflen >= bufsize) { bufsize *= 2; buf = bow_realloc (buf, bufsize); } } if (byte == EOF) { bow_free (buf); return NULL; } buf[buflen] = '\0'; return buf;}/* Create a new barrel and fill it from contents in --print-barrel=FORMAT read in from FILENAME. */bow_barrel *bow_barrel_new_from_printed_barrel_file (const char *filename, const char *format){ FILE *fp; enum { word_index, word_string, word_string_and_index, word_empty } word_format = word_string_and_index; enum { binary_count, integer_count } word_count_format = integer_count; int sparse_format = 1; int di; bow_cdoc cdoc; int wi; float count; int int_count; char datafilename[BOW_MAX_WORD_LENGTH]; char classname[BOW_MAX_WORD_LENGTH]; int word_count_column; int num_chars_read; char *buf, *line; bow_barrel *ret; /* Returns 1 on success, 0 on failure. */ int read_word_count (char **string, int *wi, float *count) { char word[BOW_MAX_WORD_LENGTH]; int ret = 0; int num_chars_read; switch (word_format) { case word_index: if (sscanf (*string, "%d %f%n", wi, count, &num_chars_read) == 2) ret = 1; break; case word_string: if (sscanf (*string, "%s %f%n", word, count, &num_chars_read) == 2) { ret = 1; *wi = bow_word2int (word); } break; case word_string_and_index: if (sscanf (*string,"%s %d %f%n",word,wi,count,&num_chars_read) == 3) ret = 1; break; case word_empty: if (sscanf (*string, "%f%n", count, &num_chars_read) == 1) { ret = 1; *wi = word_count_column; } break; } if (word_count_format == binary_count) *count = (*count > 0); if (ret) *string += num_chars_read; return ret; } if (format && strchr (format, 'a')) sparse_format = 0; if (format && strchr (format, 'b')) word_count_format = binary_count; if (format && strchr (format, 'n')) word_format = word_index; else if (format && strchr (format, 'w')) word_format = word_string; else if (format && strchr (format, 'e')) word_format = word_empty; ret = bow_barrel_new (0, 0, sizeof (bow_cdoc), _bow_barrel_cdoc_free); ret->classnames = bow_int4str_new (0); fp = bow_fopen (filename, "r"); /* Each time through the loop reads one line. */ while ((buf = getline (fp))) { line = buf; if (sscanf (line, "%s%n", datafilename, &num_chars_read) != 1) bow_error ("Didn't find expected filename"); line += num_chars_read; if (sscanf (line, "%s%n", classname, &num_chars_read) != 1) bow_error ("Didn't find expected classname"); line += num_chars_read; cdoc.filename = strdup (datafilename); assert (cdoc.filename); cdoc.class = bow_str2int (ret->classnames, classname); cdoc.type = bow_doc_train; cdoc.prior = 1.0f; cdoc.class_probs = NULL; di = bow_array_append (ret->cdocs, &cdoc); while (read_word_count (&line, &wi, &count)) { if (count) { int_count = rint (count); bow_wi2dvf_add_wi_di_count_weight (&(ret->wi2dvf), wi, di, int_count, count); } else assert (sparse_format == 0); } bow_free (buf); } return ret;}/* Write BARREL to the file-pointer FP in a machine independent format. */voidbow_barrel_write (bow_barrel *barrel, FILE *fp){ if (!barrel) { fputc (0, fp); /* 0 version_tag means NULL barrel */ return; } fputc (BOW_DEFAULT_BARREL_VERSION, fp); _bow_barrel_version = BOW_DEFAULT_BARREL_VERSION; bow_fwrite_string (barrel->method->name, fp); bow_array_write (barrel->cdocs, (int(*)(void*,FILE*))_bow_barrel_cdoc_write, fp); bow_int4str_write (barrel->classnames, fp); /* The wi2dvf must be written last because when we read it, we don't actually read the whole thing; we only read the seek-table. */ bow_wi2dvf_write (barrel->wi2dvf, fp);}/* Print barrel to FP in human-readable and awk-accessible format. */voidbow_barrel_printf_old1 (bow_barrel *barrel, FILE *fp, const char *format){ bow_dv_heap *heap; /* a heap of "document vectors" */ int current_di; bow_cdoc *cdoc; bow_verbosify (bow_progress, "Printing barrel: "); heap = bow_make_dv_heap_from_wi2dvf (barrel->wi2dvf); /* Keep going until the heap is empty */ while (heap->length > 0) { /* Set the current document we're working on */ current_di = heap->entry[0].current_di; assert (heap->entry[0].dv->idf == heap->entry[0].dv->idf); /* NaN */ if (current_di % 10 == 0) bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", current_di); /* Here we should check if this di is part of some training set and move on if it isn't. */ /* Get the document */ cdoc = bow_cdocs_di2doc (barrel->cdocs, current_di);#if 0 /* If it's not a model document, then move on to next one */ if (cdoc->type != model) { do { bow_dv_heap_update (heap); } while ((current_di == heap->entry[0].current_di) && (heap->length > 0)); /* Try again */ continue; }#endif fprintf (fp, "%s", cdoc->filename); /* Loop over all words in this document, printing out the FORMAT-requested statistics. */ do {#if 0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -