📄 barrel.c
字号:
"Removing words by information gain: "); /* Hide words from the BARREL. */ for (i = num_words_to_keep; i < wi2ig_size; i++) { /* Hide the WI from BARREL. */ bow_wi2dvf_hide_wi (barrel->wi2dvf, wiig_list[i].wi); if (i % 100 == 0) bow_verbosify (bow_progress, "\b\b\b\b\b\b\b\b\b%9d", wi2ig_size - i); } bow_verbosify (bow_progress, "\n");}int_bow_barrel_cdoc_write (bow_cdoc *cdoc, FILE *fp){ int ret; ret = bow_fwrite_int (cdoc->type, fp); ret += bow_fwrite_float (cdoc->normalizer, fp); ret += bow_fwrite_float (cdoc->prior, fp); ret += bow_fwrite_int (cdoc->word_count, fp); ret += bow_fwrite_string (cdoc->filename, fp); if (bow_file_format_version < 5) ret += bow_fwrite_short (cdoc->class, fp); else ret += bow_fwrite_int (cdoc->class, fp); return ret;}int_bow_barrel_cdoc_read (bow_cdoc *cdoc, FILE *fp){ int ret; int type; ret = bow_fread_int (&type, fp); cdoc->type = type; ret += bow_fread_float (&(cdoc->normalizer), fp); ret += bow_fread_float (&(cdoc->prior), fp); ret += bow_fread_int (&(cdoc->word_count), fp); ret += bow_fread_string ((char**)&(cdoc->filename), fp); if (bow_file_format_version < 5) { short s; ret += bow_fread_short (&s, fp); cdoc->class = s; } else ret += bow_fread_int (&(cdoc->class), fp); return ret;}/* Create and return a `barrel' by reading data from the file-pointer FP. 
*/bow_barrel *bow_barrel_new_from_data_fp (FILE *fp){ bow_barrel *ret; int version_tag; int method_id; version_tag = fgetc (fp); /* xxx assert (version_tag >= 0); */ if (version_tag <= 0) return NULL; if (_bow_barrel_version != -1 && _bow_barrel_version != version_tag) bow_error ("Trying to read bow_barrel's with different version numbers"); _bow_barrel_version = version_tag; ret = bow_malloc (sizeof (bow_barrel)); if (_bow_barrel_version < 3) { bow_fread_int (&method_id, fp); ret->method = _old_bow_methods[method_id]; } else { char *method_string; bow_fread_string (&method_string, fp); ret->method = bow_method_at_name (method_string); free (method_string); } ret->cdocs = bow_array_new_from_data_fp ((int(*)(void*,FILE*))_bow_barrel_cdoc_read, _bow_barrel_cdoc_free, fp); assert (ret->cdocs->length); ret->wi2dvf = bow_wi2dvf_new_from_data_fp (fp); assert (ret->wi2dvf->num_words); return ret;}/* Decide whether to keep this or not. Currently it it used by rainbow-h.c. */bow_barrel *bow_barrel_new_from_data_file (const char *filename){ FILE *fp; bow_barrel *ret_barrel; int wi; bow_dv *dv; int dv_count = 0; fp = bow_fopen (filename, "r"); ret_barrel = bow_barrel_new_from_data_fp (fp); if (ret_barrel) { /* Read in all the dvf's so that we can close the FP. */ for (wi = 0; wi < ret_barrel->wi2dvf->size; wi++) { dv = bow_wi2dvf_dv (ret_barrel->wi2dvf, wi); if (dv) dv_count++; } ret_barrel->wi2dvf->fp = NULL; assert (dv_count); } fclose (fp); return ret_barrel;}/* Write BARREL to the file-pointer FP in a machine independent format. 
*/voidbow_barrel_write (bow_barrel *barrel, FILE *fp){ if (!barrel) { fputc (0, fp); /* 0 version_tag means NULL barrel */ return; } fputc (BOW_DEFAULT_BARREL_VERSION, fp); _bow_barrel_version = BOW_DEFAULT_BARREL_VERSION; bow_fwrite_string (barrel->method->name, fp); bow_array_write (barrel->cdocs, (int(*)(void*,FILE*))_bow_barrel_cdoc_write, fp); bow_wi2dvf_write (barrel->wi2dvf, fp);}/* Print barrel to FP in human-readable and awk-accessible format. */voidbow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format){ bow_dv_heap *heap; /* a heap of "document vectors" */ int current_di; bow_cdoc *cdoc; bow_verbosify (bow_progress, "Printing barrel: "); heap = bow_make_dv_heap_from_wi2dvf (barrel->wi2dvf); /* Keep going until the heap is empty */ while (heap->length > 0) { /* Set the current document we're working on */ current_di = heap->entry[0].current_di; assert (heap->entry[0].dv->idf == heap->entry[0].dv->idf); /* NaN */ if (current_di % 10 == 0) bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", current_di); /* Here we should check if this di is part of some training set and move on if it isn't. */ /* Get the document */ cdoc = bow_cdocs_di2doc (barrel->cdocs, current_di);#if 0 /* If it's not a model document, then move on to next one */ if (cdoc->type != model) { do { bow_dv_heap_update (heap); } while ((current_di == heap->entry[0].current_di) && (heap->length > 0)); /* Try again */ continue; }#endif fprintf (fp, "%s", cdoc->filename); /* Loop over all words in this document, printing out the FORMAT-requested statistics. 
*/ do {#if 0 int wi; for (wi = 0; heap->entry[0].wi > wi; wi++) fprintf (fp, " 0");#endif fprintf (fp, " %s %d %d", bow_int2word (heap->entry[0].wi), heap->entry[0].wi, heap->entry[0].dv->entry[heap->entry[0].index].count); /* Update the heap, we are done with this di, move it to its new position */ bow_dv_heap_update (heap);#if 0 for (; heap->entry[0].wi > wi; wi++) fprintf (fp, " 0");#endif } while ((current_di == heap->entry[0].current_di) && (heap->length > 0)); fprintf (fp, "\n"); } bow_free (heap); bow_verbosify (bow_progress, "\n"); }/* Print barrel to FP in human-readable and awk-accessible format. Step through each CDOC in BARREL->CDOCS instead of using a heap. This way we even print out the documents that have zero words. This function runs much more slowly than the one above. */voidbow_new_slow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format){ int di; bow_cdoc *cdoc; bow_de *de; int wi, max_wi; bow_verbosify (bow_progress, "Printing barrel: "); max_wi = barrel->wi2dvf->size; for (di = 0; di < barrel->cdocs->length; di++) { if (barrel->cdocs->length - di % 10 == 0) bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", barrel->cdocs->length - di); cdoc = bow_array_entry_at_index (barrel->cdocs, di); fprintf (fp, "%s", cdoc->filename); for (wi = 0; wi < max_wi; wi++) { de = bow_wi2dvf_entry_at_wi_di (barrel->wi2dvf, wi, di); if (de) fprintf (fp, " %s %d %d", bow_int2word (wi), wi, de->count); } fprintf (fp, "\n"); } bow_verbosify (bow_progress, "\n"); }/* Print on stdout the number of times WORD occurs in the various docs/classes of BARREL. 
*/
void
bow_barrel_print_word_count (bow_barrel *barrel, const char *word)
{
  int wi;
  bow_dv *dv;
  int dvi;
  bow_cdoc *cdoc;

  /* An unknown word terminates the whole program. */
  wi = bow_word2int (word);
  if (wi == -1)
    {
      fprintf (stderr, "No such word `%s' in dictionary\n", word);
      exit (-1);
    }

  /* A known word without a document vector is only reported. */
  dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);
  if (!dv)
    {
      fprintf (stderr, "No document vector for word `%s'\n", word);
      return;
    }

  /* One output line per entry of the document vector: the word's count,
     the document's total word count, their ratio, and the filename. */
  for (dvi = 0; dvi < dv->length; dvi++)
    {
      cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di);
      printf ("%9d / %9d (%9.5f) %s\n",
              dv->entry[dvi].count,
              cdoc->word_count,
              ((float)dv->entry[dvi].count / cdoc->word_count),
              cdoc->filename);
    }
}

/* Free the memory held by BARREL.  A NULL BARREL is accepted and
   ignored, so the NULL result that bow_barrel_new_from_data_fp() can
   return may be passed straight in. */
void
bow_barrel_free (bow_barrel *barrel)
{
  if (!barrel)
    return;
  if (barrel->wi2dvf)
    bow_wi2dvf_free (barrel->wi2dvf);
  if (barrel->cdocs)
    bow_array_free (barrel->cdocs);
  bow_free (barrel);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -