📄 barrel.c
字号:
int wi; for (wi = 0; heap->entry[0].wi > wi; wi++) fprintf (fp, " 0");#endif fprintf (fp, " %s %d %d", bow_int2word (heap->entry[0].wi), heap->entry[0].wi, heap->entry[0].dv->entry[heap->entry[0].index].count); /* Update the heap, we are done with this di, move it to its new position */ bow_dv_heap_update (heap);#if 0 for (; heap->entry[0].wi > wi; wi++) fprintf (fp, " 0");#endif } while ((current_di == heap->entry[0].current_di) && (heap->length > 0)); fprintf (fp, "\n"); } bow_free (heap); bow_verbosify (bow_progress, "\n"); }/* Print barrel to FP in human-readable and awk-accessible format. Step through each CDOC in BARREL->CDOCS instead of using a heap. This way we even print out the documents that have zero words. This function runs much more slowly than the one above. */voidbow_new_slow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format){ int di; bow_cdoc *cdoc; bow_de *de; int wi, max_wi; bow_verbosify (bow_progress, "Printing barrel: "); max_wi = barrel->wi2dvf->size; for (di = 0; di < barrel->cdocs->length; di++) { if (barrel->cdocs->length - di % 10 == 0) bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", barrel->cdocs->length - di); cdoc = bow_array_entry_at_index (barrel->cdocs, di); fprintf (fp, "%s", cdoc->filename); for (wi = 0; wi < max_wi; wi++) { de = bow_wi2dvf_entry_at_wi_di (barrel->wi2dvf, wi, di); if (de) fprintf (fp, " %s %d %d", bow_int2word (wi), wi, de->count); } fprintf (fp, "\n"); } bow_verbosify (bow_progress, "\n"); }/* Print barrel to FP in various formats. Defaults are first in lists: s - sparse OR a - all i - integer OR b - binary c - combination OR n - word index OR w - word string OR e - empty OR I - UC Irvine format, same as Sahami's "feat-sel" format. *//* Print document, but print only those documents for which the function PRINT_IF_TRUE returns non-zero. */voidbow_barrel_printf_selected (bow_barrel *barrel, FILE *fp, const char *format, int (*print_if_true)(bow_cdoc*)){ enum { word_index, word_string, word_string_and_index, word_empty, word_long } word_format = word_string_and_index; enum { binary_count, integer_count } word_count_format = integer_count; int doing_uci_format = 0; int doing_ipl_format = 0; int sparse_format = 1; bow_dv_heap *heap; bow_wv *wv; int di; bow_cdoc *cdoc; int wi, wvi; bow_dv *dv; int last_di; void print_word_count (int wi, int count) { int oi; const char *word; if (word_count_format == binary_count) count = (count > 0); switch (word_format) { case word_index: printf ("%d %d ", wi, count); break; case word_string: printf ("%s %d ", bow_int2word (wi), count); break; case word_string_and_index: printf ("%s %d %d ", bow_int2word (wi), wi, count); break; case word_empty: printf ("%d ", count); break; case word_long: word = bow_int2word (wi); for (oi = 0; oi < count; oi++) printf("%s ", word); } } if (format && strchr (format, 'I')) { doing_uci_format = 1; sparse_format = 0; word_count_format = binary_count; word_format = word_empty; } if (format && strchr (format, 'P')) doing_ipl_format = 1; if (format && strchr (format, 'a')) sparse_format = 0; if (format && strchr (format, 'b')) word_count_format = binary_count; if (format && strchr (format, 'n')) word_format = word_index; else if (format && strchr (format, 'w')) word_format = word_string; else if (format && strchr (format, 'e')) word_format = word_empty; else if (format && strchr (format, 'l')) word_format = word_long; if (doing_uci_format) { /* Print the number of dimentions and the number of features */ printf ("%d\n%d\n", barrel->wi2dvf->num_words, barrel->cdocs->length); } heap = bow_test_new_heap (barrel); wv = NULL; last_di = -1; while ((di = bow_heap_next_wv (heap, barrel, &wv, print_if_true)) != -1) { /* Print documents that have no words in them. while (last_di ); */ cdoc = bow_array_entry_at_index (barrel->cdocs, di); if (!doing_uci_format && !doing_ipl_format) printf ("%s %s ", cdoc->filename, bow_barrel_classname_at_index (barrel, cdoc->class)); else if (doing_ipl_format) printf ("%s %s ", bow_barrel_classname_at_index (barrel, cdoc->class), cdoc->filename); if (sparse_format) { for (wvi = 0; wvi < wv->num_entries; wvi++) print_word_count (wv->entry[wvi].wi, wv->entry[wvi].count); } else { for (wi = 0, wvi = 0; wi < barrel->wi2dvf->size; wi++) { dv = bow_wi2dvf_dv (barrel->wi2dvf, wi); if (!dv) continue; if (wv->entry[wvi].wi < wi && wvi < wv->num_entries) wvi++; assert (wv->entry[wvi].wi >= wi || wvi >= wv->num_entries); if ((wvi < wv->num_entries) && wv->entry[wvi].wi == wi) print_word_count (wi, wv->entry[wvi].count); else print_word_count (wi, 0); } } if (doing_uci_format) /* Print the class index. */ printf (": %d", cdoc->class); printf ("\n"); }}voidbow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format){ bow_barrel_printf_selected (barrel, fp, format, bow_cdoc_yes);}/* Print on stdout the number of times WORD occurs in the various docs/classes of BARREL. */voidbow_barrel_print_word_count (bow_barrel *barrel, const char *word){ int wi; bow_dv *dv; int dvi; bow_cdoc *cdoc; wi = bow_word2int (word); if (wi == -1) { fprintf (stderr, "No such word `%s' in dictionary\n", word); exit (-1); } dv = bow_wi2dvf_dv (barrel->wi2dvf, wi); if (!dv) { fprintf (stderr, "No document vector for word `%s'\n", word); return; } for (dvi = 0; dvi < dv->length; dvi++) { cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di); printf ("%9d / %9d (%9.5f) %s\n", dv->entry[dvi].count, cdoc->word_count, ((float)dv->entry[dvi].count / cdoc->word_count), cdoc->filename); }}/* For copying a class barrel. Doesn't deal with class_probs at all. */bow_barrel *bow_barrel_copy (bow_barrel *barrel){ int ci; int wi; int dvi; bow_dv *dv; bow_dv *copy_dv; bow_barrel *copy = bow_barrel_new(barrel->wi2dvf->size, bow_barrel_num_classes(barrel), barrel->cdocs->entry_size, barrel->cdocs->free_func); copy->method = barrel->method; copy->is_vpc = 1; copy->classnames = bow_int4str_new(0); /* Initialize the CDOCS and CLASSNAMES parts of the copy. Create BOW_CDOC structures for each class, and append them to the copy->cdocs array. */ for (ci = 0; ci < bow_barrel_num_classes(barrel) ; ci++) { bow_cdoc *old_cdoc = bow_array_entry_at_index(barrel->cdocs, ci); bow_cdoc cdoc; cdoc.type = old_cdoc->type; cdoc.normalizer = old_cdoc->normalizer; cdoc.word_count = old_cdoc->word_count; cdoc.prior = old_cdoc->prior; cdoc.filename = strdup (old_cdoc->filename); cdoc.class = old_cdoc->class; cdoc.class_probs = NULL; bow_array_append (copy->cdocs, &cdoc); bow_str2int (copy->classnames, cdoc.filename); } /* set up the wi2dvf structure */ for (wi = 0; wi < barrel->wi2dvf->size; wi++) { dv = bow_wi2dvf_dv (barrel->wi2dvf, wi); if (!dv) continue; for (dvi = 0; dvi < dv->length; dvi++) bow_wi2dvf_add_wi_di_count_weight (&(copy->wi2dvf), wi, dv->entry[dvi].di, dv->entry[dvi].count, dv->entry[dvi].weight); /* Set the IDF of the class's wi2dvf directly from the doc's wi2dvf */ copy_dv = bow_wi2dvf_dv (copy->wi2dvf, wi); copy_dv->idf = dv->idf; } return (copy);}/* Define an iterator over the columns of a barrel */struct bow_barrel_iterator_context { bow_barrel *barrel; int ci; bow_dv *dv; int dvi;};#define CONTEXT ((struct bow_barrel_iterator_context*)context)static voidbarrel_iterator_reset_at_wi (int wi, void *context){ bow_cdoc *cdoc; CONTEXT->dv = bow_wi2dvf_dv (CONTEXT->barrel->wi2dvf, wi); CONTEXT->dvi = 0; /* Advance to the first document matching our criterion */ while (CONTEXT->dv && CONTEXT->dvi < CONTEXT->dv->length) { cdoc = bow_array_entry_at_index (CONTEXT->barrel->cdocs, CONTEXT->dv->entry[CONTEXT->dvi].di); if (cdoc->class == CONTEXT->ci && cdoc->type == bow_doc_train) break; CONTEXT->dvi++; }}static intbarrel_iterator_advance_to_next_di (void *context){ bow_cdoc *cdoc; if (CONTEXT->dv == NULL) return 0; CONTEXT->dvi++; while (CONTEXT->dvi < CONTEXT->dv->length) { cdoc = bow_array_entry_at_index (CONTEXT->barrel->cdocs, CONTEXT->dv->entry[CONTEXT->dvi].di); if (cdoc->class == CONTEXT->ci && cdoc->type == bow_doc_train) break; CONTEXT->dvi++; } if (CONTEXT->dvi >= CONTEXT->dv->length) return 0; return 1;}static intbarrel_iterator_doc_index (void *context){ if (CONTEXT->dv == NULL || CONTEXT->dvi >= CONTEXT->dv->length) return INT_MIN; return CONTEXT->dv->entry[CONTEXT->dvi].di;}static doublebarrel_iterator_count_for_doc (void *context){ if (CONTEXT->dv == NULL || CONTEXT->dvi >= CONTEXT->dv->length) return 0.0/0; /* NaN */ return CONTEXT->dv->entry[CONTEXT->dvi].count;}bow_iterator_double *bow_barrel_iterator_for_ci_new (bow_barrel *barrel, int ci){ bow_iterator_double *ret; void *context; ret = bow_malloc (sizeof (bow_iterator_double) + sizeof (struct bow_barrel_iterator_context)); ret->reset = barrel_iterator_reset_at_wi; ret->advance = barrel_iterator_advance_to_next_di; ret->index = barrel_iterator_doc_index; ret->value = barrel_iterator_count_for_doc; context = ret->context = (char*)ret + sizeof (bow_iterator_double); CONTEXT->barrel = barrel; CONTEXT->ci = ci; CONTEXT->dv = NULL; CONTEXT->dvi = 0; return ret;}#undef CONTEXT/* Free the memory held by BARREL. */voidbow_barrel_free (bow_barrel *barrel){ if (barrel->wi2dvf) bow_wi2dvf_free (barrel->wi2dvf); if (barrel->cdocs) bow_array_free (barrel->cdocs); if (barrel->classnames) bow_int4str_free (barrel->classnames); bow_free (barrel);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -