📄 em.c
字号:
doublebow_gamma_distribution (int ia){ int j; double am, e, s, v1, v2, x, y; assert (ia >= 1) ; if (ia < 6) { x = 1.0; for (j = 1; j <= ia; j++) x *= bow_random_01 (); x = - log (x); } else { do { do { do { v1 = 2.0 * bow_random_01 () - 1.0; v2 = 2.0 * bow_random_01 () - 1.0; } while (v1 * v1 + v2 * v2 > 1.0); y = v2 / v1; am = ia - 1; s = sqrt (2.0 * am + 1.0); x = s * y + am; } while (x <= 0.0); e = (1.0 + y * y) * exp (am * log (x/am) - s * y); } while (bow_random_01 () > e); } return x;}/* Change the weights by sampling from the multinomial distribution specified by the training data. Start from the current values of the DV WEIGHTS. Typically this would be called after iteration 1 of EM, before the unlabeled documents were included in the WEIGHTS. */voidbow_em_perturb_weights (bow_barrel *doc_barrel, bow_barrel *vpc_barrel){ double variance; double num_words_per_ci[bow_barrel_num_classes (vpc_barrel)]; int ci, wi, dvi, max_wi; bow_dv *dv; bow_cdoc *cdoc; double pr_w_c; if (bow_em_perturb_starting_point == bow_em_perturb_none) return; bow_random_set_seed (); max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ()); /* Perturb the counts (which are stored in WEIGHT) */ for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi); if (!dv) continue; for (dvi = 0; dvi < dv->length; dvi++) { /* WEIGHT can be zero if the prob of a class for the doc that had this word was zero */ if (bow_em_perturb_starting_point == bow_em_perturb_with_gaussian) { if (0 != dv->entry[dvi].weight) { cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, dv->entry[dvi].di); pr_w_c = dv->entry[dvi].weight / cdoc->normalizer; variance = cdoc->normalizer * pr_w_c * (1 - pr_w_c); dv->entry[dvi].weight = bow_em_gaussian (dv->entry[dvi].weight, variance); if (dv->entry[dvi].weight < 0) dv->entry[dvi].weight = 0; } } else if (bow_em_perturb_starting_point == bow_em_perturb_with_dirichlet) { dv->entry[dvi].weight = bow_gamma_distribution (dv->entry[dvi].weight + 1); /* The +1 is assuming we are using LaPlace smoothing */ /* xxx I hope that we are still multiplying weights by 200 (for a length 200 document), otherwise weight will always get rounded down into nothing, because bow_gamma_distribution only takes int's */ } } } /* Reset the CDOC->WORD_COUNT and CDOC->NORMALIZER */ for (ci = 0; ci < bow_barrel_num_classes (vpc_barrel); ci++) { cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci); cdoc->normalizer = 0; num_words_per_ci[ci] = 0; } for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi); if (!dv) continue; for (dvi = 0; dvi < dv->length; dvi++) { /* WEIGHT can be zero if the prob of a class for the doc that had this word was zero */ if (0 != dv->entry[dvi].weight) { cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, dv->entry[dvi].di); num_words_per_ci[dv->entry[dvi].di] += dv->entry[dvi].weight;#if 0 /* Now using normalizer for non-int word_count */ cdoc->normalizer++;#endif } } } for (ci = 0; ci < bow_barrel_num_classes (vpc_barrel); ci++) { bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci); cdoc->normalizer = num_words_per_ci[ci];#if 0 cdoc->word_count = (int) rint(num_words_per_ci[ci]); assert (cdoc->word_count >= 0);#endif assert (cdoc->normalizer >= 0); }} /* Create a class barrel with EM-style clustering on unlabeled docs */bow_barrel *bow_em_new_vpc_with_weights (bow_barrel *doc_barrel){ bow_barrel *vpc_barrel; /* the vector-per-class barrel */ int wi; /* word index */ int max_wi; /* max word index */ int dvi; /* document vector index */ int ci; /* class index */ bow_dv *dv; /* document vector */ int di; /* document index */ int binary_neg_ci = -1; bow_dv_heap *test_heap=NULL; /* we'll extract test WV's from here */ bow_wv *query_wv; bow_score *hits; int actual_num_hits; int hi; /* hit index */ bow_cdoc *doc_cdoc; int num_tested; int em_runs = 0; int num_train_docs = 0; int num_unlabeled_docs = 0; int max_new_ci; int max_old_ci; int (* bow_cdoc_next_em_doc)(bow_cdoc *) = bow_cdoc_is_unlabeled; double old_perplexity = DBL_MAX; double new_perplexity = DBL_MAX / 2; double old_accuracy = -2; double new_accuracy = -1; /*bow_wi2dvf *prev_wi2dvf = NULL;*/ /*float prev_priors[200];*/ /*int prev_word_counts[200];*/ /*float prev_normalizers[200];*/ float total_weight; float labeled_weight_fraction; float new_labeled_fraction; /* some sanity checks first */ assert(200 > bow_barrel_num_classes(doc_barrel)); assert(200 > bow_em_multi_hump_neg + 1); assert (!bow_em_multi_hump_neg || (bow_em_binary_case && em_stat_method == nb_score)); assert (!strcmp(doc_barrel->method->name, "em") || !strcmp(doc_barrel->method->name, "active")); assert (doc_barrel->classnames); assert (!(bow_em_perturb_starting_point && em_anneal)); assert (em_stat_method == nb_score || bow_em_multi_hump_neg == 0); assert (bow_em_multi_hump_neg == 0 || em_labeled_for_start_only == 0); /* this option is broken */ assert (!em_halt_using_perplexity); /* initialize some variables */ bow_em_making_barrel = 1; if (bow_smoothing_method == bow_smoothing_dirichlet) bow_naivebayes_load_dirichlet_alphas (); max_old_ci = bow_barrel_num_classes(doc_barrel); if (bow_em_multi_hump_neg) max_new_ci = bow_em_multi_hump_neg + 1; else max_new_ci = max_old_ci; if (bow_em_multi_hump_neg > 1) bow_cdoc_next_em_doc = bow_cdoc_is_multi_hump_doc; max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ()); /* assert(doc_barrel->wi2dvf->size == bow_num_words ()); */ /* remove words from vocab if using only the unlabeled vocab */ if (em_set_vocab_from_unlabeled) { int removed = 0; int kept = 0; for (wi = 0; wi < max_wi; wi++) { int found = 0; bow_dv *dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi); if (!dv) continue; dvi = 0; while (dvi < dv->length) { bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, dv->entry[dvi].di); if (cdoc->type == bow_doc_unlabeled) { found = 1; break; } dvi++; } if (!found) { bow_wi2dvf_hide_wi (doc_barrel->wi2dvf, wi); removed++; } else kept++; } bow_verbosify (bow_progress, "Removed %d words using unlabeled data; %d remaining\n", removed, kept); } /* Count the number of training and unlabeled documents */ for (di=0; di < doc_barrel->cdocs->length; di++) { bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); if (cdoc->type == bow_doc_train) num_train_docs++; else if (cdoc->type == bow_doc_unlabeled) num_unlabeled_docs++; } /* Identify the binary positive and negative class */ if (bow_em_binary_case) { assert (em_binary_pos_classname != NULL); assert (em_binary_neg_classname != NULL); for (ci = 0; ci < max_old_ci; ci++) { if (em_binary_pos_classname != NULL && -1 == binary_pos_ci && !strcmp(em_binary_pos_classname, filename_to_classname (bow_barrel_classname_at_index (doc_barrel, ci)))) { binary_pos_ci = ci; } if (em_binary_neg_classname != NULL && -1 == binary_neg_ci && !strcmp(em_binary_neg_classname, filename_to_classname (bow_barrel_classname_at_index (doc_barrel, ci)))) { binary_neg_ci = ci; } } if (binary_pos_ci == -1) bow_error ("No such binary positive class %s.", em_binary_pos_classname); if (binary_neg_ci == -1) bow_error ("No such binary negative class %s.", em_binary_neg_classname); } /* should the free function be a real one? */ /* Create an empty barrel; we fill it with vector-per-class data and return it. */ vpc_barrel = bow_barrel_new (doc_barrel->wi2dvf->size, doc_barrel->cdocs->length-1, doc_barrel->cdocs->entry_size, doc_barrel->cdocs->free_func); vpc_barrel->method = doc_barrel->method; vpc_barrel->classnames = bow_int4str_new (0); /* setup the cdoc structure for the class barrel, except for the word counts and normalizer, which we'll do later. */ for (ci = 0; ci < max_old_ci; ci++) { bow_cdoc cdoc; /* create the cdoc structure */ cdoc.type = bow_doc_train; cdoc.normalizer = -0.0f; /* just a temporary measure */ cdoc.word_count = 0; /* just a temporary measure */ cdoc.filename = strdup (bow_barrel_classname_at_index (doc_barrel, ci)); bow_barrel_add_classname(vpc_barrel, cdoc.filename); if (!cdoc.filename) bow_error ("Memory exhausted."); cdoc.class_probs = NULL; cdoc.class = ci; bow_array_append (vpc_barrel->cdocs, &cdoc); } /* if multi-hump, then add a cdoc for each of the other negative humps as well */ if (bow_em_multi_hump_neg) { for (ci = max_old_ci; ci < max_new_ci; ci++) { bow_cdoc cdoc; char *name = bow_malloc (sizeof (char) * (strlen(em_binary_neg_classname) + 10)); cdoc.type = bow_doc_train; cdoc.normalizer = 0.0f; /* just a temporary measure */ cdoc.word_count = 0; /* just a temporary measure */ sprintf(name, "%s%d", em_binary_neg_classname, ci); cdoc.filename = name; bow_barrel_add_classname(vpc_barrel, cdoc.filename); if (!cdoc.filename) bow_error ("Memory exhausted."); cdoc.class_probs = NULL; cdoc.class = ci; bow_array_append (vpc_barrel->cdocs, &cdoc); } } /* if we're comparing to naivebayes, do that now */ if (em_compare_to_nb == 1) bow_em_compare_to_nb(doc_barrel); /* Set word_count for docs correctly. Do this after comparing to NB b/c making a NB class barrel messes with the word counts. */ { /* Create the heap from which we'll get WV's. */ query_wv = NULL; test_heap = bow_test_new_heap (doc_barrel); /* Loop once for each document. */ while (-1 != (di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, bow_cdoc_yes))) { int word_count = 0; int wvi; doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); for (wvi = 0; wvi < query_wv->num_entries; wvi++) word_count += query_wv->entry[wvi].count; doc_cdoc->word_count = word_count; } } /* initialize the EM starting point */ { /* cycle through the document barrel and make sure that each document has a correctly initialized class_probs structure. set class_probs of train docs. Note that these class_probs indexes are indexes into the NEW class indexes not the OLD ones!*/ for (di=0; di < doc_barrel->cdocs->length; di++) { bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); if (!cdoc->class_probs) cdoc->class_probs = bow_malloc (sizeof (float) * max_new_ci); /* initialize the class_probs to all zeros */ for (ci=0; ci < max_new_ci; ci++) cdoc->class_probs[ci] = 0.0; /* if it's a known doc, set its class_probs that way */ if (cdoc->type == bow_doc_train) cdoc->class_probs[cdoc->class] = 1.0; } /* redistribute class probs of negative docs if multi-hump */ if (bow_em_multi_hump_neg) { if (em_multi_hump_init == bow_em_init_spiked) { int counts[500]; int n; int yet_to_find = 0; assert (bow_em_multi_hump_neg < 500); /* Count the number of negative documents */ for (di=0; di < doc_barrel->cdocs->length; di++) { bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); if (cdoc->class == binary_neg_ci) yet_to_find++; } /* set the number of docs per negative hump */ assert(yet_to_find >= bow_em_multi_hump_neg); for (n=0; n < bow_em_multi_hump_neg; n++) counts[n] = 0; for (n=0; n < yet_to_find; n++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -