📄 hem.c
字号:
double log_prob_of_data = 0; int num_data_words = 0; /* the number of word occurrences */ num_leaves = bow_treenode_leaf_count (crossbow_root); for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); if (! (*use_doc_p)((bow_doc*)doc)) continue; wv = crossbow_wv_at_di (di); max_leaf_pp = -FLT_MAX; for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (crossbow_hem_shrinkage) leaf_data_log_prob = bow_treenode_log_prob_of_wv (leaf, wv); else leaf_data_log_prob = bow_treenode_log_local_prob_of_wv (leaf, wv); leaf_pp = log(leaf->prior) + leaf_data_log_prob; assert (leaf_pp == leaf_pp);#if 1 /* Test for -Inf, and if so, immediately return Inf */ if (leaf_pp == -HUGE_VAL) return HUGE_VAL;#endif if (leaf_pp > max_leaf_pp) max_leaf_pp = leaf_pp; } assert (max_leaf_pp != -FLT_MAX); log_prob_of_data += max_leaf_pp; num_data_words += bow_wv_word_count (wv); } /* Return the perlexity */ if (num_data_words) return exp (-log_prob_of_data / num_data_words); return 0;}/* Return the perplexity (given knowledge of the class label, P(D,C|theta)) of the data in documents for which the function USE_DOC_P returns non-zero. */doublecrossbow_hem_labeled_perplexity (int (*use_doc_p)(bow_doc*)){ int di; crossbow_doc *doc; bow_wv *wv; treenode *leaf; int num_leaves; double leaf_data_log_prob; double log_prob_of_data = 0; int num_data_words = 0; /* the number of word occurrences */ num_leaves = bow_treenode_leaf_count (crossbow_root); for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); if (! (*use_doc_p)((bow_doc*)doc)) continue; wv = crossbow_wv_at_di (di); leaf = bow_treenode_descendant_matching_name (crossbow_root, doc->filename); if (crossbow_hem_shrinkage) leaf_data_log_prob = bow_treenode_log_prob_of_wv (leaf, wv); else leaf_data_log_prob = bow_treenode_log_local_prob_of_wv (leaf, wv); /* Test for -Inf, and if so, immediately return Inf */ if (leaf_data_log_prob == -HUGE_VAL) return HUGE_VAL; log_prob_of_data += (log (leaf->prior) + leaf_data_log_prob); assert (log_prob_of_data == log_prob_of_data); num_data_words += bow_wv_word_count (wv); } /* Return the perlexity */ if (num_data_words) return exp (-log_prob_of_data / num_data_words); return 0;}/* Classify all unlabeled documents and convert the most confidently classified to labeled */voidcrossbow_hem_label_most_confident (){ int di, li; crossbow_doc *doc; // bow_wv *wv; bow_wa *wa; int word_count; double score; int leaf_count = bow_treenode_leaf_count (crossbow_root); bow_wa **high_scores_per_class; static int unlabeled_count = -1; static int num_to_label = 999; treenode *iterator, *leaf; assert (crossbow_hem_incremental_labeling); /* Calculate num_to_label if we are to label all examples in 20 iterations. */ if (unlabeled_count == -1) { unlabeled_count = 0; for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); if (doc->tag == bow_doc_unlabeled) unlabeled_count++; } num_to_label = unlabeled_count / 20; } high_scores_per_class = alloca (leaf_count * sizeof (void*)); for (li = 0; li < leaf_count; li++) high_scores_per_class[li] = bow_wa_new (0); for (di = 0; di < crossbow_docs->length; di++) { bow_wv *wv; doc = bow_array_entry_at_index (crossbow_docs, di); if (doc->tag != bow_doc_unlabeled) continue; wv = crossbow_wv_at_di (doc->di); word_count = bow_wv_word_count (wv); wv = crossbow_wv_at_di (doc->di); assert (wv); wa = crossbow_classify_doc_new_wa (wv); bow_wa_sort (wa); score = wa->entry[0].weight; score /= ((word_count + 1) / MIN(9,word_count)); bow_wa_append (high_scores_per_class[wa->entry[0].wi], di, score); bow_wa_free (wa); } for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { int i, num_to_label_this_class = MAX(1,num_to_label * leaf->prior); if (high_scores_per_class[li]->length == 0) continue; bow_wa_sort (high_scores_per_class[li]); if (num_to_label_this_class > high_scores_per_class[li]->length) { bow_verbosify (bow_quiet, "Not enough unlabeled documents classified as %s\n", leaf->name); num_to_label_this_class = high_scores_per_class[li]->length; } for (i = 0; i < num_to_label_this_class; i++) { char *newname = bow_malloc (128); doc = bow_array_entry_at_index (crossbow_docs, high_scores_per_class[li]->entry[i].wi); assert (doc->tag = bow_doc_unlabeled); doc->tag = bow_doc_train; doc->ci = li; /* xxx Yuck! WhizBang-specific */ sprintf (newname, "./data%s%s", leaf->name, strrchr(doc->filename, '/') + 1); /* xxx Memory leak here. Free the doc->name first. */ doc->filename = newname; bow_verbosify (bow_progress, "Labeling class %10s %35s %g\n", leaf->name, doc->filename, high_scores_per_class[li]->entry[i].weight); } } for (li = 0; li < leaf_count; li++) bow_wa_free (high_scores_per_class[li]);}#if MN#include "mn.c"#endif/* Return the perplexity */doublecrossbow_hem_em_one_iteration (){ int di; crossbow_doc *doc; bow_wv *wv; treenode *iterator, *leaf, *ancestor; int li; /* a leaf index */ int wvi; int num_leaves; double *leaf_membership; double *leaf_data_prob; double pp, log_prob_of_data = 0; int num_data_words = 0; /* the number of word occurrences */ double *ancestor_membership; double ancestor_membership_total; double total_deposit_prob; int found_deterministic_leaf; int docs_added_count = 0;#if MN return crossbow_hem_em_one_mn_iteration ();#endif num_leaves = bow_treenode_leaf_count (crossbow_root); leaf_membership = alloca (num_leaves * sizeof (double)); leaf_data_prob = alloca (num_leaves * sizeof (double)); /* xxx Here NUM_LEAVES+10 should be MAX_DEPTH */ ancestor_membership = alloca ((num_leaves + 10) * sizeof (double)); for (di = 0; di < crossbow_docs->length; di++) { total_deposit_prob = 0; doc = bow_array_entry_at_index (crossbow_docs, di); if (crossbow_hem_incremental_labeling) { if (crossbow_hem_lambdas_from_validation) { if (doc->tag != bow_doc_train && doc->tag != bow_doc_validation) continue; } else { if (doc->tag != bow_doc_train) continue; } } else if (crossbow_hem_lambdas_from_validation) { if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled && doc->tag != bow_doc_validation) continue; } else { if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled) continue; } /* Temporary fix */ if (strstr (doc->filename, ".include") || strstr (doc->filename, ".exclude")) continue; /* E-step estimating leaf membership probability for one document, with annealing temperature. */ wv = crossbow_wv_at_di (di); found_deterministic_leaf = 0; for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (crossbow_hem_shrinkage) { if (crossbow_hem_loo) leaf_data_prob[li] = bow_treenode_log_prob_of_wv_loo (leaf, wv, di); else leaf_data_prob[li] = bow_treenode_log_prob_of_wv (leaf, wv); } else { if (crossbow_hem_loo) leaf_data_prob[li] = bow_treenode_log_local_prob_of_wv_loo (leaf, wv, di); else leaf_data_prob[li] = bow_treenode_log_local_prob_of_wv (leaf, wv); } assert (leaf_data_prob[li] > -HUGE_VAL); if (crossbow_hem_deterministic_horizontal && (doc->tag == bow_doc_train || doc->tag == bow_doc_validation)) { if (strstr (doc->filename, leaf->name)) { assert (!found_deterministic_leaf); leaf_membership[li] = 1.0; found_deterministic_leaf = 1; } else /* The validation document was formerly an unlabeled document. Set the membership to zero for now; we will set it to the results of the E-step below when we call crossbow_convert_log_probs_to_probs */ leaf_membership[li] = 0.0; continue; } else if (crossbow_hem_restricted_horizontal && (doc->tag == bow_doc_train || doc->tag == bow_doc_validation)) { treenode *label_node = bow_treenode_descendant_matching_name (crossbow_root, doc->filename); if (strstr (leaf->name, label_node->name)) leaf_membership[li] = (log (leaf->prior) + (leaf_data_prob[li] / crossbow_hem_temperature)); else /* Set it to probability zero, which, in log space is -Inf */ leaf_membership[li] = -HUGE_VAL; } else { leaf_membership[li] = (log (leaf->prior) + (leaf_data_prob[li] / crossbow_hem_temperature)); } } if (!crossbow_hem_deterministic_horizontal || doc->tag == bow_doc_unlabeled || !found_deterministic_leaf) /* Last condition above for unlabeled docs that were changed to validation docs */ crossbow_convert_log_probs_to_probs (leaf_membership, num_leaves); else /* No longer meaningful!? */ assert (found_deterministic_leaf); /* For perplexity calculation */ for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { /* xxx Should this be with bow_treenode_complete_log_prob_of_wv()? */ if (leaf_membership[li]) log_prob_of_data += (leaf_membership[li] * leaf_data_prob[li]); assert (log_prob_of_data == log_prob_of_data); } num_data_words += bow_wv_word_count (wv); docs_added_count++; /* E-step estimating ancestor membership probability for words in one document, and M-step for one document */ for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (leaf_membership[li] == 0) continue; if (strstr (leaf->name, "/Misc/")) continue; for (wvi = 0; wvi < wv->num_entries; wvi++) { if (crossbow_hem_shrinkage) { int ai; double word_deposit, lambda_deposit; /* Calculate normalized ancestor membership probs */ ancestor_membership_total = 0; for (ancestor = leaf, ai = 0; ancestor; ancestor = ancestor->parent, ai++) { if (crossbow_hem_loo) ancestor_membership[ai] = leaf->lambdas[ai] * bow_treenode_pr_wi_loo_local (ancestor, wv->entry[wvi].wi, di, wvi); else ancestor_membership[ai] = leaf->lambdas[ai] * ancestor->words[wv->entry[wvi].wi]; assert (ancestor_membership[ai] >= 0); ancestor_membership_total += ancestor_membership[ai]; } ancestor_membership[ai] = leaf->lambdas[ai] * 1.0 / leaf->words_capacity; ancestor_membership_total += ancestor_membership[ai]; assert (ancestor_membership_total); for (ai = 0; ai < leaf->depth + 2; ai++) { assert (ancestor_membership[ai] >= 0); ancestor_membership[ai] /= ancestor_membership_total; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -