/* hem.c — hierarchical EM clustering (crossbow).  (Scrape header removed.) */
/* NOTE(review): this chunk begins mid-way through a function (presumably
   crossbow_hem_em_one_iteration — its opening is outside this view), so the
   leading braces below close scopes whose headers are not visible here. */

/* The M-step: distribute this word's count up the path from LEAF to the
   root, accumulating sufficient statistics in each ancestor's NEW_WORDS,
   and mixture-weight statistics in the leaf's NEW_LAMBDAS. */
for (ancestor = leaf, ai = 0; ancestor; ancestor = ancestor->parent, ai++)
  {
    /* How much of this word occurrence to deposit at this ancestor:
       with vertical word movement the deposit is also scaled by the
       ancestor's membership probability. */
    if (crossbow_hem_vertical_word_movement)
      word_deposit = wv->entry[wvi].count * leaf_membership[li]
	* ancestor_membership[ai];
    else
      word_deposit = wv->entry[wvi].count * leaf_membership[li];
#define UNLABELED_WEIGHT_REDUCED 0
#if UNLABELED_WEIGHT_REDUCED
    /* Optionally down-weight unlabeled documents (disabled). */
    if (doc->tag == bow_doc_unlabeled)
      word_deposit /= 3;
#endif
    assert (word_deposit >= 0);
    /* Word counts come from non-validation documents (validation docs are
       reserved for estimating lambdas when that option is on). */
    if (!crossbow_hem_lambdas_from_validation
	|| doc->tag != bow_doc_validation)
      {
	if (crossbow_hem_loo)
	  bow_treenode_add_new_loo_for_di_wvi (ancestor, word_deposit,
					       di, wvi, wv->num_entries,
					       crossbow_docs->length);
	ancestor->new_words[wv->entry[wvi].wi] += word_deposit;
      }
    if (ancestor_membership[ai] == 0)
      continue;
    /* Lambda statistics: how much this ancestor level explained the word. */
    lambda_deposit = wv->entry[wvi].count * leaf_membership[li]
      * ancestor_membership[ai];
    assert (lambda_deposit >= 0);
    /* Lambdas come from validation documents when that option is on,
       otherwise from all documents. */
    if (!crossbow_hem_lambdas_from_validation
	|| doc->tag == bow_doc_validation)
      leaf->new_lambdas[ai] += lambda_deposit;
  }
/* The uniform distribution: AI is now one past the last ancestor, i.e. the
   extra mixture component for the uniform word distribution.
   NOTE(review): assumes ancestor_membership[] and new_lambdas[] each have a
   slot at this index — confirm against their allocation. */
if (!crossbow_hem_lambdas_from_validation
    || doc->tag == bow_doc_validation)
  leaf->new_lambdas[ai] += wv->entry[wvi].count * leaf_membership[li]
    * ancestor_membership[ai];
}
/* if crossbow_hem_shrinkage */
else
  {
    /* The M-step without shrinkage, without ancestor membership
       probabilities: all mass goes to the leaf itself. */
    leaf->new_words[wv->entry[wvi].wi] +=
      wv->entry[wvi].count * leaf_membership[li];
    leaf->new_lambdas[0]++;
  }
assert (leaf->new_words[wv->entry[wvi].wi] >= 0);
/* Self-comparison is a NaN check: x == x is false only for NaN. */
assert (leaf->new_words[wv->entry[wvi].wi]
	== leaf->new_words[wv->entry[wvi].wi]);
}
leaf->new_prior += leaf_membership[li];
}
}
/* Finish M-step: normalize the accumulated NEW_* statistics into the
   current model parameters for every node in the tree. */
bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
for (iterator = crossbow_root;
     (leaf = bow_treenode_iterate_all (&iterator));)
  {
    if (crossbow_hem_shrinkage)
      {
	bow_treenode_set_words_from_new_words (leaf, 0);
	bow_treenode_set_lambdas_from_new_lambdas (leaf, 1);
      }
    else
      {
	bow_treenode_set_words_from_new_words (leaf, 1);
	bow_treenode_set_lambdas_from_new_lambdas (leaf, 0);
      }
  }
/* Perplexity of the training data under the new model. */
pp = exp (-log_prob_of_data / num_data_words);
bow_verbosify (bow_progress, "EM incorporated %d documents; pp=%g\n",
	       docs_added_count, pp);
/* Return the perplexity */
return pp;
}

/* Try to split every parent-of-leaves in the tree into
   CROSSBOW_HEM_BRANCHING_FACTOR hypothesized grandchildren.
   Returns non-zero if any split was made. */
int
crossbow_hem_consider_splitting ()
{
  int grandparents_count;
  treenode *tn, *iterator, **grandparents;
  int ci;
  int num_leaves;
  int did_split = 0;

  /* Make an array of grandparents, then try splitting them.  If you just
     iterate through the tree, then iteration gets messed up by the creation
     of new grandchildren. */
  num_leaves = bow_treenode_leaf_count (crossbow_root);
  grandparents = bow_malloc (num_leaves * sizeof (void*));
  grandparents_count = 0;
  for (iterator = crossbow_root;
       (tn = bow_treenode_iterate_all (&iterator));)
    {
      if (bow_treenode_is_leaf_parent (tn))
	grandparents[grandparents_count++] = tn;
    }
  for (ci = 0; ci < grandparents_count; ci++)
    did_split |= crossbow_hem_hypothesize_grandchildren
      (grandparents[ci], crossbow_hem_branching_factor);
#if 0
  /* Debugging dump of the tree after splitting (disabled). */
  printf (".........................................................\n");
  for (iterator = crossbow_root;
       (tn = bow_treenode_iterate_all (&iterator));)
    {
      printf ("%s %g\n", tn->name, tn->prior);
      if (tn->children_count == 0)
	{
	  bow_treenode_word_probs_print (tn, 5);
	  printf ("\n");
	  bow_treenode_word_leaf_likelihood_ratios_print (tn, 5);
	  //bow_treenode_word_likelihood_ratios_print (tn, 10);
	}
    }
#endif
  bow_free (grandparents);
  return did_split;
}

/* Unsupervised hierarchical clustering by deterministic annealing:
   initialize all documents at the root, then run EM to convergence at each
   temperature, considering leaf splits between temperatures. */
void
crossbow_hem_cluster ()
{
  int di;
  crossbow_doc *doc;
  double pp, old_pp, test_pp;
  treenode *iterator, *tn;
  FILE *classify_fp;
  int iteration;
  char buf[1024];

  bow_random_set_seed();
  bow_treenode_set_lambdas_uniform (crossbow_root);

  /* initialize all data to be at the root */
  for (di = 0; di < crossbow_docs->length; di++)
    {
      int wvi;
      bow_wv *wv = crossbow_wv_at_di (di);
      doc = bow_array_entry_at_index (crossbow_docs, di);
      if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled)
	continue;
      for (wvi = 0; wvi < wv->num_entries; wvi++)
	{
	  crossbow_root->new_words[wv->entry[wvi].wi] += wv->entry[wvi].count;
	  if (crossbow_hem_loo)
	    bow_treenode_add_new_loo_for_di_wvi (crossbow_root,
						 wv->entry[wvi].count,
						 di, wvi, wv->num_entries,
						 crossbow_docs->length);
	}
    }
  crossbow_root->new_prior = 1.0;
  //bow_treenode_set_new_words_from_perturbed_words_all (crossbow_root);
  bow_treenode_set_words_from_new_words_all
    (crossbow_root, 1.0 / crossbow_root->words_capacity);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1.0);

  /* Initialize children of the root */
  if (crossbow_root->children_count == 0)
    crossbow_hem_create_children_for_node (crossbow_root,
					   crossbow_hem_branching_factor);

  /* CROSSBOW_HEM_TEMPERATURE already set */
  iteration = 0;
  /* Annealing schedule: decay the temperature geometrically until it
     reaches CROSSBOW_HEM_TEMPERATURE_END. */
  for ( ; crossbow_hem_temperature >= crossbow_hem_temperature_end;
	crossbow_hem_temperature *= crossbow_hem_temperature_decay)
    {
      bow_verbosify (bow_progress, "TEMPERATURE = %g\n",
		     crossbow_hem_temperature);
      printf ("TEMPERATURE = %g\n", crossbow_hem_temperature);
      /* Always Add hypothesis children here. */
      /* Run EM to convergence. */
      old_pp = FLT_MAX;
      pp = old_pp / 2;
      /* Loop until convergence, i.e. perplexity doesn't change
	 (by more than 2) or the iteration limit is reached. */
      while (ABS (old_pp - pp) > 2
	     && iteration < crossbow_hem_max_num_iterations)
	{
	  printf ("--------------------------------------------------"
		  " Iteration %d\n", iteration);
	  old_pp = pp;
	  pp = crossbow_hem_em_one_iteration ();
	  iteration++;
	  test_pp = crossbow_hem_perplexity (bow_doc_is_test);
	  printf ("train-pp=%f test-pp=%f \n", pp, test_pp);
	  /* Diagnostic dump of every tree node's parameters. */
	  for (iterator = crossbow_root;
	       (tn = bow_treenode_iterate_all (&iterator));)
	    {
	      printf ("%s", tn->name);
	      if (tn->children_count == 0)
		{
		  int ai, ci;
		  printf (" prior=%g lambdas=[ ", tn->prior);
		  for (ai = 0; ai < tn->depth + 2; ai++)
		    printf ("%5.3f ", tn->lambdas[ai]);
		  printf ("]");
		  if (0 && crossbow_classes_count > 1)
		    {
		      printf ("\n classes=[ ");
		      for (ci = 0; ci < crossbow_classes_count; ci++)
			printf ("%5.3f ", tn->classes[ci]);
		      printf ("]");
		    }
		}
	      else
		printf (" KL %g WKL %g",
			bow_treenode_children_kl_div (tn),
			bow_treenode_children_weighted_kl_div (tn));
	      printf ("\n");
	      if (1 || tn->children_count == 0)
		{
		  //bow_treenode_word_likelihood_ratios_print (tn, 10);
		  bow_treenode_word_probs_print (tn, 10);
		  printf ("\n");
		  bow_treenode_word_likelihood_ratios_print (tn, 5);
		  //bow_treenode_word_leaf_likelihood_ratios_print (tn, 5);
		  //bow_treenode_word_leaf_odds_ratios_print (tn, 10);
		}
	    }
	  //crossbow_leaf_document_probs_print (3);
	}
      /* Consider making splits. */
      /* xxx This function should delete leaves that didn't become "real". */
      if (crossbow_hem_consider_splitting ())
	{
	  /* xxx But leaves were just perturbed!!! */
	  /* Output document classifications */
	  sprintf (buf, "crossbow-classifications-%d", iteration);
	  classify_fp = bow_fopen (buf, "w");
	  crossbow_classify_tagged_docs (-1, 1, classify_fp);
	  fflush (classify_fp);
	  fclose (classify_fp);
	  /* Output top words */
	  sprintf (buf, "crossbow-words-%d", iteration);
	  classify_fp = bow_fopen (buf, "w");
	  bow_verbosify (bow_progress, "========= keywords ========\n");
	  bow_treenode_keywords_print_all (crossbow_root, classify_fp);
	  fflush (classify_fp);
	  fclose (classify_fp);
	}
      /* Perturb the leaves */
      bow_treenode_set_new_words_from_perturbed_words_all (crossbow_root, 0.1);
      bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
    }
}

/* Put all documents into the NEW_WORDS distributions. */
void
crossbow_hem_place_labeled_data ()
{
  int di, wvi;
  crossbow_doc *doc;
  treenode *leaf;
  bow_wv *wv;

  /* Clear all previous information. */
  bow_treenode_set_new_words_to_zero_all (crossbow_root);
  bow_treenode_free_loo_all (crossbow_root, crossbow_docs->length);

  /* Initialize the word distributions and LOO entries with the data, and
     initialize lambdas to uniform */
  for (di = 0; di < crossbow_docs->length; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      wv = crossbow_wv_at_di (di);
      if (doc->tag != bow_doc_train)
	continue;
      /* Temporary fix */
      if (strstr (doc->filename, ".include")
	  || strstr (doc->filename, ".exclude"))
	continue;
      /* The document's label is encoded in its filename; find the matching
	 tree node. */
      leaf = bow_treenode_descendant_matching_name (crossbow_root,
						    doc->filename);
      //assert (leaf->children_count == 0);
      leaf->new_prior++;
      /* Deposit the document's words at the labeled node and at every
	 ancestor up to the root. */
      while (leaf)
	{
	  for (wvi = 0; wvi < wv->num_entries; wvi++)
	    {
	      leaf->new_words[wv->entry[wvi].wi] += wv->entry[wvi].count;
	      bow_treenode_add_new_loo_for_di_wvi (leaf,
						   wv->entry[wvi].count,
						   di, wvi, wv->num_entries,
						   crossbow_docs->length);
	    }
	  leaf = leaf->parent;
	}
    }
}

/* Do full EM, without deterministic annealing, or leaf splitting.
   NOTE(review): this function continues beyond the end of this chunk. */
void
crossbow_hem_full_em ()
{
  double pp, old_pp;
  double test_labeled_pp, test_unlabeled_pp;
  double train_labeled_pp, train_unlabeled_pp;
  treenode *iterator, *tn;
  int iteration = 0;
  int old_hem_shrinkage;
#if PRINT_WORD_DISTS
  char prefix[BOW_MAX_WORD_LENGTH];
#endif

  //assert (crossbow_hem_shrinkage);
  //assert (crossbow_hem_loo);
  if (crossbow_hem_garbage_collection)
    {
      /* Add "Misc" children to each parent in the tree */
      bow_treenode_add_misc_child_all (crossbow_root);
    }

  /* If CROSSBOW_HEM_LAMBDAS_FROM_VALIDATION is non-zero, then change X
     percent of the train and unlabeled documents to validation. */
  if (crossbow_hem_lambdas_from_validation)
    {
      int di;
      crossbow_doc *doc;
      int validation_count = 0;
      for (di = 0; di < crossbow_docs->length; di++)
	{
	  doc = bow_array_entry_at_index (crossbow_docs, di);
	  if ((/*doc->tag == bow_doc_train ||*/
	       doc->tag == bow_doc_unlabeled)
	      && (bow_random_double (0.0, 1.0)
		  < crossbow_hem_lambdas_from_validation))
	    {
	      doc->tag = bow_doc_validation;
	      validation_count++;
	    }
	}
      bow_verbosify (bow_progress, "Placed %d document in validation set\n",
		     validation_count);
    }

  /* Initialize the word distributions and LOO entries with the data, and
     initialize lambdas to uniform */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  bow_treenode_set_lambdas_leaf_only_all (crossbow_root);

  /* Baseline evaluation with shrinkage temporarily turned off. */
  printf ("No Shrinkage\n");
  old_hem_shrinkage = crossbow_hem_shrinkage;
  crossbow_hem_shrinkage = 0;
  crossbow_classify_tagged_docs (bow_doc_test, 0, stdout);
  crossbow_hem_shrinkage = old_hem_shrinkage;
  train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train);
  train_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_train);
  test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test);
  test_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_test);
  /* NOTE(review): this format string was split across lines by the
     extraction; rejoined here as a single literal. */
  printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n"
	  " test-unlabeled-pp=%f test-labeled-pp=%f\n",
	  train_unlabeled_pp, train_labeled_pp,
	  test_unlabeled_pp, test_labeled_pp);
  if (crossbow_hem_vertical_word_movement)
    bow_treenode_word_probs_print_all (crossbow_root, 5);
  crossbow_hem_place_labeled_data ();
  if (crossbow_hem_shrinkage)
    bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
  else
    bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);