📄 hem.c
字号:
/* Initialize the lambdas */#if SHRINK_WITH_UNIFORM_ONLY /* Set the lambdas to use the uniform and the leaf, and nothing else */ for (iterator = crossbow_root; (tn = bow_treenode_iterate_leaves (&iterator));) { int li; for (li = 0; li < tn->depth + 2; li++) { if (li == 0 || li == tn->depth+1) tn->lambdas[li] = 0.5; else tn->lambdas[li] = 0; } }#elif 1 if (crossbow_hem_shrinkage) bow_treenode_set_lambdas_uniform_all (crossbow_root); else bow_treenode_set_lambdas_leaf_only_all (crossbow_root); #else /* Just for fun see what happens when we initialize more data in leaves */ for (iterator = crossbow_root; (tn = bow_treenode_iterate_leaves (&iterator));) { int li; for (li = 0; li < tn->depth + 2; li++) { if (li == 0) tn->lambdas[li] = 0.5; else tn->lambdas[li] = 0.5 / (tn->depth + 1); } }#endif //bow_treenode_word_probs_print_all (crossbow_root, 5); if (crossbow_hem_pseudo_labeled) bow_tag_change_tags (crossbow_docs, bow_doc_train, bow_doc_unlabeled); /* Run EM to convergence. */ old_pp = FLT_MAX; pp = -1; crossbow_hem_temperature = 1; /* Loop until convergence, i.e. perplexity doesn't change */ while (/* ABS (old_pp - pp) > 0.1 && */ iteration < crossbow_hem_max_num_iterations) { printf ("--------------------------------------------------" " Iteration %d\n", iteration); /* Output the percent correct, and various perplexities. 
*/ crossbow_classify_tagged_docs (bow_doc_test, 0, stdout); train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train); train_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_train); test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test); test_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_test); printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n" " test-unlabeled-pp=%f test-labeled-pp=%f\n", train_unlabeled_pp, train_labeled_pp, test_unlabeled_pp, test_labeled_pp);#if PRINT_WORD_DISTS sprintf (prefix, "word-dists/em%d-%d", iteration, bow_random_seed); bow_treenode_print_all_word_probabilities_all (prefix, 1);#endif for (iterator = crossbow_root; (tn = bow_treenode_iterate_all (&iterator));) { printf ("%s", tn->name); if (tn->children_count == 0) { int ai; printf ("\n lambdas=[ "); for (ai = 0; ai < tn->depth + 2; ai++) printf ("%5.3f ", tn->lambdas[ai]); printf ("]"); } printf ("\n"); if (1 || tn->children_count == 0) { printf ("prior=%g\n", tn->prior); //bow_treenode_word_likelihood_ratios_print (tn, 10); //printf ("\n"); if (crossbow_hem_vertical_word_movement) bow_treenode_word_probs_print (tn, 5); //bow_treenode_word_likelihood_ratios_print (tn, 5); //bow_treenode_word_leaf_likelihood_ratios_print (tn, 5); //bow_treenode_word_leaf_odds_ratios_print (tn, 10); } } old_pp = pp; pp = crossbow_hem_em_one_iteration (); if (iteration % 2 == 0 && crossbow_hem_incremental_labeling) crossbow_hem_label_most_confident (); iteration++; }}/* If we replace the loss function L= sum_i (tilde{p}_i - p_i)^2with LL = sum_i (tilde{p}_i - p_i)^2/ (p_i (1-p_i) )then we get a loss function which is still tractable but is moresensitive to errors for small probabilities.If I repeat the calucations I get that lambda should be: lambda = (t/n) / ( (t/n) + B)where B = sum_i (u_i -p_i)^2 /( p_i (1-p_i) )(the sum is over the vocabulary). Here, t is the vocabulary size. 
*/

/* When set, use the log-loss-weighted lambda formula from the comment
   above; otherwise use the plain squared-error formula.  */
#define LOG_LOSS 1

/* Compute a closed-form shrinkage weight LAMBDA for node TN (rather
   than fitting it by EM), per the derivation in the comment above, then
   recurse into TN's children.  For the root (or under
   SHRINK_WITH_UNIFORM_ONLY) the "parent" distribution shrunk toward is
   the uniform distribution; for interior nodes and leaves it is the
   parent's ancestor-mixture distribution.  Writes TN->lambdas[] and
   prints diagnostics to stdout.  */
void
crossbow_hem_fienberg_treenode (treenode *tn)
{
  double u;                     /* "parent" probability of word WI */
  double numerator;             /* running sum of p*(1-p) */
  double wi_err;                /* per-word error u - p */
  double sq_err;                /* running sum of (u - p)^2 */
  double n;                     /* sample size */
  double lambda;                /* weight given to the parent/uniform */
  treenode *ancestor, *node;
  int wi, i;
  double b;                     /* B = sum (u-p)^2 / (p*(1-p)) */
  double t;                     /* vocabulary size */

  /* Sample size = Total number of word occurrences. */
  n = tn->new_words_normalizer;
  t = tn->words_capacity;
  numerator = sq_err = b = 0;
#if SHRINK_WITH_UNIFORM_ONLY
  /* Interior nodes get all-zero lambdas; only leaves (and the root,
     below) mix with the uniform distribution.  */
  if (tn->children_count != 0)
    {
      for (i = 0; i < tn->depth + 2; i++)
	tn->lambdas[i] = 0;
      goto do_children;
    }
#endif
  if (SHRINK_WITH_UNIFORM_ONLY || tn->parent == NULL)
    {
      /* Calculating lambda for the root */
      for (wi = 0; wi < tn->words_capacity; wi++)
	{
	  /* Parent word distribution is the uniform distribution */
	  u = 1.0 / tn->words_capacity;
	  numerator += tn->words[wi] * (1.0 - tn->words[wi]);
	  wi_err = u - tn->words[wi];
	  sq_err += wi_err * wi_err;
	  /* NOTE(review): divides by p*(1-p); a word probability of
	     exactly 0 or 1 would divide by zero here — presumably
	     smoothing guarantees 0 < p < 1; confirm.  */
	  b += ((wi_err * wi_err)
		/ (tn->words[wi] * (1.0 - tn->words[wi])));
	}
      printf (" n = %d sum p*(1-p) = %f squared error = %f b = %f\n",
	      (int)n, numerator, sq_err, b);
#if LOG_LOSS
      lambda = (t/n) / ((t/n) + b);
#else
      lambda = (1.0/n) * (numerator / (sq_err + (1.0/n) * numerator));
#endif
#if SHRINK_WITH_UNIFORM_ONLY
      /* lambdas[0] is the local estimate, lambdas[depth+1] the uniform;
	 everything between (the ancestors) gets zero weight.  */
      tn->lambdas[0] = 1.0 - lambda;
      for (i = 1; i < tn->depth + 1; i++)
	tn->lambdas[i] = 0;
      tn->lambdas[tn->depth+1] = lambda;
#else
      /* Root: depth 0, so only two entries — local and uniform.  */
      tn->lambdas[1] = lambda;
      tn->lambdas[0] = 1.0 - lambda;
#endif
    }
  else
    {
      /* Calculating lambda for an interior node or leaf */
      for (wi = 0; wi < tn->words_capacity; wi++)
	{
	  /* Calculate parent word distribution as a mixture */
	  u = 0;
	  node = tn->parent;
	  for (ancestor = node, i = 0; ancestor;
	       ancestor = ancestor->parent, i++)
	    u += node->lambdas[i] * ancestor->words[wi];
	  /* Add in the uniform distribution */
	  u += node->lambdas[i] / node->words_capacity;
	  numerator += tn->words[wi] * (1.0 - tn->words[wi]);
	  wi_err = u - tn->words[wi];
	  sq_err += wi_err * wi_err;
	  /* NOTE(review): same potential divide-by-zero as above when
	     tn->words[wi] is exactly 0 or 1.  */
	  b += ((wi_err * wi_err)
		/ (tn->words[wi] * (1.0 - tn->words[wi])));
	  if (0 && wi % 1000 == 0)  /* disabled debugging trace */
	    printf ("n %f s %f\n", numerator, sq_err);
	}
      printf (" n = %d sum p*(1-p) = %f "
	      "squared error = %f b = %f\n",
	      (int)n, numerator, sq_err, b);
#if LOG_LOSS
      lambda = (t/n) / ((t/n) + b);
#else
      lambda = (1.0/n) * (numerator / (sq_err + (1.0/n) * numerator));
#endif
      /* Distribute the parent's share LAMBDA through the parent's own
	 mixture weights, shifted down one level.  */
      tn->lambdas[0] = 1.0 - lambda;
      for (i = 1; i < tn->depth + 2; i++)
	tn->lambdas[i] = lambda * tn->parent->lambdas[i-1];
    }
  bow_verbosify (bow_progress,
		 "%20s\n local_lambda=%f parent_lambda=%f\n",
		 tn->name, 1.0 - lambda, lambda);
#if SHRINK_WITH_UNIFORM_ONLY
 do_children:
#endif
  /* Recurse over the children, depth-first.  */
  for (i = 0; i < tn->children_count; i++)
    crossbow_hem_fienberg_treenode (tn->children[i]);
}

/* Train by placing the labeled data in the tree, setting each node's
   shrinkage lambdas with the closed-form estimator above, then printing
   classification accuracy and train/test perplexities.  Finally sweeps
   lambda over fixed constants 0.0..1.0 in 0.05 steps, re-evaluating at
   each, for comparison.  */
void
crossbow_hem_fienberg ()
{
  treenode *iterator, *tn;
  double test_labeled_pp, test_unlabeled_pp;
  double train_labeled_pp, train_unlabeled_pp;
#if PRINT_WORD_DISTS
  /* NOTE(review): PREFIX is also used inside the `#if 0' blocks below;
     enabling those without PRINT_WORD_DISTS would not compile.  */
  char prefix[BOW_MAX_WORD_LENGTH];
#endif
  double lambda;

#if 0
  /* Print the word distribution of all the data, then exit. */
  bow_set_all_docs_untagged (crossbow_docs);
  bow_set_doc_types_of_remaining (crossbow_docs, bow_doc_train);
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 0);
  bow_treenode_set_lambdas_uniform_all (crossbow_root);
  sprintf (prefix, "word-dists/all-mle");
  bow_treenode_print_all_word_probabilities_all (prefix, 0);
  sprintf (prefix, "word-dists/all-uniform");
  bow_treenode_print_all_word_probabilities_all (prefix, 1);
  exit (0);
#endif

#if 0
  /* Initialize the word distributions and LOO entries with the data
     and initialize lambdas to use local estimates only */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  bow_treenode_set_lambdas_leaf_only_all (crossbow_root);
  printf ("\n\nNo Shrinkage\n");
  crossbow_classify_tagged_docs (bow_doc_test, 0, 0, stdout);
#endif

  /* Place the labeled data in the tree and set each node's lambdas by
     the closed-form estimator.  */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 1);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 1);
  crossbow_hem_fienberg_treenode (crossbow_root);

  /* Print the tree */
  for (iterator = crossbow_root;
       (tn = bow_treenode_iterate_all (&iterator));)
    {
      int ai;
      printf ("%s", tn->name);
      printf (" prior=%g lambdas=[ ", tn->prior);
      for (ai = 0; ai < tn->depth + 2; ai++)
	printf ("%5.3f ", tn->lambdas[ai]);
      printf ("]\n");
    }

  printf ("\n\nFienberg\n");
#if PRINT_WORD_DISTS
  sprintf (prefix, "word-dists/fienberg-%d", bow_random_seed);
  bow_treenode_print_all_word_probabilities_all (prefix, 1);
  sprintf (prefix, "word-dists/map-%d", bow_random_seed);
  bow_treenode_print_all_word_probabilities_all (prefix, 0);
#endif
  /* Report accuracy and perplexity under the estimated lambdas.  */
  crossbow_classify_tagged_docs (bow_doc_test, 0, stdout);
  train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train);
  train_unlabeled_pp=crossbow_hem_unlabeled_perplexity (bow_doc_is_train);
  test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test);
  test_unlabeled_pp = crossbow_hem_unlabeled_perplexity (bow_doc_is_test);
  printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n"
	  "test-unlabeled-pp=%f test-labeled-pp=%f\n",
	  train_unlabeled_pp, train_labeled_pp,
	  test_unlabeled_pp, test_labeled_pp);

#if 1
  /* Set lambdas several different constants and test */
  crossbow_hem_place_labeled_data ();
  bow_treenode_set_words_from_new_words_all (crossbow_root, 0);
  bow_treenode_set_leaf_prior_from_new_prior_all (crossbow_root, 0);
  for (lambda = 0.0; lambda < 1.01; lambda += 0.05)
    {
      printf ("\nFixed local_lambda=%f uniform_lambda=%f\n",
	      1.0 - lambda, lambda);
      /* Give every node weight (1-lambda) on its local estimate,
	 lambda on the uniform, zero on the ancestors.  */
      for (iterator = crossbow_root;
	   (tn = bow_treenode_iterate_all (&iterator));)
	{
	  int ai;
	  for (ai = 0; ai < tn->depth + 2; ai++)
	    {
	      if (ai == 0)
		tn->lambdas[ai] = 1.0 - lambda;
	      else if (ai == tn->depth + 1)
		tn->lambdas[ai] = lambda;
	      else
		tn->lambdas[ai] = 0;
	    }
	}
      crossbow_classify_tagged_docs (bow_doc_test, 0, stdout);
      train_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_train);
      train_unlabeled_pp = crossbow_hem_unlabeled_perplexity
	(bow_doc_is_train);
      test_labeled_pp = crossbow_hem_labeled_perplexity (bow_doc_is_test);
      test_unlabeled_pp = crossbow_hem_unlabeled_perplexity
	(bow_doc_is_test);
      printf ("train-unlabeled-pp=%f train-labeled-pp=%f\n"
	      "test-unlabeled-pp=%f test-labeled-pp=%f\n",
	      train_unlabeled_pp, train_labeled_pp,
	      test_unlabeled_pp, test_labeled_pp);
    }
#endif
}

/* Defined elsewhere in crossbow; classifies one document.  */
extern int crossbow_classify_doc (crossbow_doc *doc, int verbose,
				  FILE *out);

/* Method tables registered below.  Field order follows the
   crossbow_method struct (declared elsewhere); from usage here the
   slots appear to be { name, ?, train, cluster, classify } —
   confirm against the crossbow_method declaration.  */
crossbow_method hem_cluster_method =
{
  "hem-cluster",
  NULL,
  NULL,
  crossbow_hem_cluster,
  crossbow_classify_doc,
};

crossbow_method hem_classify_method =
{
  "hem-classify",
  NULL,
  crossbow_hem_full_em,
  NULL,
  crossbow_classify_doc,
};

crossbow_method hem_fienberg_method =
{
  "fienberg-classify",
  NULL,
  crossbow_hem_fienberg,
  NULL,
  crossbow_classify_doc,
};

/* Run automatically before main() (GCC constructor attribute):
   register the three methods above and this module's argp options.  */
void _register_method_hem () __attribute__ ((constructor));
void _register_method_hem ()
{
  bow_method_register_with_name ((bow_method*)&hem_cluster_method,
				 "hem-cluster",
				 sizeof (crossbow_method),
				 NULL);
  bow_method_register_with_name ((bow_method*)&hem_classify_method,
				 "hem-classify",
				 sizeof (crossbow_method),
				 NULL);
  bow_method_register_with_name ((bow_method*)&hem_fienberg_method,
				 "fienberg-classify",
				 sizeof (crossbow_method),
				 NULL);
  bow_argp_add_child (&crossbow_hem_argp_child);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -