📄 treenode.c
字号:
pr_wi_given_tn = bow_treenode_pr_wi (tn, wi); pr_wi_given_not_tn = 0; for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); ) { if (leaf != tn) pr_wi_given_not_tn += (bow_treenode_pr_wi (leaf, wi) / (leaf_count-1)); //pr_wi_given_not_tn += leaf->words[wi] / (leaf_count - 1); } if (pr_wi_given_tn == 0) lr = -1; else if (pr_wi_given_not_tn == 0) lr = 1; else lr = (pr_wi_given_tn * log (pr_wi_given_tn / pr_wi_given_not_tn)); //assert (lr < 1); bow_wa_append (wa, wi, lr); } return wa;}/* Return an array of words with their associated odds ratios, calculated relative to all the leaves. */bow_wa *bow_treenode_word_leaf_odds_ratios (treenode *tn){ int wi, leaf_count; bow_wa *wa; double pr_wi_given_tn; double pr_wi_given_not_tn; double lr; treenode *root, *iterator, *leaf; if (tn->children_count != 0) return NULL; root = tn; while (root->parent) root = root->parent; leaf_count = bow_treenode_leaf_count (root); wa = bow_wa_new (tn->words_capacity+2); for (wi = 0; wi < tn->words_capacity; wi++) { pr_wi_given_tn = tn->words[wi]; pr_wi_given_not_tn = 0; for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); ) { if (leaf != tn) pr_wi_given_not_tn += leaf->words[wi] / (leaf_count - 1); } lr = (/* pr_wi_given_tn * */ log ((pr_wi_given_tn * (1 - pr_wi_given_not_tn)) / (pr_wi_given_not_tn * (1 - pr_wi_given_tn)))); bow_wa_append (wa, wi, lr); } return wa;}/* Return an array of words with their associated likelihood ratios, calculated relative to all the leaves. 
*/bow_wa *bow_treenode_word_leaf_mean_ratios (treenode *tn){ int wi, leaf_count; bow_wa *wa; double pr_wi_given_tn; double pr_wi; double lr; treenode *root, *iterator, *leaf; if (tn->children_count != 0) return NULL; root = tn; while (root->parent) root = root->parent; leaf_count = bow_treenode_leaf_count (root); wa = bow_wa_new (tn->words_capacity+2); for (wi = 0; wi < tn->words_capacity; wi++) { pr_wi_given_tn = tn->words[wi]; pr_wi = 0; for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); ) { pr_wi += leaf->words[wi] / leaf_count; } assert (pr_wi > 0); lr = pr_wi_given_tn / pr_wi; bow_wa_append (wa, wi, lr); } return wa;}/* Print the NUM_TO_PRINT words with highest likelihood ratios, calculated relative to its siblings. */voidbow_treenode_word_likelihood_ratios_print (treenode *tn, int num_to_print){ bow_wa *wa; wa = bow_treenode_word_likelihood_ratios (tn); if (wa) { bow_wa_sort (wa); bow_wa_fprintf (wa, stdout, num_to_print); bow_wa_free (wa); }}/* Print the NUM_TO_PRINT words with highest likelihood ratios, calculated relative to all the leaves. */voidbow_treenode_word_leaf_likelihood_ratios_print (treenode *tn, int num_to_print){ bow_wa *wa; wa = bow_treenode_word_leaf_likelihood_ratios (tn); if (wa) { bow_wa_sort (wa); bow_wa_fprintf (wa, stdout, num_to_print); bow_wa_free (wa); }}/* Print the NUM_TO_PRINT words with highest odds ratios, calculated relative to all the leaves. */voidbow_treenode_word_leaf_odds_ratios_print (treenode *tn, int num_to_print){ bow_wa *wa; wa = bow_treenode_word_leaf_odds_ratios (tn); if (wa) { bow_wa_sort (wa); bow_wa_fprintf (wa, stdout, num_to_print); bow_wa_free (wa); }}/* Same as above, for all nodes in the tree. 
*/voidbow_treenode_word_likelihood_ratios_print_all (treenode *tn, int num_to_print){ int ci; printf ("%s\nprior=%g\n", tn->name, tn->prior); bow_treenode_word_likelihood_ratios_print (tn, num_to_print); for (ci = 0; ci < tn->children_count; ci++) bow_treenode_word_likelihood_ratios_print_all (tn->children[ci], num_to_print);}/* Return a bow_wa array of words with their associated probabilities */bow_wa *bow_treenode_word_probs (treenode *tn){ int wi; bow_wa *wa; wa = bow_wa_new (tn->words_capacity+2); for (wi = 0; wi < tn->words_capacity; wi++) bow_wa_append (wa, wi, tn->words[wi]); return wa;}/* Print the NUM_TO_PRINT words with highest probability */voidbow_treenode_word_probs_print (treenode *tn, int num_to_print){ bow_wa *wa; wa = bow_treenode_word_probs (tn); if (wa) { bow_wa_sort (wa); bow_wa_fprintf (wa, stdout, num_to_print); bow_wa_free (wa); }}/* Same as above, for all nodes in the tree. */voidbow_treenode_word_probs_print_all (treenode *tn, int num_to_print){ int ci; printf ("%s\n", tn->name); if (tn->children_count == 0) printf (" prior=%g\n", tn->prior); bow_treenode_word_probs_print (tn, num_to_print); for (ci = 0; ci < tn->children_count; ci++) bow_treenode_word_probs_print_all (tn->children[ci], num_to_print);}/* Print most probable words in one line, and only if parent's WKL is high enough */voidbow_treenode_keywords_print (treenode *tn, FILE *fp){ bow_wa *wa; int wai; //double kldiv; if (tn->parent == NULL) return; //if (bow_treenode_children_weighted_kl_div (tn->parent) < 500) return;#if 0 if ((kldiv = bow_treenode_pair_kl_div (tn, tn->parent)) < 0.5) { fprintf (fp, "alias %s %s\n", tn->name, tn->parent->name); bow_verbosify (bow_progress, "%s kldiv versus parent %g SKIP\n", tn->name, kldiv); return; } else { bow_verbosify (bow_progress, "%s kldiv versus parent %g\n", tn->name, kldiv); } for (ci = 0; ci < tn->ci_in_parent; ci++) { if (((kldiv = bow_treenode_pair_kl_div (tn, tn->parent->children[ci])) < 0.5)) { fprintf (fp, "alias %s %s\n", 
tn->name, tn->parent->children[ci]->name); bow_verbosify (bow_progress, "%s %s kldiv versus sibling %g SKIP\n", tn->name, tn->parent->children[ci]->name, kldiv); return; } else { bow_verbosify (bow_progress, "%s %s kldiv versus sibling %g\n", tn->name, tn->parent->children[ci]->name, kldiv); } }#endif wa = bow_treenode_word_probs (tn); if (wa) { fprintf (fp, "%s %g ", tn->name, bow_treenode_pair_kl_div (tn, tn->parent)); bow_wa_sort (wa); for (wai = 0; wai < 10; wai++) fprintf (fp, "%s ", bow_int2word (wa->entry[wai].wi)); fprintf (fp, "\n"); bow_wa_free (wa); }}/* Same as above, but for TN and all treenodes under TN */voidbow_treenode_keywords_print_all (treenode *tn, FILE *fp){ int ci; bow_treenode_keywords_print (tn, fp); for (ci = 0; ci < tn->children_count; ci++) bow_treenode_keywords_print_all (tn->children[ci], fp);}/* Print the (normalized) probability of word WI in each of the nodes of the tree rooted at ROOT. */voidbow_treenode_normalized_word_prob_all_print (treenode *root, int wi){ int leaf_count; double *nodes, nodes_total; int ni; treenode *iterator, *node; leaf_count = bow_treenode_leaf_count (root); nodes = alloca (sizeof (double) * leaf_count); nodes_total = 0; for (iterator = root, ni = 0; (node=bow_treenode_iterate_all (&iterator)); ni++) { nodes[ni] = node->words[wi]; nodes_total += nodes[ni]; } for (iterator = root, ni = 0; (node=bow_treenode_iterate_all (&iterator)); ni++) printf ("%10f %s\n", nodes[ni] / nodes_total, node->name);}/* Print the word distribution for each leaf to a separate file, each file having prefix FILENAME_PREFIX. Use vertical mixture if SHRINKAGE is non-zero. 
*/voidbow_treenode_print_all_word_probabilities_all (const char *filename_prefix, int shrinkage){ int li, wi; char *s; treenode *iterator, *leaf; char leafname[BOW_MAX_WORD_LENGTH]; char filename[BOW_MAX_WORD_LENGTH]; FILE *fp; double pr_w; bow_verbosify (bow_progress, "Starting word probability printing\n"); for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { strcpy (leafname, leaf->name); /* Convert '/' to '-' */ for (s = leafname; *s; s++) if (*s == '/') *s = '-'; sprintf (filename, "%s-%s", filename_prefix, leafname); fp = bow_fopen (filename, "w"); for (wi = 0; wi < leaf->words_capacity; wi++) { if (shrinkage) pr_w = bow_treenode_pr_wi (leaf, wi); else pr_w = leaf->words[wi]; fprintf (fp, "%f %s\n", pr_w, bow_int2word (wi)); } fclose (fp); }}/* Return the "KL Divergence to the Mean" among the children of TN */doublebow_treenode_children_kl_div (treenode *tn){ double *mean; double kldiv; int wi, ci; if (tn->children_count < 2) return 0; /* Calculate the mean distribution */ mean = bow_malloc (tn->words_capacity * sizeof (double)); for (wi = 0; wi < tn->words_capacity; wi++) { mean[wi] = 0; for (ci = 0; ci < tn->children_count; ci++) mean[wi] += tn->children[ci]->words[wi]; mean[wi] /= tn->children_count; } /* Calculate "KL Divergence to the Mean" for each child. */ kldiv = 0; for (ci = 0; ci < tn->children_count; ci++) { for (wi = 0; wi < tn->words_capacity; wi++) { /* Testing for tn->children[ci]->words[wi] is legitimate. Testing for mean[wi] is a concession to round-off error */ if (tn->children[ci]->words[wi] && mean[wi]) kldiv += (tn->children[ci]->words[wi] * log (tn->children[ci]->words[wi] / mean[wi])); //assert (kldiv < 10); } } bow_free (mean); kldiv /= tn->children_count; return kldiv;}/* Return the weighted "KL Divergence to the mean among the children of TN" multiplied by the number of words of training data in the children. 
*/
double
bow_treenode_children_weighted_kl_div (treenode *tn)
{
  double weight = 0;
  int ci;

  /* The weight is the sum of the children's new_words_normalizer. */
  for (ci = 0; ci < tn->children_count; ci++)
    weight += tn->children[ci]->new_words_normalizer;
  return weight * bow_treenode_children_kl_div (tn);
}

/* Return the "KL Divergence to the mean" between TN1 and TN2: half the
   sum of KL(TN1 || mean) and KL(TN2 || mean), where mean is the
   per-word average of the two distributions.  Word indices run up to
   TN1->words_capacity for both nodes. */
double
bow_treenode_pair_kl_div (treenode *tn1, treenode *tn2)
{
  double mean;
  double kldiv;
  int wi;

  kldiv = 0;
  for (wi = 0; wi < tn1->words_capacity; wi++)
    {
      /* Each mean value is used only for its own word, so it is
         computed on the fly instead of in a heap-allocated array.
         Accumulated in a double, step by step, exactly as before. */
      mean = 0;
      mean += tn1->words[wi];
      mean += tn2->words[wi];
      mean /= 2;

      /* Testing the individual word probabilities is legitimate (a
         zero-probability word contributes nothing).  Testing for the
         mean is a concession to round-off error */
      if (mean)
        {
          if (tn1->words[wi])
            kldiv += tn1->words[wi] * log (tn1->words[wi] / mean);
          if (tn2->words[wi])
            kldiv += tn2->words[wi] * log (tn2->words[wi] / mean);
        }
    }

  kldiv /= 2;
  return kldiv;
}

/* Same as above, but multiply by the number of words in TN1 and TN2
   (the sum of their new_words_normalizer fields). */
double
bow_treenode_pair_weighted_kl_div (treenode *tn1, treenode *tn2)
{
  return ((tn1->new_words_normalizer + tn2->new_words_normalizer)
          * bow_treenode_pair_kl_div (tn1, tn2));
}

/* Return non-zero if any of TN's children are leaves */
int
bow_treenode_is_leaf_parent (treenode *tn)
{
  int ci;

  for (ci = 0; ci < tn->children_count; ci++)
    if (tn->children[ci]->children_count == 0)
      return 1;
  return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -