📄 treenode.c
字号:
/* NOTE(review): the opening brace below continues a function whose
   signature precedes this chunk -- presumably the routine that turns the
   accumulated NEW_LAMBDAS counts into normalized LAMBDAS shrinkage
   weights, with ALPHA a Dirichlet smoothing parameter; confirm against
   the full file.  The lambda arrays hold tn->depth + 2 weights: one for
   each node on the path from TN up to the root (tn->depth + 1 of them)
   plus one for the uniform distribution. */
{
  int ai;
  double total_lambdas_count = 0.0;
#if MISC_STAYS_FLAT
  /* "/Misc/" nodes are pinned to uniform shrinkage weights. */
  if (strstr (tn->name, "/Misc/"))
    {
      bow_treenode_set_lambdas_uniform (tn);
      for (ai = 0; ai < tn->depth + 2; ai++)
        tn->new_lambdas[ai] = 0;
    }
#endif
  /* Calculate the normalizing constant */
  for (ai = 0; ai < tn->depth + 2; ai++)
    total_lambdas_count += tn->new_lambdas[ai];
  total_lambdas_count += alpha * (tn->depth + 2);
  //assert (total_lambdas_count);
  if (total_lambdas_count == 0)
    {
      /* No evidence at all; fall back to a uniform lambda distribution. */
      alpha = 1.0 / (tn->depth + 2);
      total_lambdas_count = 1.0;
    }
  for (ai = 0; ai < tn->depth + 2; ai++)
    {
      assert (tn->new_lambdas[ai] >= 0);
#if 1 || !USE_ACCELERATED_EM
      /* Plain M-step: Dirichlet-smoothed, normalized count. */
      tn->lambdas[ai] = (alpha + tn->new_lambdas[ai]) / total_lambdas_count;
#else
      /* Over-relaxed (accelerated) EM update, clamped at zero. */
      tn->lambdas[ai] = (((1.0 - EM_ACCELERATION) * tn->lambdas[ai])
                         + (EM_ACCELERATION * (alpha + tn->new_lambdas[ai])
                            / total_lambdas_count));
      if (tn->lambdas[ai] < 0)
        tn->lambdas[ai] = 0;
#endif
      assert (tn->lambdas[ai] >= 0);
      assert (tn->lambdas[ai] <= 1);
      /* Zero the accumulator for the next EM iteration. */
      tn->new_lambdas[ai] = 0;
    }
}

/* Set the CLASSES distribution to uniform, allocating space for it
   if necessary */
void
bow_treenode_set_classes_uniform (treenode *tn, int classes_capacity)
{
  int ci;
  if (tn->classes_capacity == 0)
    {
      /* First call on this node: allocate both the current and the
         accumulator arrays. */
      tn->classes_capacity = classes_capacity;
      tn->classes = bow_malloc (classes_capacity * sizeof (double));
      tn->new_classes = bow_malloc (classes_capacity * sizeof (double));
    }
  /* A previously sized node must be re-initialized at the same size. */
  assert (classes_capacity == tn->classes_capacity);
  for (ci = 0; ci < classes_capacity; ci++)
    {
      tn->classes[ci] = 1.0 / classes_capacity;
      tn->new_classes[ci] = 0.0;
    }
}

/* Normalize the NEW_CLASSES distribution, move it into the CLASSES
   array and zero the NEW_CLASSES array.  ALPHA is the parameter for
   the Dirichlet prior. */
void
bow_treenode_set_classes_from_new_classes (treenode *tn, double alpha)
{
  int ci;
  double total_classes_count = 0;
  assert (tn->classes_capacity > 0);
  for (ci = 0; ci < tn->classes_capacity; ci++)
    total_classes_count += tn->new_classes[ci];
  total_classes_count += alpha * tn->classes_capacity;
  if (total_classes_count == 0)
    {
      /* No evidence; fall back to a uniform class distribution. */
      alpha = 1.0 / tn->classes_capacity;
      total_classes_count = 1.0;
    }
  for (ci = 0; ci < tn->classes_capacity; ci++)
    {
      tn->classes[ci] = (alpha + tn->new_classes[ci]) / total_classes_count;
      tn->new_classes[ci] = 0;
    }
}

/* Return the log-probability of node TN's WORD distribution having
   produced the word vector WV.  Uses only TN's local word multinomial
   (no shrinkage); a zero-probability word yields log(0) = -inf. */
double
bow_treenode_log_local_prob_of_wv (treenode *tn, bow_wv *wv)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += wv->entry[wvi].count * log (tn->words[wv->entry[wvi].wi]);
  return log_prob;
}

/* Return the expected complete log likelihood of node TN's word
   distribution having produced the word vector WV. */
double
bow_treenode_complete_log_prob_of_wv (treenode *tn, bow_wv *wv)
{
  int wvi, ai;
  double log_prob = 0;
  treenode *ancestor;
  double *ancestor_membership;
  double ancestor_membership_normalizer;
  /* One membership weight per ancestor (including TN) plus one for the
     uniform component; stack-allocated, reclaimed on return. */
  ancestor_membership = alloca ((tn->depth + 2) * sizeof (double));
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      /* Posterior over which ancestor (or the uniform component)
         generated this word occurrence. */
      ancestor_membership_normalizer = 0;
      for (ancestor = tn, ai = 0; ancestor; ancestor = ancestor->parent, ai++)
        {
          ancestor_membership[ai] = (tn->lambdas[ai]
                                     * ancestor->words[wv->entry[wvi].wi]);
          ancestor_membership_normalizer += ancestor_membership[ai];
        }
      /* After the loop, AI indexes the uniform-distribution slot. */
      ancestor_membership[ai] = tn->lambdas[ai] * 1.0 / tn->words_capacity;
      ancestor_membership_normalizer += ancestor_membership[ai];
      /* Accumulate the expected log-likelihood under that posterior. */
      for (ancestor = tn, ai = 0; ancestor; ancestor = ancestor->parent, ai++)
        {
          log_prob += (wv->entry[wvi].count
                       * (ancestor_membership[ai]
                          / ancestor_membership_normalizer)
                       * log (ancestor->words[wv->entry[wvi].wi]));
        }
      log_prob += (wv->entry[wvi].count
                   * (ancestor_membership[ai]
                      / ancestor_membership_normalizer)
                   * log (1.0 / tn->words_capacity));
    }
  /* NaN check: NaN is the only value that compares unequal to itself. */
  assert (log_prob == log_prob);
  return log_prob;
}

/* Return the probability of word WI in LEAF, using the hierarchical
   mixture */
double
bow_treenode_pr_wi (treenode *node, int wi)
{
  int i;
  treenode *ancestor;
  double ret = 0;
  if (node->children_count == 0)
    {
      /* NODE is a leaf.  Return the vertical mixture using shrinkage */
      for (ancestor = node, i = 0; ancestor; ancestor = ancestor->parent, i++)
        ret += node->lambdas[i] * ancestor->words[wi];
      /* Add in the uniform distribution */
      ret += node->lambdas[i] / node->words_capacity;
    }
  else
    {
      /* NODE is an interior node of the tree.  Return a weighted
         average of the leaves under NODE. */
      double prior_sum = 0;
      treenode *iterator, *leaf;
      for (iterator = node;
           (leaf = bow_treenode_iterate_leaves_under_node (&iterator, node)); )
        {
          prior_sum += leaf->prior;
          ret += leaf->prior * bow_treenode_pr_wi (leaf, wi);
        }
      ret /= prior_sum;
    }
  return ret;
}

/* Return the probability of word WI in node TN, but with the
   probability mass of document LOO_DI removed. */
double
bow_treenode_pr_wi_loo_local (treenode *tn, int wi, int loo_di, int loo_wvi)
{
  double ret;
  double denominator;
  /* If there is no LOO information, return the non-LOO estimate. */
  if (!(tn->di_loo) || !(tn->di_wvi_loo) || !(tn->di_wvi_loo[loo_di]))
    ret = tn->words[wi];
  else
    {
#if 0
      /* Diagnostic code retained for chasing round-off problems. */
      double foo1 = ((tn->words[wi] * tn->new_words_normalizer)
                     - tn->di_wvi_loo[loo_di][loo_wvi]);
      double foo2 = (tn->new_words_normalizer - tn->di_loo[loo_di]);
      if (foo1 < -1e-14)
        bow_error ("Foo1 %g orig %.18f minus %.18f", foo1,
                   (tn->words[wi] * tn->new_words_normalizer),
                   tn->di_wvi_loo[loo_di][loo_wvi]);
      assert (foo2 >= -1e-14);
#endif
      /* Remove the held-out document's total mass from the normalizer... */
      denominator = (tn->new_words_normalizer - tn->di_loo[loo_di]);
      assert (denominator >= 0);
      /* Make sure it is non-negative, but account for round-off error */
      assert ((tn->words[wi] * tn->new_words_normalizer)
              - tn->di_wvi_loo[loo_di][loo_wvi] >= -1e-7);
      if (denominator)
        /* ...and this word's per-document count from the numerator. */
        ret = (((tn->words[wi] * tn->new_words_normalizer)
                - tn->di_wvi_loo[loo_di][loo_wvi])
               / denominator);
      else
        /* Without this document, there is no training data for this
           class.  Return a uniform distribution. */
        ret = 1.0 / tn->words_capacity;
      if (ret < 0)
        {
          /* Account for roundoff error */
          assert (ret > -1e-14);
          ret = 0;
        }
    }
  assert (ret >= 0);
  assert (ret <= 1);
  assert (tn);			/* to keep tn available in debugger */
  assert (loo_di >= 0 && loo_wvi >= 0);
  return ret;
}

/* Return the probability of word WI in LEAF, using the hierarchical
   mixture, but with the probability mass of document LOO_DI removed. */
double
bow_treenode_pr_wi_loo (treenode *tn, int wi, int loo_di, int loo_wvi)
{
  int i;
  treenode *ancestor;
  double ret = 0;
  if (tn->children_count == 0)
    {
      /* TN is a leaf.  Return the vertical mixture using shrinkage */
      for (ancestor = tn, i = 0; ancestor; ancestor = ancestor->parent, i++)
        ret += (tn->lambdas[i]
                * bow_treenode_pr_wi_loo_local (ancestor, wi,
                                                loo_di, loo_wvi));
      /* Add in the uniform distribution */
      ret += tn->lambdas[i] / tn->words_capacity;
    }
  else
    {
      /* TN is an interior node of the tree.  Return a weighted average
         of the leaves under TN. */
      double prior_sum = 0;
      treenode *iterator, *leaf;
      for (iterator = tn;
           (leaf = bow_treenode_iterate_leaves_under_node (&iterator, tn)); )
        {
          prior_sum += leaf->prior;
          ret += leaf->prior * bow_treenode_pr_wi_loo (leaf, wi,
                                                       loo_di, loo_wvi);
        }
      ret /= prior_sum;
    }
  assert (ret > 0);
  assert (ret < 1);
  return ret;
}

/* Return the prior probability of TN being in a path selected for
   generating a document. */
double
bow_treenode_prior (treenode *tn)
{
  if (tn->children_count != 0)
    {
      /* TN is an interior node of the tree; sum the priors of the
         leaves under it. */
      treenode *iterator, *leaf;
      double ret = 0;
      for (iterator = tn;
           (leaf = bow_treenode_iterate_leaves_under_node (&iterator, tn)); )
        {
          ret += leaf->prior;
        }
      return ret;
    }
  /* TN is a leaf; simply return its prior. */
  return tn->prior;
}

/* Return the log-probability of node TN's WORD distribution having
   produced the word vector WV. */
double
bow_treenode_log_prob_of_wv (treenode *tn, bow_wv *wv)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += (wv->entry[wvi].count
                 * log (bow_treenode_pr_wi (tn, wv->entry[wvi].wi)));
  return log_prob;
}

/* Same as above, but return a probability instead of a
   log-probability */
double
bow_treenode_prob_of_wv (treenode *tn, bow_wv *wv)
{
  return exp (bow_treenode_log_prob_of_wv (tn, wv));
}

/* Return the log-probability of node TN's WORD distribution having
   produced the word vector WV, but with document DI removed from TN's
   WORD distribution. */
double
bow_treenode_log_prob_of_wv_loo (treenode *tn, bow_wv *wv, int loo_di)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += (wv->entry[wvi].count
                 * log (bow_treenode_pr_wi_loo (tn, wv->entry[wvi].wi,
                                                loo_di, wvi)));
  assert (log_prob < 0);
  return log_prob;
}

/* Return the local log-probability of node TN's WORD distribution
   having produced the word vector WV, but with document DI removed
   from TN's WORD distribution. */
double
bow_treenode_log_local_prob_of_wv_loo (treenode *tn, bow_wv *wv, int loo_di)
{
  int wvi;
  double log_prob = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    log_prob += (wv->entry[wvi].count
                 * log (bow_treenode_pr_wi_loo_local (tn, wv->entry[wvi].wi,
                                                      loo_di, wvi)));
  assert (log_prob < 0);
  return log_prob;
}

/* Return the number of leaves under (and including) TN */
int
bow_treenode_leaf_count (treenode *tn)
{
  if (tn->children_count == 0)
    return 1;
  else
    {
      int ci, lc = 0;
      for (ci = 0; ci < tn->children_count; ci++)
        lc += bow_treenode_leaf_count (tn->children[ci]);
      return lc;
    }
}

/* Return the number of tree nodes under (and including) TN */
int
bow_treenode_node_count (treenode *tn)
{
  if (tn->children_count == 0)
    return 1;
  else
    {
      int ci, lc = 0;
      for (ci = 0; ci < tn->children_count; ci++)
        lc += bow_treenode_node_count (tn->children[ci]);
      /* Plus one for TN itself. */
      return lc + 1;
    }
}

/* Return an array of words with their associated likelihood ratios,
   calculated relative to its siblings. */
bow_wa *
bow_treenode_word_likelihood_ratios (treenode *tn)
{
  int wi, ci;
  bow_wa *wa;
  double pr_wi_given_tn;
  double pr_wi_given_not_tn;
  double lr;
  /* The root has no siblings to compare against. */
  if (tn->parent == NULL)
    return NULL;
  wa = bow_wa_new (tn->words_capacity+2);
  for (wi = 0; wi < tn->words_capacity; wi++)
    {
      pr_wi_given_tn = bow_treenode_pr_wi (tn, wi);
      /* Average this word's probability over TN's siblings. */
      pr_wi_given_not_tn = 0;
      for (ci = 0; ci < tn->parent->children_count; ci++)
        {
          if (ci != tn->ci_in_parent)
            pr_wi_given_not_tn +=
              (bow_treenode_pr_wi (tn->parent->children[ci], wi)
               / (tn->parent->children_count - 1));
        }
      /* Sentinel scores when either side is zero; otherwise a
         probability-weighted log-likelihood ratio. */
      if (pr_wi_given_tn == 0)
        lr = -1;
      else if (pr_wi_given_not_tn == 0)
        lr = 1;
      else
        lr = (pr_wi_given_tn * log (pr_wi_given_tn / pr_wi_given_not_tn));
      //assert (lr < 1);
      bow_wa_append (wa, wi, lr);
    }
  return wa;
}

/* Return an array of words with their associated likelihood ratios,
   calculated relative to all the leaves. */
bow_wa *
bow_treenode_word_leaf_likelihood_ratios (treenode *tn)
{
  int wi, leaf_count;
  bow_wa *wa;
  double pr_wi_given_tn;
  double pr_wi_given_not_tn;
  double lr;
  treenode *root, *iterator, *leaf;
  /* Only defined for leaves. */
  if (tn->children_count != 0)
    return NULL;
  /* Walk up to the root so all leaves of the tree can be enumerated. */
  root = tn;
  while (root->parent)
    root = root->parent;
  leaf_count = bow_treenode_leaf_count (root);
  wa = bow_wa_new (tn->words_capacity+2);
  for (wi = 0; wi < tn->words_capacity; wi++)
    {
      /* NOTE(review): the remainder of this function body is truncated
         in this chunk of the file. */
      //pr_wi_given_tn = tn->words[wi];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -