      tn->new_words[i] = 0;
    }

  /* Read the prior */
  bow_fread_double (&(tn->prior), fp);
  bow_fread_double (&(tn->new_prior), fp);

  /* Read the lambda mixture weights */
  bow_fread_int (&(tn->depth), fp);
  tn->lambdas = bow_malloc ((tn->depth + 2) * sizeof (double));
  tn->new_lambdas = bow_malloc ((tn->depth + 2) * sizeof (double));
  for (i = 0; i < tn->depth + 2; i++)
    {
      bow_fread_double (&(tn->lambdas[i]), fp);
      bow_fread_double (&(tn->new_lambdas[i]), fp);
    }

  /* Read in the class distribution */
  bow_fread_int (&(tn->classes_capacity), fp);
  if (tn->classes_capacity)
    {
      tn->classes = bow_malloc (tn->classes_capacity * sizeof (double));
      tn->new_classes = bow_malloc (tn->classes_capacity * sizeof (double));
      for (i = 0; i < tn->classes_capacity; i++)
        {
          bow_fread_double (&(tn->classes[i]), fp);
          tn->new_classes[i] = 0;
        }
    }
  else
    tn->classes = tn->new_classes = NULL;

  /* Read the children treenodes */
  bow_fread_int (&(tn->children_count), fp);
  bow_fread_int (&(tn->children_capacity), fp);
  tn->children = bow_malloc (tn->children_capacity * sizeof (void*));
  bow_fread_int (&(tn->ci_in_parent), fp);
  for (i = 0; i < tn->children_count; i++)
    {
      tn->children[i] = bow_treenode_new_from_fp (fp);
      tn->children[i]->parent = tn;
      assert (tn->children[i]->ci_in_parent == i);
    }

  /* Initialize the DI_WI_NEW_WORDS later, only if requested. */
  tn->di_loo = NULL;
  tn->di_wvi_loo = NULL;
  tn->new_di_loo = NULL;
  tn->new_di_wvi_loo = NULL;

  //bow_verbosify (bow_progress, "Read treenode %s\n", tn->name);
  return tn;
}

/* Set all of TN's ancestor mixture weights, LAMBDAS, to equal values. */
void
bow_treenode_set_lambdas_uniform (treenode *tn)
{
  int i;
  double lambda = 1.0 / (tn->depth + 2);

  for (i = 0; i < tn->depth + 2; i++)
    tn->lambdas[i] = lambda;
}

/* Same as above, but for all leaves in the tree. */
void
bow_treenode_set_lambdas_uniform_all (treenode *tn)
{
  treenode *iterator, *leaf;

  assert (tn->parent == NULL);
  for (iterator = tn; (leaf = bow_treenode_iterate_leaves (&iterator)); )
    bow_treenode_set_lambdas_uniform (leaf);
}

/* Set TN's mixture weights, LAMBDAS, to use only the estimates. */
void
bow_treenode_set_lambdas_leaf_only (treenode *tn)
{
  int i;

  tn->lambdas[0] = 1;
  for (i = 1; i < tn->depth + 2; i++)
    tn->lambdas[i] = 0;
}

/* Same as above, but for all leaves in the tree. */
void
bow_treenode_set_lambdas_leaf_only_all (treenode *tn)
{
  treenode *iterator, *leaf;

  assert (tn->parent == NULL);
  for (iterator = tn; (leaf = bow_treenode_iterate_leaves (&iterator)); )
    bow_treenode_set_lambdas_leaf_only (leaf);
}
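/* Illustrative sketch, not part of the original file: one way the loader
   and the lambda initializers above might be combined.  The FILE* argument
   and the decision to start from uniform mixture weights are assumptions
   for the example; only the bow_treenode_* calls are taken from this
   file. */
static treenode *
example_read_tree_with_uniform_lambdas (FILE *fp)
{
  treenode *root = bow_treenode_new_from_fp (fp);

  /* Each node carries DEPTH+2 mixture weights (LAMBDAS), presumably one
     for the node's own estimate, one per ancestor, and one for a uniform
     distribution; spread them evenly at every leaf. */
  bow_treenode_set_lambdas_uniform_all (root);
  return root;
}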
/* Add WEIGHT to treenode TN's record of how much probability mass
   document DI contributed to TN's NEW_WORDS for the word at DI's WVI'th
   word.  This mass can later be subtracted to do leave-one-out
   calculations.  DI_WV_NUM_ENTRIES-1 is the maximum WVI that can be
   expected for DI; DI_COUNT-1 is the maximum DI that can be expected;
   both are used to know how much space to allocate. */
void
bow_treenode_add_new_loo_for_di_wvi (treenode *tn, double weight,
                                     int di, int wvi,
                                     int di_wv_num_entries, int di_count)
{
  int i;

  if (tn->new_di_loo == NULL)
    {
      tn->new_di_loo = bow_malloc (di_count * sizeof (double));
      for (i = 0; i < di_count; i++)
        tn->new_di_loo[i] = 0;
    }
  if (tn->new_di_wvi_loo == NULL)
    {
      tn->new_di_wvi_loo = bow_malloc (di_count * sizeof (void*));
      for (i = 0; i < di_count; i++)
        tn->new_di_wvi_loo[i] = NULL;
    }
  if (tn->new_di_wvi_loo[di] == NULL)
    {
      tn->new_di_wvi_loo[di] = bow_malloc (di_wv_num_entries * sizeof (double));
      for (i = 0; i < di_wv_num_entries; i++)
        tn->new_di_wvi_loo[di][i] = 0;
    }
  tn->new_di_loo[di] += weight;
  tn->new_di_wvi_loo[di][wvi] += weight;
}

/* Clear all LOO info for treenode TN */
void
bow_treenode_free_loo (treenode *tn, int di_count)
{
  int i;

  /* For now, clear by freeing */
  if (tn->di_loo)
    {
      bow_free (tn->di_loo);
      tn->di_loo = NULL;
    }
  if (tn->di_wvi_loo)
    {
      for (i = 0; i < di_count; i++)
        {
          if (tn->di_wvi_loo[i])
            bow_free (tn->di_wvi_loo[i]);
        }
      bow_free (tn->di_wvi_loo);
      tn->di_wvi_loo = NULL;
    }
}

/* Same as above, over all nodes of the tree. */
void
bow_treenode_free_loo_all (treenode *root, int di_count)
{
  int ci;

  bow_treenode_free_loo (root, di_count);
  for (ci = 0; ci < root->children_count; ci++)
    bow_treenode_free_loo_all (root->children[ci], di_count);
}

/* Clear all LOO and NEW_LOO info for treenode TN */
void
bow_treenode_free_loo_and_new_loo (treenode *tn, int di_count)
{
  int i;

  if (tn->di_loo)
    {
      bow_free (tn->di_loo);
      tn->di_loo = NULL;
    }
  if (tn->new_di_loo)
    {
      bow_free (tn->new_di_loo);
      tn->new_di_loo = NULL;
    }
  if (tn->di_wvi_loo)
    {
      for (i = 0; i < di_count; i++)
        {
          if (tn->di_wvi_loo[i])
            bow_free (tn->di_wvi_loo[i]);
        }
      bow_free (tn->di_wvi_loo);
      tn->di_wvi_loo = NULL;
    }
  if (tn->new_di_wvi_loo)
    {
      for (i = 0; i < di_count; i++)
        {
          if (tn->new_di_wvi_loo[i])
            bow_free (tn->new_di_wvi_loo[i]);
        }
      bow_free (tn->new_di_wvi_loo);
      tn->new_di_wvi_loo = NULL;
    }
}

/* Same as above, over all nodes of the tree. */
void
bow_treenode_free_loo_and_new_loo_all (treenode *root, int di_count)
{
  int ci;

  bow_treenode_free_loo_and_new_loo (root, di_count);
  for (ci = 0; ci < root->children_count; ci++)
    bow_treenode_free_loo_and_new_loo_all (root->children[ci], di_count);
}

/* Set the leave-one-out information used for future BOW_TREENODE_PR_WI*()
   calculations from the NEW_*_LOO variables, then clear the NEW_*_LOO
   variables so they are ready for the next round. */
static void
bow_treenode_set_loo_from_new_loo (treenode *tn, int di_count)
{
  bow_treenode_free_loo (tn, di_count);
  tn->di_loo = tn->new_di_loo;
  tn->di_wvi_loo = tn->new_di_wvi_loo;
  tn->new_di_loo = NULL;
  tn->new_di_wvi_loo = NULL;
}
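/* Illustrative sketch, not part of the original file: how the LOO
   bookkeeping above might be fed while distributing a document's words
   over a node.  The posterior weights P, the document index DI, its
   word-vector length WV_LEN and the corpus size DOC_COUNT are
   hypothetical inputs supplied by the caller; only
   bow_treenode_add_new_loo_for_di_wvi is taken from this file. */
static void
example_record_new_loo (treenode *tn, int di, const double *p,
                        int wv_len, int doc_count)
{
  int wvi;

  for (wvi = 0; wvi < wv_len; wvi++)
    /* Remember how much mass document DI contributed to TN for its WVI'th
       word, so it can be subtracted later in leave-one-out estimates. */
    bow_treenode_add_new_loo_for_di_wvi (tn, p[wvi], di, wvi,
                                         wv_len, doc_count);
}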
/* Normalize the NEW_WORDS distribution, move it into the WORDS array and
   zero the NEW_WORDS array.  ALPHA is the parameter for the Dirichlet
   prior. */
void
bow_treenode_set_words_from_new_words (treenode *tn, double alpha)
{
  int wi;
  double total_word_count = 0.0;

  /* A special case for "Misc" nodes: increase their smoothing.
     NOTE: This has no effect if MISC_STAYS_FLAT is non-zero. */
  if (strstr (tn->name, "/Misc/"))
    alpha++;

  /* Calculate the normalizing constant */
  for (wi = 0; wi < tn->words_capacity; wi++)
    total_word_count += tn->new_words[wi];
  total_word_count += alpha * tn->words_capacity;
  //assert (total_word_count);
  if (total_word_count == 0)
    {
      alpha = 1.0 / tn->words_capacity;
      total_word_count = 1.0;
    }
  for (wi = 0; wi < tn->words_capacity; wi++)
    {
      //assert (tn->new_words[wi] > 0);
      assert (tn->new_words[wi] >= 0);
#if !USE_ACCELERATED_EM
#if !MISC_STAYS_FLAT
      tn->words[wi] = (alpha + tn->new_words[wi]) / total_word_count;
#else
      /* A special case for "Misc" nodes: they stay flat */
      if (strstr (tn->name, "/Misc/"))
        tn->words[wi] = 1.0 / tn->words_capacity;
      else
        tn->words[wi] = (alpha + tn->new_words[wi]) / total_word_count;
#endif /* MISC_STAYS_FLAT */
#else
      tn->words[wi] = (((1.0 - EM_ACCELERATION) * tn->words[wi])
                       + (EM_ACCELERATION
                          * (alpha + tn->new_words[wi]) / total_word_count));
      if (tn->words[wi] < 0)
        tn->words[wi] = 0;
#endif /* USE_ACCELERATED_EM */
      assert (tn->words[wi] >= 0);
      assert (tn->words[wi] <= 1);
      tn->new_words[wi] = 0;
    }
#if USE_ACCELERATED_EM
  /* Renormalize after setting some to zero. */
  total_word_count = 0;
  for (wi = 0; wi < tn->words_capacity; wi++)
    total_word_count += tn->words[wi];
  for (wi = 0; wi < tn->words_capacity; wi++)
    tn->words[wi] /= total_word_count;
#endif

  /* Why was this conditioned on MISC_STAYS_FLAT?  The
     bow_treenode_pr_wi_loo_local function doesn't work with the
     new_words_normalizer equal to zero! */
  if (!MISC_STAYS_FLAT || !strstr (tn->name, "/Misc/"))
    tn->new_words_normalizer = total_word_count;
  else
    tn->new_words_normalizer = 0;

  /* Also roll over the LOO information. */
  bow_treenode_set_loo_from_new_loo (tn, crossbow_docs->length);
}

/* Over all nodes of the tree, normalize the NEW_WORDS distribution, move
   it into the WORDS array and zero the NEW_WORDS array. */
void
bow_treenode_set_words_from_new_words_all (treenode *root, double alpha)
{
  int ci;

  bow_treenode_set_words_from_new_words (root, alpha);
  for (ci = 0; ci < root->children_count; ci++)
    bow_treenode_set_words_from_new_words_all (root->children[ci], alpha);
}

/* Set NEW_WORDS counts to zero. */
void
bow_treenode_set_new_words_to_zero (treenode *tn)
{
  int wi;

  for (wi = 0; wi < tn->words_capacity; wi++)
    tn->new_words[wi] = 0;
}

/* Same as above, over all nodes of the tree. */
void
bow_treenode_set_new_words_to_zero_all (treenode *root)
{
  int ci;

  bow_treenode_set_new_words_to_zero (root);
  for (ci = 0; ci < root->children_count; ci++)
    bow_treenode_set_new_words_to_zero_all (root->children[ci]);
}

/* Set the NEW_WORDS distribution from the addition of the WORDS
   distribution and some random noise.  NOISE_WEIGHT 0.5 gives equal weight
   to the data and the noise. */
void
bow_treenode_set_new_words_from_perturbed_words (treenode *tn,
                                                 double noise_weight)
{
  int wi;

  for (wi = 0; wi < tn->words_capacity; wi++)
    tn->new_words[wi] = ((1 - noise_weight) * tn->words[wi]
                         + noise_weight * bow_random_01 () / tn->words_capacity);
}

/* Same as above, over all nodes of the tree. */
void
bow_treenode_set_new_words_from_perturbed_words_all (treenode *root,
                                                     double noise_weight)
{
  int ci;

  bow_treenode_set_new_words_from_perturbed_words (root, noise_weight);
  for (ci = 0; ci < root->children_count; ci++)
    bow_treenode_set_new_words_from_perturbed_words_all (root->children[ci],
                                                         noise_weight);
}
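/* Illustrative sketch, not part of the original file: the Dirichlet
   smoothing that bow_treenode_set_words_from_new_words performs.  With
   expected counts {3, 1, 0} over a three-word vocabulary and ALPHA = 1,
   the normalizer is (3 + 1 + 0) + 1 * 3 = 7, so WORDS becomes
   {4/7, 2/7, 1/7}.  The helper below simply applies that update to every
   node of a tree; the choice of ALPHA = 1 is an assumption for the
   example. */
static void
example_m_step_words (treenode *root)
{
  /* Smooth and normalize NEW_WORDS into WORDS at every node, zero
     NEW_WORDS, and roll the NEW_*_LOO records over for the next round. */
  bow_treenode_set_words_from_new_words_all (root, 1.0);
}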
/* Over all leaves of the tree, set the PRIOR by the results of smoothing
   and normalizing the NEW_PRIOR distribution.  ALPHA is the parameter for
   the Dirichlet prior. */
void
bow_treenode_set_leaf_prior_from_new_prior_all (treenode *root, double alpha)
{
  treenode *iterator, *leaf;
  double prior_sum = 0;

  assert (root->parent == NULL);
  for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); )
    {
      if (strstr (leaf->name, "/Misc/"))
        {
          /* Arbitrarily give /Misc/ node the same weight as the average of
             the first two children of LEAF's parent. */
          assert (leaf->parent->children_count >= 2);
          leaf->new_prior = (leaf->parent->children[0]->new_prior
                             + leaf->parent->children[1]->new_prior) / 2;
        }
      prior_sum += leaf->new_prior + alpha;
    }
  assert (prior_sum);
  for (iterator = root; (leaf = bow_treenode_iterate_leaves (&iterator)); )
    {
      leaf->prior = (leaf->new_prior + alpha) / prior_sum;
      leaf->new_prior = 0;
    }
}

/* Over all nodes (including interior and root) of the tree, set the PRIOR
   by the results of smoothing and normalizing the NEW_PRIOR distribution.
   ALPHA is the parameter for the Dirichlet prior. */
void
bow_treenode_set_prior_from_new_prior_all (treenode *root, double alpha)
{
  treenode *iterator, *leaf;
  double prior_sum = 0;

  assert (root->parent == NULL);
  for (iterator = root; (leaf = bow_treenode_iterate_all (&iterator)); )
    prior_sum += leaf->new_prior + alpha;
  assert (prior_sum);
  for (iterator = root; (leaf = bow_treenode_iterate_all (&iterator)); )
    {
      leaf->prior = (leaf->new_prior + alpha) / prior_sum;
      leaf->new_prior = 0;
    }
}

/* Over all nodes (including interior and root) of the tree, plus one
   "extra" quantity (intended for the prior probability of the uniform
   distribution), set the PRIOR by the results of smoothing and normalizing
   the NEW_PRIOR distribution, and set EXTRA as part of the normalization.
   ALPHA is the parameter for the Dirichlet prior. */
void
bow_treenode_set_prior_and_extra_from_new_prior_all (treenode *root,
                                                     double *new_extra,
                                                     double *extra,
                                                     double alpha)
{
  treenode *iterator, *leaf;
  double prior_sum = 0;

  assert (root->parent == NULL);
  for (iterator = root; (leaf = bow_treenode_iterate_all (&iterator)); )
    prior_sum += leaf->new_prior + alpha;
  prior_sum += *new_extra + alpha;
  assert (prior_sum);
  for (iterator = root; (leaf = bow_treenode_iterate_all (&iterator)); )
    {
      leaf->prior = (leaf->new_prior + alpha) / prior_sum;
      leaf->new_prior = 0;
    }
  *extra = (*new_extra + alpha) / prior_sum;
  *new_extra = 0;
}

/* Normalize the NEW_LAMBDAS distribution, move it into the LAMBDAS array
   and zero the NEW_LAMBDAS array.  ALPHA is the parameter for the
   Dirichlet prior. */
void
bow_treenode_set_lambdas_from_new_lambdas (treenode *tn, double alpha)