📄 training_algorithms_multialpha.c
字号:
#ifdef DEBUG_PRIORS printf("Starting emission matrix update\n");#endif /* update emission matrix using Dirichlet prior files if they exist*/ priorp = *(hmmp->ed_ps + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type == DISCRETE) {#ifdef DEBUG_PRIORS printf("k = %d\n", k); printf("value = %x\n", priorp);#endif update_emiss_mtx_prior_multi(hmmp, E, k, priorp, 1); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E, k, 1); } else if(hmmp->alphabet_type == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E, k, 1); } else { update_emiss_mtx_std_continuous_multi(hmmp, E, k, 1); } if(hmmp->nr_alphabets > 1) { priorp = *(hmmp->ed_ps_2 + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_2 == DISCRETE) { update_emiss_mtx_prior_multi(hmmp, E_2, k, priorp, 2); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_2 == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E_2, k, 2); } else if(hmmp->alphabet_type_2 == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E_2, k, 2); } else { update_emiss_mtx_std_continuous_multi(hmmp, E_2, k, 2); } } if(hmmp->nr_alphabets > 2) { priorp = *(hmmp->ed_ps_3 + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_3 == DISCRETE) { update_emiss_mtx_prior_multi(hmmp, E_3, k, priorp, 3); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_3 == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E_3, k, 3); } else if(hmmp->alphabet_type_3 == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E_3, k, 3); } else { update_emiss_mtx_std_continuous_multi(hmmp, E_3, k, 3); } } if(hmmp->nr_alphabets > 3) { priorp = *(hmmp->ed_ps_4 + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_4 == DISCRETE) { update_emiss_mtx_prior_multi(hmmp, E_4, k, priorp, 4); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_4 == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E_4, k, 4); } else if(hmmp->alphabet_type_4 == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E_4, k, 4); } else { update_emiss_mtx_std_continuous_multi(hmmp, E_4, k, 4); } } } #ifdef DEBUG_BW dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions); dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);#endif /* some garbage collection */ free(E); if(hmmp->nr_alphabets > 1) { free(E_2); } if(hmmp->nr_alphabets > 2) { free(E_3); } if(hmmp->nr_alphabets > 3) { free(E_4); } free(T); max_nr_iterations--; iteration++; } while(max_nr_iterations > 0); /* break condition is also when log_likelihood_difference is * smaller than THRESHOLD, checked inside the loop for * better efficiency */ #ifdef DEBUG_BW dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions); dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);#endif }/* implementation of the baum-welch training algorithm using dirichlet prior mixture to * calculate update of emission (and transition) matrices and using a multiple sequence * alignment as the training sequence */void msa_baum_welch_dirichlet_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int nr_seqs, int annealing, int use_gap_shares, int use_lead_columns, int use_labels, int use_transition_pseudo_counts, int use_emission_pseudo_counts, int normalize, int scoring_method, int use_nr_occ, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4, int use_prior){ struct msa_sequences_multi_s *msa_seq_infop_start; double *T, *E, *E_2, *E_3, *E_4; /* matrices for the estimated number of times * each transition (T) and emission (E) is used */ struct forward_s *forw_mtx; /* forward matrix */ struct backward_s *backw_mtx; /* backward matrix */ double *forw_scale; /* scaling array */ int s,p,k,l,a,d,i; /* loop counters, s loops over the sequences, p over the * positions in the sequence, k and l over states, a over the alphabet, * d over the distribution groups and i is a slush variable */ struct path_element *lp; double t_res, t_res_1, t_res_2, t_res_3; /* for temporary results */ double t_res_4, t_res_5, t_res_6; /* for temporary results */ double e_res, e_res_1, e_res_2, e_res_3; /* for temporary results */ int seq_len; /* length of the sequences */ int a_index; /* holds current letters index in the alphabet */ double old_log_likelihood, new_log_likelihood; /* to calculate when to stop */ double likelihood; /* temporary variable for calculating likelihood of a sequence */ int max_nr_iterations, iteration; double Eka_base; int query_index; /* index of query seq */ /* dirichlet prior variables */ struct emission_dirichlet_s *priorp; struct emission_dirichlet_s *priorp_2; struct emission_dirichlet_s *priorp_3; struct emission_dirichlet_s *priorp_4; /* simulated annealing varialbles */ double temperature; double cooling_factor; int annealing_status; /* help variables for add_to_E */ int alphabet_nr; int alphabet; int a_size; double *E_cur; double *subst_mtx; struct msa_letter_s *msa_seq; double *tmp_emissions; int alphabet_type; /* remember start of sequences */ msa_seq_infop_start = msa_seq_infop; old_log_likelihood = 9999.0; new_log_likelihood = 9999.0; max_nr_iterations = 20; iteration = 1; if(annealing == YES) { temperature = INIT_TEMP; cooling_factor = INIT_COOL; annealing_status = ACTIVE; } else { annealing_status = DONE; } #ifdef DEBUG_BW2 check_for_corrupt_values(hmmp->nr_v, hmmp->a_size, hmmp->emissions , "emiss"); check_for_corrupt_values(hmmp->nr_v, hmmp->nr_v, hmmp->transitions , "trans");#endif do {#ifdef DEBUG_BW2 printf("starting baum-welch loop\n");#endif /* initialize matrices */ T = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double))); if(hmmp->alphabet_type == DISCRETE) { E = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double))); } else { E = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size + 1) * sizeof(double))); } if(hmmp->nr_alphabets > 1) { if(hmmp->alphabet_type_2 == DISCRETE) { E_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double))); } else { E_2 = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size_2 + 1) * sizeof(double))); } } if(hmmp->nr_alphabets > 2) { if(hmmp->alphabet_type_3 == DISCRETE) { E_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double))); } else { E_3 = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size_3 + 1) * sizeof(double))); } } if(hmmp->nr_alphabets > 3) { if(hmmp->alphabet_type_4 == DISCRETE) { E_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double))); } else { E_4 = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size_4 + 1) * sizeof(double))); } } /* reset sequence pointer */ msa_seq_infop = msa_seq_infop_start; old_log_likelihood = new_log_likelihood; new_log_likelihood = 0.0; for(s = 0; s < nr_seqs; s++) { if(use_lead_columns == YES) { seq_len = msa_seq_infop->nr_lead_columns; } else { seq_len = msa_seq_infop->msa_seq_length; } /* calculate forward and backward matrices * memory for forw_mtx, scale_mtx and * backw_mtx is allocated in the functions */#ifdef DEBUG_BW2 printf("running forward for seq %d\n", s + 1);#endif msa_forward_multi(hmmp, msa_seq_infop, use_lead_columns, use_gap_shares, NO, &forw_mtx, &forw_scale, use_labels, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4);#ifdef DEBUG_BW2 dump_forward_matrix(seq_len + 2, hmmp->nr_v, forw_mtx); printf("running backward\n");#endif msa_backward_multi(hmmp, msa_seq_infop, use_lead_columns, use_gap_shares, &backw_mtx, forw_scale, use_labels, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4);#ifdef DEBUG_BW2 check_for_corrupt_values(seq_len + 2, hmmp->nr_v, forw_mtx , "F"); check_for_corrupt_values(seq_len + 2, hmmp->nr_v, backw_mtx , "B"); printf("done with backward\n");#endif /* update new_log_likelihood */ likelihood = log10((forw_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob); for(k = 0; k <= seq_len; k++) { likelihood = likelihood + log10(*(forw_scale + k)); }#ifdef DEBUG_BW2 dump_scaling_array(k-1,forw_scale); printf("likelihood = %f\n", likelihood);#endif new_log_likelihood += likelihood; for(k = 0; k < hmmp->nr_v-1; k++) /* k = from vertex */ { lp = *(hmmp->to_trans_array + k); while(lp->vertex != END) /* l = to-vertex */ { i = 0; while(1) { if(use_lead_columns == NO) { p = i; } else { p = *(msa_seq_infop->lead_columns_start + i); } /* add T[k][l] contribution for the msa-sequence */ add_Tkl_contribution_msa_multi(hmmp, msa_seq_infop, forw_mtx, backw_mtx, forw_scale, p, k, lp, T, use_gap_shares, use_lead_columns, i, use_labels, scoring_method, normalize, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); i++; if(use_lead_columns == NO) { if(i >= seq_len) { break; } } else { if(*(msa_seq_infop->lead_columns_start + i) == END) { break; } } } /* move on to next path */ while(lp->next != NULL) { lp++; } lp++; } /* calculate E[k][a] contribution from this sequence */ if(silent_state_multi(k, hmmp) != 0) { i = 0; while(1) { /* get correct index for this letter column */ if(use_lead_columns == NO) { p = i; } else { p = *(msa_seq_infop->lead_columns_start + i); } /* get basic scoring result */ Eka_base = add_Eka_contribution_msa_multi(hmmp, msa_seq_infop, forw_mtx, backw_mtx, p, k, i, use_lead_columns); /* loop over the alphabets */ for(alphabet_nr = 1; alphabet_nr <= hmmp->nr_alphabets; alphabet_nr++) { if(alphabet_nr == 1) { alphabet = hmmp->alphabet; subst_mtx = hmmp->subst_mtx; a_size = hmmp->a_size; E_cur = E; msa_seq = msa_seq_infop->msa_seq_1; alphabet_type = hmmp->alphabet_type; tmp_emissions = hmmp->emissions; } else if(alphabet_nr == 2) { alphabet = hmmp->alphabet_2; subst_mtx = hmmp->subst_mtx_2; a_size = hmmp->a_size_2; E_cur = E_2; msa_seq = msa_seq_infop->msa_seq_2; alphabet_type = hmmp->alphabet_type_2; tmp_emissions = hmmp->emissions_2; } else if(alphabet_nr == 3) { alphabet = hmmp->alphabet_3; subst_mtx = hmmp->subst_mtx_3; a_size = hmmp->a_size_3; E_cur = E_3; msa_seq = msa_seq_infop->msa_seq_3; alphabet_type = hmmp->alphabet_type_3; tmp_emissions = hmmp->emissions_3; } else if(alphabet_nr == 4) { alphabet = hmmp->alphabet_4; subst_mtx = hmmp->subst_mtx_4; a_size = hmmp->a_size_4; E_cur = E_4; msa_seq = msa_seq_infop->msa_seq_4; alphabet_type = hmmp->alphabet_type_4; tmp_emissions = hmmp->emissions_4; } /* get result and add to matrix according to scoring method */ add_to_E_multi(E_cur, Eka_base, msa_seq, p, k, a_size, normalize, subst_mtx, alphabet, scoring_method, use_nr_occ, alphabet_type, tmp_emissions); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -