📄 training_algorithms_multialpha.c
字号:
/* update loop index, check if we are done */ i++; if(use_lead_columns == NO) { if(i >= seq_len) { break; } } else { if(*(msa_seq_infop->lead_columns_start + i) == END) { break; } } } } } /* some garbage collection */ free(forw_mtx); free(backw_mtx); free(forw_scale); msa_seq_infop++; } if(verbose == YES) { printf("log likelihood rd %d: %f\n", iteration, new_log_likelihood); } #ifdef DEBUG_BW2 dump_T_matrix(hmmp->nr_v, hmmp->nr_v, T); dump_E_matrix(hmmp->nr_v, hmmp->a_size, E); if(hmmp->nr_alphabets > 1) { dump_E_matrix(hmmp->nr_v, hmmp->a_size_2 + 1, E_2); }#endif /* check if likelihood change is small enough, then we are done */ if(fabs(new_log_likelihood - old_log_likelihood) < INNER_BW_THRESHOLD && annealing_status == DONE) { break; } /* if simulated annealing is used, scramble results in E and T matrices */ if(annealing == YES && temperature > ANNEAL_THRESHOLD) { anneal_E_matrix_multi(temperature, E, hmmp, 1); if(hmmp->nr_alphabets > 1) { anneal_E_matrix_multi(temperature, E_2, hmmp, 2); } if(hmmp->nr_alphabets > 2) { anneal_E_matrix_multi(temperature, E_3, hmmp, 3); } if(hmmp->nr_alphabets > 3) { anneal_E_matrix_multi(temperature, E_4, hmmp, 4); } anneal_T_matrix_multi(temperature, T, hmmp); temperature = temperature * cooling_factor; } if(temperature < ANNEAL_THRESHOLD) { annealing_status = DONE; } /* recalculate emission expectations according to distribution groups * by simply taking the mean of the expected emissions within this group * for each letter in the alphabet and replacing each expectation for the * letter with this value for every member of the distribution group */ recalculate_emiss_expectations_multi(hmmp, E, 1); if(hmmp->nr_alphabets > 1) { recalculate_emiss_expectations_multi(hmmp, E_2, 2); } if(hmmp->nr_alphabets > 2) { recalculate_emiss_expectations_multi(hmmp, E_3, 3); } if(hmmp->nr_alphabets > 3) { recalculate_emiss_expectations_multi(hmmp, E_4, 4); } /* recalculate transition expectations for tied transitions according * to the same scheme as for emission distribution groups */ recalculate_trans_expectations_multi(hmmp, T); for(k = 0; k < hmmp->nr_v-1; k++) /* k = from-vertex */ { /* update transition matrix */ if(use_transition_pseudo_counts == YES) { update_trans_mtx_pseudocount_multi(hmmp, T, k); } else { update_trans_mtx_std_multi(hmmp, T, k); } #ifdef DEBUG_PRIORS printf("Starting emission matrix update\n");#endif /* update emission matrix using Dirichlet prior files if they exist*/ priorp = *(hmmp->ed_ps + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type == DISCRETE) {#ifdef DEBUG_PRIORS printf("k = %d\n", k); printf("value = %x\n", priorp);#endif update_emiss_mtx_prior_multi(hmmp, E, k, priorp, 1); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E, k, 1); } else if(hmmp->alphabet_type == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E, k, 1); } else { update_emiss_mtx_std_continuous_multi(hmmp, E, k, 1); } if(hmmp->nr_alphabets > 1) { priorp = *(hmmp->ed_ps_2 + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_2 == DISCRETE) { update_emiss_mtx_prior_multi(hmmp, E_2, k, priorp, 2); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_2 == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E_2, k, 2); } else if(hmmp->alphabet_type_2 == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E_2, k, 2); } else { update_emiss_mtx_std_continuous_multi(hmmp, E_2, k, 2); } } if(hmmp->nr_alphabets > 2) { priorp = *(hmmp->ed_ps_3 + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_3 == DISCRETE) { update_emiss_mtx_prior_multi(hmmp, E_3, k, priorp, 3); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_3 == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E_3, k, 3); } else if(hmmp->alphabet_type_3 == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E_3, k, 3); } else { update_emiss_mtx_std_continuous_multi(hmmp, E_3, k, 3); } } if(hmmp->nr_alphabets > 3) { priorp = *(hmmp->ed_ps_4 + k); if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_4 == DISCRETE) { update_emiss_mtx_prior_multi(hmmp, E_4, k, priorp, 4); } else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_4 == DISCRETE) /* update emissions matrix "normally" when dirichlet file is missing */ { update_emiss_mtx_pseudocount_multi(hmmp, E_4, k, 4); } else if(hmmp->alphabet_type_4 == DISCRETE) { update_emiss_mtx_std_multi(hmmp, E_4, k, 4); } else { update_emiss_mtx_std_continuous_multi(hmmp, E_4, k, 4); } } } #ifdef DEBUG_BW2 dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions); dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions); if(hmmp->nr_alphabets > 1) { dump_emiss_matrix(hmmp->nr_v, hmmp->a_size_2, hmmp->emissions_2); }#endif /* some garbage collection */ free(E); if(hmmp->nr_alphabets > 1) { free(E_2); } if(hmmp->nr_alphabets > 2) { free(E_3); } if(hmmp->nr_alphabets > 3) { free(E_4); } free(T); max_nr_iterations--; iteration++;#ifdef DEBIG_BW2 printf("end of baum-welch-loop\n");#endif } while(max_nr_iterations > 0); /* break condition is also when log_likelihood_difference is * smaller than THRESHOLD, checked inside the loop for * better efficiency */ #ifdef DEBUG_BW dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions); dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);#endif }/* implementation of the conditional maximum likelihood version of the * baum-welch training algorithm using dirichlet prior mixture to * calculate update of emission (and transition) matrices */void extended_baum_welch_dirichlet_multi(struct hmm_multi_s *hmmp, struct sequence_multi_s *seqsp, int nr_seqs, int annealing, int use_labels, int use_transition_pseudo_counts, int use_emission_pseudo_counts, int multi_scoring_method, int use_prior){ double *T, *E, *E_2, *E_3, *E_4; /* matrices for the estimated number of times * each transition (T) and emission (E) is used */ double *T_lab, *E_lab, *E_lab_2, *E_lab_3, *E_lab_4, *T_ulab, *E_ulab, *E_ulab_2, *E_ulab_3, *E_ulab_4; struct forward_s *forw_mtx; /* forward matrix */ struct backward_s *backw_mtx; /* backward matrix */ double *forw_scale; /* scaling array */ int s,p,k,l,a,d; /* loop counters, s loops over the sequences, p over the * positions in the sequence, k and l over states, a over the alphabet * and d over the distribution groups */ struct path_element *lp; double t_res, t_res_1, t_res_2, t_res_3; /* for temporary results */ double t_res_4, t_res_5, t_res_6; /* for temporary results */ double e_res, e_res_1, e_res_2, e_res_3; /* for temporary results */ double t_res_ulab; int seq_len; /* length of the seqences */ int a_index, a_index_2, a_index_3, a_index_4; /* holds current letters index in the alphabet */ struct letter_s *seq, *seq_2, *seq_3, *seq_4; /* pointer to current sequence */ double old_log_likelihood_lab, new_log_likelihood_lab; double old_log_likelihood_ulab, new_log_likelihood_ulab; /* to calculate when to stop */ double likelihood; /* temporary variable for calculating likelihood of a sequence */ int max_nr_iterations, iteration; /* dirichlet prior variables */ struct emission_dirichlet_s *priorp; /* simulated annealing variables */ double temperature; double cooling_factor; int annealing_status; /* some initialization */ old_log_likelihood_lab = 9999.0; new_log_likelihood_lab = 9999.0; old_log_likelihood_ulab = 9999.0; new_log_likelihood_ulab = 9999.0; max_nr_iterations = 70; iteration = 1; if(annealing == YES) { temperature = INIT_TEMP; cooling_factor = INIT_COOL; annealing_status = ACTIVE; } else { annealing_status = DONE; } do { /* allocate per iteration matrices */ T = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double))); E = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double))); T_lab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double))); E_lab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double))); T_ulab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double))); E_ulab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double))); if(hmmp->nr_alphabets > 1) { E_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double))); E_lab_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double))); E_ulab_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double))); } if(hmmp->nr_alphabets > 2) { E_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double))); E_lab_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double))); E_ulab_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double))); } if(hmmp->nr_alphabets > 3) { E_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double))); E_lab_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double))); E_ulab_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double))); } old_log_likelihood_ulab = new_log_likelihood_ulab; new_log_likelihood_ulab = 0.0; old_log_likelihood_lab = new_log_likelihood_lab; new_log_likelihood_lab = 0.0; for(s = 0; s < nr_seqs; s++) { /* Convert sequence to 1...L for easier indexing */ seq_len = (seqsp + s)->length; seq = (struct letter_s*) (malloc_or_die((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq+1, (seqsp + s)->seq_1, seq_len * sizeof(struct letter_s)); if(hmmp->nr_alphabets > 1) { seq_2 = (struct letter_s*) (malloc_or_die((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_2+1, (seqsp + s)->seq_2, seq_len * sizeof(struct letter_s)); } if(hmmp->nr_alphabets > 2) { seq_3 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_3+1, (seqsp + s)->seq_3, seq_len * sizeof(struct letter_s)); } if(hmmp->nr_alphabets > 3) { seq_4 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_4+1, (seqsp + s)->seq_4, seq_len * sizeof(struct letter_s)); } /* calculate forward and backward matrices */ forward_multi(hmmp, (seqsp + s)->seq_1,(seqsp + s)->seq_2, (seqsp + s)->seq_3, (seqsp + s)->seq_4, &forw_mtx, &forw_scale, NO, multi_scoring_method); backward_multi(hmmp, (seqsp + s)->seq_1, (seqsp + s)->seq_2, (seqsp + s)->seq_3, (seqsp + s)->seq_4, &backw_mtx, forw_scale, NO, multi_scoring_method); /* memory for forw_mtx, scale_mtx and * backw_mtx is allocated in the functions */ /* update new_log_likelihood */ likelihood = log10((forw_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob); for(k = 0; k <= seq_len; k++) { likelihood = likelihood + log10(*(forw_scale + k)); }#ifdef DEBUG_BW dump_scaling_array(k-1,forw_scale); printf("likelihood = %f\n", likelihood);#endif new_log_likelihood_ulab += likelihood; for(k = 0; k < hmmp->nr_v-1; k++) /* k = from vertex */ { lp = *(hmmp->to_trans_array + k); while(lp->vertex != END) /* l = to-vertex */ { for(p = 1; p <= seq_len; p++) { /* get alphabet index for c*/ a_index = get_alphabet_index(&seq[p], hmmp->alphabet, hmmp->a_size); if(hmmp->nr_alphabets > 1) { a_index_2 = get_alphabet_index(&seq_2[p], hmmp->alphabet_2, hmmp->a_size_2); } if(hmmp->nr_alphabets > 2) { a_index_3 = get_alphabet_index(&seq_3[p], hmmp->alphabet_3, hmmp->a_size_3); } if(hmmp->nr_alphabets > 3) { a_index_4 = get_alphabet_index(&seq_4[p], hmmp->alphabet_4, hmmp->a_size_4); } /* add T[k][l] contribution for this sequence */ add_Tkl_contribution_multi(hmmp, seq+1, seq_2+1, seq_3+1, seq_4+1, forw_mtx, backw_mtx, forw_scale, p, k, lp, a_index, a_index_2, a_index_3, a_index_4, T_ulab, NO, multi_scoring_method); } /* move on to next path */ while(lp->next != NULL) { lp++; } lp++; } /* calculate E[k][a] contribution from this sequence */ if(silent_state_multi(k, hmmp) != 0) { for(p = 1; p <= seq_len; p++) { a_index = get_alphabet_index(&seq[p], hmmp->alphabet, hmmp->a_size); if(hmmp->nr_alphabets > 1) { a_index_2 = get_alphabet_index(&seq_2[p], hmmp->alphabet_2, hmmp->a_size_2); } if(hmmp->nr_alphabets > 2) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -