📄 core_algorithms_multialpha.c
字号:
/************************************************************************************* ************************** the forward, backward and viterbi algorithms ************* ************************** for scoring an msa against the hmm *********************** *************************************************************************************//************************* the msa_forward algorithm **********************************/int msa_forward_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns, int use_gap_shares, int use_prior_shares, struct forward_s **ret_forw_mtxpp, double **ret_scale_fspp, int use_labels, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){ struct forward_s *forw_mtxp, *cur_rowp, *prev_rowp; /* pointers to forward matrix */ double *scale_fsp; /* pointer to array of scaling factors */ int seq_len; /* length of the sequence */ int c, v, w; /* looping indices, c loops over the sequence, v and w over the vertices in the HMM */ int i,j; /* general loop indices */ int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */ double row_sum, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */ struct path_element *wp; /* for traversing the paths from v to w */ int nr_v;#ifdef DEBUG_FW printf("entering msa_forward\n");#endif /* Allocate memory for matrix and scaling factors + some initial setup: * Note 1: forward probability matrix has the sequence indices vertically * and the vertex indices horizontally meaning it will be filled row by row * Note 2: *ret_forw_mtxpp and *ret_scale_fspp are allocated here, but must be * freed by caller */ nr_v = hmmp->nr_v; if(use_lead_columns == YES) { seq_len = msa_seq_infop->nr_lead_columns; } else { seq_len = msa_seq_infop->msa_seq_length; } *ret_forw_mtxpp = (struct forward_s*)(malloc_or_die((seq_len+2) * hmmp->nr_v * sizeof(struct forward_s))); forw_mtxp = *ret_forw_mtxpp; *ret_scale_fspp = (double*)(malloc_or_die((seq_len+2) * sizeof(double))); scale_fsp = *ret_scale_fspp; /* Initialize the first row of the matrix */ forw_mtxp->prob = 1.0; /* sets index (0,0) to 1.0, the rest are already 0.0 as they should be */ *scale_fsp = 1.0; /* Fill in middle rows */ prev_rowp = forw_mtxp; cur_rowp = forw_mtxp + hmmp->nr_v; j = 0; if(use_lead_columns == YES) { c = *(msa_seq_infop->lead_columns_start + j); } else { c = 0; } while(c != END && c < msa_seq_infop->msa_seq_length) { a_index_2 = -1; a_index_3 = -1; a_index_4 = -1; if(hmmp->alphabet_type == DISCRETE) { a_index = get_alphabet_index((msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->query_letter, hmmp->alphabet, hmmp->a_size); if(a_index < 0) { a_index = hmmp->a_size; /* if letter is wild card, use default column in subst matrix */ } } if(hmmp->nr_alphabets > 1) { if(hmmp->alphabet_type_2 == DISCRETE) { a_index_2 = get_alphabet_index((msa_seq_infop->msa_seq_2 + (c * (hmmp->a_size_2+1)))->query_letter, hmmp->alphabet_2, hmmp->a_size_2); if(a_index_2 < 0) { a_index_2 = hmmp->a_size_2; /* if letter is wild card, use default column in subst matrix */ } } } if(hmmp->nr_alphabets > 2) { if(hmmp->alphabet_type_3 == DISCRETE) { a_index_3 = get_alphabet_index((msa_seq_infop->msa_seq_3 + (c * (hmmp->a_size_3+1)))->query_letter, hmmp->alphabet_3, hmmp->a_size_3); if(a_index_3 < 0) { a_index_3 = hmmp->a_size_3; /* if letter is wild card, use default column in subst matrix */ } } } if(hmmp->nr_alphabets > 3) { if(hmmp->alphabet_type_4 == DISCRETE) { a_index_4 = get_alphabet_index((msa_seq_infop->msa_seq_4 + (c * (hmmp->a_size_4+1)))->query_letter, hmmp->alphabet_4, hmmp->a_size_4); if(a_index_4 < 0) { a_index_4 = hmmp->a_size_4; /* if letter is wild card, use default column in subst matrix */ } } } /* calculate sum of probabilities */ row_sum = 0; for(v = 1; v < hmmp->nr_v - 1; v++) /* v = to-vertex */ {#ifdef DEBUG_FW printf("prob to vertex %d:\n", v);#endif res = 0; wp = *(hmmp->tot_from_trans_array + v); while((w = wp->vertex) != END) /* w = from-vertex */ { /* calculate intermediate results */ res += (prev_rowp + w)->prob * *((hmmp->tot_transitions) + (w * nr_v + v)); if(*((hmmp->tot_transitions) + (w * nr_v + v)) < 0) { printf("found model transition prob from %d to %d < 0.0\n", w, v); exit(0); } wp++;#ifdef DEBUG_FW printf("prev = %f: ",(prev_rowp + w)->prob ); printf("trans = %f\n",*((hmmp->tot_transitions) + (w * nr_v + v)) );#endif } /* calculate the prob of producing letters l in v*/ t_res3 = get_msa_emission_score_multi(msa_seq_infop, hmmp, c, v, use_labels, normalize, a_index, a_index_2, a_index_3, a_index_4, scoring_method, use_gap_shares, use_prior_shares, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); res = res * t_res3; row_sum += res; /* save result in matrices */ (cur_rowp + v)->prob = res;#ifdef DEBUG_FW printf("letter = %f\n", t_res3); printf("res = %f\n", res);#endif } /* scale the results, row_sum = the total probability of * having produced the sequence up to and including character c */ if(use_lead_columns == YES) { scale_fsp[j+1] = row_sum; } else { scale_fsp[c+1] = row_sum; } #ifdef DEBUG_FW printf("rowsum for row %d = %f\n", c, row_sum); printf("scaling set to: %f\n", scale_fsp[c+1]);#endif if(row_sum == 0.0) { printf("Probability for this msa = 0.0, pos = %d\n", c); printf("in forward\n"); exit(0); return NOPROB; } for(v = 0; v < hmmp->nr_v; v++) { if((cur_rowp + v)->prob != 0){ (cur_rowp + v)->prob = ((cur_rowp + v)->prob)/row_sum; /* scaling */ } } /* move row pointers one row forward */ prev_rowp = cur_rowp; cur_rowp = cur_rowp + hmmp->nr_v; /* update current column */ if(use_lead_columns == YES) { j++; c = *(msa_seq_infop->lead_columns_start + j); } else { c++; } } /* Fill in transition to end state */#ifdef DEBUG_FW printf("\ntransition to end state:\n");#endif res = 0; wp = *(hmmp->tot_from_trans_array + hmmp->nr_v - 1); while(wp->vertex != END) { t_res1 = (prev_rowp + wp->vertex)->prob; t_res2 = *((hmmp->tot_transitions) + get_mtx_index(wp->vertex, hmmp->nr_v - 1, hmmp->nr_v)); if(t_res2 > 1.0) { t_res2 = 1.0; } res += t_res1 * t_res2; wp++; }#ifdef DEBUG_FW printf("res = %f\n", res);#endif (cur_rowp + hmmp->nr_v - 1)->prob = res; /* obs: no scaling performed here */#ifdef DEBUG_FW dump_forward_matrix(seq_len + 2, hmmp->nr_v, forw_mtxp); dump_scaling_array(seq_len + 1, scale_fsp); printf("exiting msa_forward\n\n\n");#endif /* Garbage collection and return */ return OK;}/**************************** the msa_backward algorithm **********************************/int msa_backward_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns, int use_gap_shares, struct backward_s **ret_backw_mtxpp, double *scale_fsp, int use_labels, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){ struct backward_s *backw_mtxp, *cur_rowp, *prev_rowp; /* pointers to backward matrix */ int seq_len; /* length of the sequence */ int c, v, w; /* looping indices, c loops over the sequence, v and w over the vertices in the HMM */ int i,j; /* general loop indices */ int all_columns; /* boolean for checking whether all columns have been looped over */ int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */ double row_sum, res,t_res1,t_res2, t_res3; /* temporary variables to calculate results */ struct path_element *wp; /* Allocate memory for matrix + some initial setup: * Note 1: probability matrix has the sequence indices vertically * and the vertex indices horizontally meaning it will be filled row by row * Note 2: *ret_backw_mtxpp is allocated here, but must be * freed by caller * Note 3: *scale_fspp is the scaling array produced by forward() meaning backward() * must be called after forward() */ if(use_lead_columns == YES) { seq_len = msa_seq_infop->nr_lead_columns; } else { seq_len = msa_seq_infop->msa_seq_length; } *ret_backw_mtxpp = (struct backward_s*)(malloc_or_die((seq_len+2) * hmmp->nr_v * sizeof(struct backward_s))); backw_mtxp = *ret_backw_mtxpp; /* Initialize the last row of the matrix */ (backw_mtxp + get_mtx_index(seq_len + 1, hmmp->nr_v - 1, hmmp->nr_v))->prob = 1.0; /* sets index * (seq_len+1,nr_v-1) i.e. * the lower right * corner of the matrix to * 1.0,the rest are already * 0.0 as they should be*/ /* Fill in next to last row in matrix (i.e. add prob for path to end state for all states * that have a transition path to the end state) */ prev_rowp = backw_mtxp + get_mtx_index(seq_len + 1, 0, hmmp->nr_v); cur_rowp = prev_rowp - hmmp->nr_v; wp = *(hmmp->from_trans_array + hmmp->nr_v - 1); while(wp->vertex != END) { w = wp->vertex; t_res1 = 1.0; while(wp->next != NULL) { t_res1 = t_res1 * *(hmmp->transitions + get_mtx_index(wp->vertex, (wp + 1)->vertex, hmmp->nr_v)); wp++; } (cur_rowp + w)->prob += t_res1; if((cur_rowp + w)->prob > 1.0) { (cur_rowp + w)->prob = 1.0; } wp++; } prev_rowp = cur_rowp; cur_rowp = cur_rowp - hmmp->nr_v; /* Fill in first rows moving upwards in matrix */ all_columns = NO; j = 1; if(use_lead_columns == YES) { c = *(msa_seq_infop->lead_columns_end - j); } else { c = seq_len - 1; } while(c >= 0 && all_columns == NO) { a_index_2 = -1; a_index_3 = -1; a_index_4 = -1; if(hmmp->alphabet_type == DISCRETE) { a_index = get_alphabet_index((msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->query_letter, hmmp->alphabet, hmmp->a_size); if(a_index < 0) { a_index = hmmp->a_size; /* if letter is wild card, use default column in subst matrix */ } } if(hmmp->nr_alphabets > 1) { if(hmmp->alphabet_type_2 == DISCRETE) { a_index_2 = get_alphabet_index((msa_seq_infop->msa_seq_2 + (c * (hmmp->a_size_2+1)))->query_letter, hmmp->alphabet_2, hmmp->a_size_2); if(a_index_2 < 0) { a_index_2 = hmmp->a_size_2; /* if letter is wild card, use default column in subst matrix */ } } } if(hmmp->nr_alphabets > 2) { if(hmmp->alphabet_type_3 == DISCRETE) { a_index_3 = get_alphabet_index((msa_seq_infop->msa_seq_3 + (c * (hmmp->a_size_3+1)))->query_letter, hmmp->alphabet_3, hmmp->a_size_3); if(a_index_3 < 0) { a_index_3 = hmmp->a_size_3; /* if letter is wild card, use default column in subst matrix */ } } } if(hmmp->nr_alphabets > 3) { if(hmmp->alphabet_type_4 == DISCRETE) { a_index_4 = get_alphabet_index((msa_seq_infop->msa_seq_4 + (c * (hmmp->a_size_4+1)))->query_letter, hmmp->alphabet_4, hmmp->a_size_4); if(a_index_4 < 0) { a_index_4 = hmmp->a_size_4; /* if letter is wild card, use default column in subst matrix */ } } } /* calculate sum of probabilities */ for(v = 0; v < hmmp->nr_v - 1; v++) /* v = from-vertex */{#ifdef DEBUG_BW printf("prob passing through vertex %d:\n", v);#endif res = 0; wp = *(hmmp->tot_to_trans_array + v); while(wp->vertex != END) /* w = to-vertex */ { /* probability of transiting from v to w on this particular path */ t_res2 = *((hmmp->tot_transitions) + get_mtx_index(v, wp->vertex, hmmp->nr_v)); if(t_res2 < 0) { printf("found model transition prob from %d to %d < 0.0\n", v, wp->vertex); exit(0); } t_res1 = (prev_rowp + wp->vertex)->prob; /* probability of having produced the * sequence after c passing through * vertex w */ /* calculate the prob of producing letters l in v*/ t_res3 = get_msa_emission_score_multi(msa_seq_infop, hmmp, c, wp->vertex, use_labels, normalize, a_index, a_index_2, a_index_3, a_index_4, scoring_method, use_gap_shares, NO, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); res += t_res1 * t_res2 * t_res3; /* multiply the probabilities and * add to previous sum */ wp++; }#ifdef DEBUG_BW printf("prev = %f: ", t_res1); printf("trans = %f\n", t_res2);#endif /* save result in matrices */ (cur_rowp + v)->prob = res;#ifdef DEBUG_BW printf("res = %f\n", res);#endif } /* scale the results using the scaling factors produced by forward() */#ifdef DEBUG_BW printf("c = %d\n", c); dump_scaling_array(seq_len + 1, scale_fsp);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -