📄 core_algorithms_multialpha.c
字号:
#endif for(v = 0; v < hmmp->nr_v; v++) { if((cur_rowp + v)->prob != 0){ if(use_lead_columns == YES) { if(scale_fsp[msa_seq_infop->nr_lead_columns + 1 - j] != 0.0) { (cur_rowp + v)->prob = ((cur_rowp + v)->prob)/scale_fsp[msa_seq_infop->nr_lead_columns + 1 - j]; /* scaling */ } else { printf("This msa cannot be produced by hmm\n"); return NOPROB; } } else { if(scale_fsp[c+1] != 0.0) { (cur_rowp + v)->prob = ((cur_rowp + v)->prob)/scale_fsp[c+1]; /* scaling */ } else { printf("This msa cannot be produced by hmm\n"); return NOPROB; } } } } /* move row pointers one row backward */ prev_rowp = cur_rowp; cur_rowp = cur_rowp - hmmp->nr_v; /* update current column */ if(use_lead_columns == YES) { if(c == *(msa_seq_infop->lead_columns_start)) { all_columns = YES; } j++; c = *(msa_seq_infop->lead_columns_end - j); } else { c = c - 1; } }#ifdef DEBUG_BW dump_backward_matrix(seq_len + 2, hmmp->nr_v, backw_mtxp); dump_scaling_array(seq_len + 1, scale_fsp);#endif /* Garbage collection and return */ return OK;}/************************* the msa_viterbi algorithm **********************************/int msa_viterbi_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns, int use_gap_shares, int use_prior_shares, struct viterbi_s **ret_viterbi_mtxpp, int use_labels, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){ struct viterbi_s *viterbi_mtxp, *cur_rowp, *prev_rowp; /* pointers to viterbi matrix */ int seq_len; /* length of the sequence */ int c, v, w; /* looping indices, c loops over the sequence, v and w over the vertices in the HMM */ int i,j; /* general loop indices */ int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */ double max, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */ struct path_element *wp, *from_vp, *prevp; /* for traversing the paths from v to w */ int nr_v; int prev; double seq_normalizer, state_normalizer; /* Allocate memory for matrix + some initial setup: * Note 1: viterbi probability matrix has the sequence indices vertically * and the vertex indices horizontally meaning it will be filled row by row * Note 2: *ret_viterbi_mtxpp and *ret_scale_fspp are allocated here, but must be * freed by caller */ nr_v = hmmp->nr_v; if(use_lead_columns == YES) { seq_len = msa_seq_infop->nr_lead_columns; } else { seq_len = msa_seq_infop->msa_seq_length; } *ret_viterbi_mtxpp = (struct viterbi_s*)(malloc_or_die((seq_len+2) * hmmp->nr_v * sizeof(struct viterbi_s))); init_viterbi_s_mtx(*ret_viterbi_mtxpp, DEFAULT, (seq_len+2) * hmmp->nr_v); viterbi_mtxp = *ret_viterbi_mtxpp; /* Initialize the first row of the matrix */ viterbi_mtxp->prob = 0.0; /* sets index (0,0) to 0.0 (i.e. prob 1.0), the rest are already DEFAULT as they should be */ /* Fill in middle rows */ prev_rowp = viterbi_mtxp; cur_rowp = viterbi_mtxp + hmmp->nr_v; j = 0; if(use_lead_columns == YES) { c = *(msa_seq_infop->lead_columns_start + j); } else { c = 0; } while(c != END && c < msa_seq_infop->msa_seq_length) { a_index_2 = -1; a_index_3 = -1; a_index_4 = -1; if(hmmp->alphabet_type == DISCRETE) { a_index = get_alphabet_index((msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->query_letter, hmmp->alphabet, hmmp->a_size); if(a_index < 0) { a_index = hmmp->a_size; /* if letter is wild card, use default column in subst matrix */ } } if(hmmp->nr_alphabets > 1) { if(hmmp->alphabet_type_2 == DISCRETE) { a_index_2 = get_alphabet_index((msa_seq_infop->msa_seq_2 + (c * (hmmp->a_size_2+1)))->query_letter, hmmp->alphabet_2, hmmp->a_size_2); if(a_index_2 < 0) { a_index_2 = hmmp->a_size_2; /* if letter is wild card, use default column in subst matrix */ } } } if(hmmp->nr_alphabets > 2) { if(hmmp->alphabet_type_3 == DISCRETE) { a_index_3 = get_alphabet_index((msa_seq_infop->msa_seq_3 + (c * (hmmp->a_size_3+1)))->query_letter, hmmp->alphabet_3, hmmp->a_size_3); if(a_index_3 < 0) { a_index_3 = hmmp->a_size_3; /* if letter is wild card, use default column in subst matrix */ } } } if(hmmp->nr_alphabets > 3) { if(hmmp->alphabet_type_4 == DISCRETE) { a_index_4 = get_alphabet_index((msa_seq_infop->msa_seq_4 + (c * (hmmp->a_size_4+1)))->query_letter, hmmp->alphabet_4, hmmp->a_size_4); if(a_index_4 < 0) { a_index_4 = hmmp->a_size_4; /* if letter is wild card, use default column in subst matrix */ } } } /* calculate sum of probabilities */#ifdef DEBUG_VI printf("label nr %d for pos %d = %c\n", j, c, (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label);#endif for(v = 1; v < hmmp->nr_v - 1; v++) /* v = to-vertex */ {#ifdef DEBUG_VI printf("prob to vertex %d:\n", v);#endif prev = NO_PREV; max = DEFAULT; wp = *(hmmp->tot_from_trans_array + v); res = 0; while((w = wp->vertex) != END) /* w = from-vertex */ { /* calculate intermediate results */ from_vp = wp; t_res1 = (prev_rowp + w)->prob; /* probability of having produced the * sequence up to the last letter ending * in vertex w */ /* calculate intermediate results */ t_res2 = *(hmmp->max_log_transitions + (w * hmmp->nr_v + v)); if(t_res1 != DEFAULT && t_res2 != DEFAULT) { res = t_res1 + t_res2; } else { res = DEFAULT; } if(prev == NO_PREV && res != DEFAULT && (use_labels == NO || (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == *(hmmp->vertex_labels + v) || (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == '.')) { max = res; prev = from_vp->vertex; prevp = from_vp; continue; } if(res > max && res != DEFAULT && (use_labels == NO || (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == *(hmmp->vertex_labels + v) || (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == '.')) { max = res; prev = from_vp->vertex; prevp = from_vp; } wp++;#ifdef DEBUG_VI printf("prev in pos(%d) = %f: ",from_vp->vertex, t_res1); printf("trans from %d to %d = %f\n", from_vp->vertex, v, t_res2); printf("tot score = %f\n", res);#endif } #ifdef DEBUG_VI printf("max before setting: %f\n", max);#endif /* calculate the prob of producing letters l in v*/ t_res3 = get_msa_emission_score_multi(msa_seq_infop, hmmp, c, v, use_labels, normalize, a_index, a_index_2, a_index_3, a_index_4, scoring_method, use_gap_shares, use_prior_shares, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); if(t_res3 == 0.0) { max = DEFAULT; } else { t_res3 = log10(t_res3); if(max != DEFAULT) { max = max + t_res3; } } /* save result in matrices */ (cur_rowp + v)->prob = max; (cur_rowp + v)->prev = prev; (cur_rowp + v)->prevp = prevp;#ifdef DEBUG_VI printf("max after setting: %f\n", max);#endif } /* move row pointers one row viterbi */ prev_rowp = cur_rowp; cur_rowp = cur_rowp + hmmp->nr_v; /* update current column */ if(use_lead_columns == YES) { j++; c = *(msa_seq_infop->lead_columns_start + j); } else { c++; } } /* Fill in transition to end state */#ifdef DEBUG_VI printf("\ntransition to end state:\n");#endif max = DEFAULT; prev = NO_PREV; prevp = NULL; wp = *(hmmp->from_trans_array + hmmp->nr_v-1); while(wp->vertex != END) /* w = from-vertex */ { from_vp = wp; t_res1 = (prev_rowp + wp->vertex)->prob; /* probability of having produced the * sequence up to the last letter ending * in vertex w */ t_res2 = 0.0; while(wp->next != NULL) { t_res2 = t_res2 + *((hmmp->log_transitions) + get_mtx_index(wp->vertex, (wp+1)->vertex, hmmp->nr_v)); wp++; } t_res2 = t_res2 + *((hmmp->log_transitions) + get_mtx_index(wp->vertex, hmmp->nr_v - 1, hmmp->nr_v)); #ifdef DEBUG_VI printf("prev = %f: ", t_res1); printf("trans = %f\n", t_res2);#endif if(t_res1 != DEFAULT && t_res2 != DEFAULT) { res = t_res1 + t_res2; } else { res = DEFAULT; } if(prev == NO_PREV && res != DEFAULT) { prev = from_vp->vertex; prevp = from_vp; max = res; continue; } if(res > max && res != DEFAULT) { prev = from_vp->vertex; prevp = from_vp; max = res; } wp++; }#ifdef DEBUG_VI printf("res = %f\n", max);#endif if(max != DEFAULT) { (cur_rowp + hmmp->nr_v-1)->prob = max; (cur_rowp + hmmp->nr_v-1)->prev = prev; (cur_rowp + hmmp->nr_v-1)->prevp = prevp; } else {#ifdef DEBUG_VI dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions); dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->log_transitions);#endif printf("This msa cannot be produced by hmm\n"); printf("inne\n"); return NOPROB; } #ifdef PATH //dump_viterbi_matrix(seq_len + 2, hmmp->nr_v, viterbi_mtxp); //dump_viterbi_path((cur_rowp + w)->prevp); printf("normalized log likelihood for most probable path = %f\n", 0.0 - (((cur_rowp + hmmp->nr_v - 1)->prob) / seq_len)); printf("and most probable path is: "); dump_viterbi_path((cur_rowp + hmmp->nr_v - 1), hmmp, viterbi_mtxp, seq_len + 1, hmmp->nr_v); printf("%d\n",hmmp->nr_v - 1); printf("log prob = %f\n", (cur_rowp + hmmp->nr_v-1)->prob); printf("real prob = %f\n", pow(10,(cur_rowp + hmmp->nr_v-1)->prob)); dump_viterbi_label_path((cur_rowp + hmmp->nr_v - 1), hmmp, viterbi_mtxp, seq_len + 1, hmmp->nr_v); printf("\n");#endif /* Garbage collection and return */ return OK;}/************************* the msa_one_best algorithm **********************************/int msa_one_best_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns, int use_gap_shares, int use_prior_shares, struct one_best_s **ret_one_best_mtxpp, double **ret_scale_fspp, int use_labels, char *best_labeling, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){ struct one_best_s *one_best_mtxp, *cur_rowp, *prev_rowp; /* pointers to one_best matrix */ double *scale_fsp; /* pointer to array of scaling factors */ int seq_len; /* length of the sequence */ int c, v, w; /* looping indices, c loops over the sequence, v and w over the vertices in the HMM */ int i,j; /* general loop indices */ int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */ double row_sum, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */ struct path_element *wp; /* for traversing the paths from v to w */ int nr_v; double scaled_result; int *sorted_v_list; int v_list_index; /* Allocate memory for matrix and scaling factors + some initial setup: * Note 1: one_best probability matrix has the sequence indices vertically * and the vertex indices horizontally meaning it will be filled row by row * Note 2: *ret_one_best_mtxpp and *ret_scale_fspp are allocated here, but must be * freed by caller */ nr_v = hmmp->nr_v; if(use_lead_columns == YES) { seq_len = msa_seq_infop->nr_lead_columns; } else { seq_len = msa_seq_infop->msa_seq_length; } *ret_one_best_mtxpp = (struct one_best_s*)(malloc_or_die((seq_len+2) * hmmp->nr_v * sizeof(struct one_best_s))); one_best_mtxp = *ret_one_best_mtxpp; *ret_scale_fspp = (double*)(malloc_or_die((seq_len+2) * sizeof(double))); scale_fsp = *ret_scale_fspp; sorted_v_list = (int*)(malloc_or_die((hmmp->nr_v * 2 + 1) * sizeof(int))); /* Initialize the first row of the matrix */ one_best_mtxp->prob = 1.0; /* sets index (0,0) to 1.0, the rest are already 0.0 as they should be */ one_best_mtxp->labeling = (char*)(malloc_or_die(1 * sizeof(char))); for(i = 1; i < hmmp->nr_v; i++) { (one_best_mtxp+i)->labeling = NULL; } *scale_fsp = 1.0; /* create initial sorted v-list*/ *(sorted_v_list) = 0; /* 0 is always the number of the start st
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -