📄 core_algorithms_multialpha.c
字号:
#include <stdio.h>#include <math.h>#include <stdlib.h>#include <string.h>#include "structs.h"#include "funcs.h"#define V_LIST_END -99#define V_LIST_NEXT -9//#define DEBUG_LABELING_UPDATE//#define DEBUG_DEALLOCATE_LABELINGS//#define DEBUG_FW//#define DEBUG_BW//#define DEBUG_VI//#define PATHextern int verbose;double dot_product_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method);double dot_prouct_picasso_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4);double picasso_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4);double picasso_sym_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4);double sjolander_score_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method);double sjolander_reversed_score_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method);double subst_mtx_product_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int normalize, int multi_scoring_method);double subst_mtx_dot_product_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int a_index, int a_index_2, int a_index_3, int a_index_4, int normalize, int multi_scoring_method);double subst_mtx_dot_product_prior_multi(struct hmm_multi_s *hmmp, int use_prior_shares, int use_gap_shares, struct msa_sequences_multi_s *msa_seq_infop, int c, int v, int a_index, int a_index_2, int a_index_3, int a_index_4, int normalize, int multi_scoring_method);double get_msa_emission_score_multi(struct msa_sequences_multi_s *msa_seq_infop, struct hmm_multi_s *hmmp, int c, int v, int use_labels, int normalize, int a_index, int a_index_2, int a_index_3, int a_index_4, int scoring_method, int use_gap_shares, int use_prior_shares, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4);double get_single_emission_score_multi(struct hmm_multi_s *hmmp, struct letter_s *seq, struct letter_s *seq_2, struct letter_s *seq_3, struct letter_s *seq_4, int c, int v, int replacement_letter_c, int replacement_letter_c_2, int replacement_letter_c_3, int replacement_letter_c_4, int use_labels, int a_index, int a_index_2, int a_index_3, int a_index_4, int multi_scoring_method);/************************* the forward algorithm **********************************/int forward_multi(struct hmm_multi_s *hmmp, struct letter_s *s, struct letter_s *s_2, struct letter_s *s_3, struct letter_s *s_4, struct forward_s **ret_forw_mtxpp, double **ret_scale_fspp, int use_labels, int multi_scoring_method){ struct forward_s *forw_mtxp, *cur_rowp, *prev_rowp; /* pointers to forward matrix */ double *scale_fsp; /* pointer to array of scaling factors */ struct letter_s *seq, *seq_2, *seq_3, *seq_4; /* pointer to the sequence */ int i,j; /* loop variables */ int seq_len; /* length of the sequence */ int c, v, w; /* looping indices, c loops over the sequence, v and w over the vertices in the HMM */ double row_sum, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */ double t_res3_1, t_res3_2, t_res3_3, t_res3_4; int nr_v; struct path_element *wp; /* for traversing the paths from v to w */ int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */ int replacement_letter_c, replacement_letter_c_2, replacement_letter_c_3, replacement_letter_c_4; #ifdef DEBUG_FW printf("running forward\n");#endif /* Allocate memory for matrix and scaling factors + some initial setup: * Note 1: forward probability matrix has the sequence indices vertically * and the vertex indices horizontally meaning it will be filled row by row * Note 2: *ret_forw_mtxpp and *ret_scale_fspp are allocated here, but must be * freed by caller */ nr_v = hmmp->nr_v; seq_len = get_seq_length(s); *ret_forw_mtxpp = (struct forward_s*)(malloc_or_die((seq_len+2) * hmmp->nr_v * sizeof(struct forward_s))); forw_mtxp = *ret_forw_mtxpp; *ret_scale_fspp = (double*)(malloc_or_die((seq_len+2) * sizeof(double))); scale_fsp = *ret_scale_fspp; /* Convert sequence to 1...L for easier indexing */ seq = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq+1, s, seq_len * sizeof(struct letter_s)); if(hmmp->nr_alphabets > 1) { seq_2 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_2+1, s_2, seq_len * sizeof(struct letter_s)); } if(hmmp->nr_alphabets > 2) { seq_3 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_3+1, s_3, seq_len * sizeof(struct letter_s)); } if(hmmp->nr_alphabets > 3) { seq_4 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_4+1, s_4, seq_len * sizeof(struct letter_s)); } /* Initialize the first row of the matrix */ forw_mtxp->prob = 1.0; /* sets index (0,0) to 1.0, the rest are already 0.0 as they should be */ *scale_fsp = 1.0; /* Fill in middle rows */ prev_rowp = forw_mtxp; cur_rowp = forw_mtxp + hmmp->nr_v; for(c = 1; c <= seq_len; c++) { /* get alphabet index for c*/ if(hmmp->alphabet_type == DISCRETE) { replacement_letter_c = NO; a_index = get_alphabet_index(&seq[c], hmmp->alphabet, hmmp->a_size); if(a_index < 0) { a_index = get_replacement_letter_index_multi(&seq[c], hmmp->replacement_letters, 1); if(a_index < 0) { printf("Letter '%s' is not in alphabet\n", (&seq[c])->letter); return NOPROB; } replacement_letter_c = YES; } } if(hmmp->nr_alphabets > 1) { if(hmmp->alphabet_type_2 == DISCRETE) { replacement_letter_c_2 = NO; a_index_2 = get_alphabet_index(&seq_2[c], hmmp->alphabet_2, hmmp->a_size_2); if(a_index_2 < 0) { a_index_2 = get_replacement_letter_index_multi(&seq_2[c], hmmp->replacement_letters, 2); if(a_index_2 < 0) { printf("Letter '%s' is not in alphabet\n", (&seq_2[c])->letter); return NOPROB; } replacement_letter_c_2 = YES; } } } if(hmmp->nr_alphabets > 2) { if(hmmp->alphabet_type_3 == DISCRETE) { replacement_letter_c_3 = NO; a_index_3 = get_alphabet_index(&seq_3[c], hmmp->alphabet_3, hmmp->a_size_3); if(a_index_3 < 0) { a_index_3 = get_replacement_letter_index_multi(&seq_3[c], hmmp->replacement_letters, 3); if(a_index_3 < 0) { printf("Letter '%s' is not in alphabet\n", (&seq_3[c])->letter); return NOPROB; } replacement_letter_c_3 = YES; } } } if(hmmp->nr_alphabets > 3) { if(hmmp->alphabet_type_4 == DISCRETE) { replacement_letter_c_4 = NO; a_index_4 = get_alphabet_index(&seq_4[c], hmmp->alphabet_4, hmmp->a_size_4); if(a_index_4 < 0) { a_index_4 = get_replacement_letter_index_multi(&seq_4[c], hmmp->replacement_letters, 4); if(a_index_4 < 0) { printf("Letter '%s' is not in alphabet\n", (&seq_4[c])->letter); return NOPROB; } replacement_letter_c_4 = YES; } } }#ifdef DEBUG_FW printf("seq[c] = %s\n", &seq[c]); if(hmmp->alphabet_type == DISCRETE) { printf("a_index = %d\n", a_index); }#endif /* calculate sum of probabilities */ row_sum = 0; for(v = 1; v < hmmp->nr_v - 1; v++) /* v = to-vertex */ {#ifdef DEBUG_FW printf("prob to vertex %d\n", v);#endif res = 0.0; wp = *(hmmp->tot_from_trans_array + v); while((w = wp->vertex) != END) /* w = from-vertex */ { /* calculate intermediate results */ res += (prev_rowp + w)->prob * *(hmmp->tot_transitions + (w * nr_v + v)); if(*(hmmp->tot_transitions + (w * nr_v + v)) < 0) { printf("found model transition prob from %d to %d < 0.0\n", w, v); exit(0); } wp++;#ifdef DEBUG_FW printf("prev = %f: ", (prev_rowp + w)->prob); printf("trans = %f\n", *(hmmp->tot_transitions + (w * nr_v + v)));#endif } /* calculate prob of producing letter l in v */ t_res3 = get_single_emission_score_multi(hmmp, seq, seq_2, seq_3, seq_4, c, v, replacement_letter_c, replacement_letter_c_2, replacement_letter_c_3, replacement_letter_c_4, use_labels, a_index, a_index_2, a_index_3, a_index_4, multi_scoring_method); res = res * t_res3; row_sum += res; /* save result in matrices */ (cur_rowp + v)->prob = res; #ifdef DEBUG_FW printf("res = %f\n", res);#endif } /* scale the results, row_sum = the total probability of * having produced the sequence up to and including character c */ scale_fsp[c] = row_sum;#ifdef DEBUG_FW printf("rowsum = %f\n",row_sum); printf("scaling set to: %f\n", scale_fsp[c]);#endif if(row_sum == 0.0) { printf("Sequence cannot be produced by this hmm\n"); sequence_as_string(s); printf("pos = %d\n",c); return NOPROB; } for(v = 0; v < hmmp->nr_v; v++) { if((cur_rowp + v)->prob != 0){ (cur_rowp + v)->prob = ((cur_rowp + v)->prob)/row_sum; /* scaling */ } } /* move row pointers one row forward */ prev_rowp = cur_rowp; cur_rowp = cur_rowp + hmmp->nr_v; } /* Fill in transition to end state */ res = 0; wp = *(hmmp->tot_from_trans_array + hmmp->nr_v - 1); while(wp->vertex != END) { t_res1 = (prev_rowp + wp->vertex)->prob; t_res2 = *((hmmp->tot_transitions) + get_mtx_index(wp->vertex, hmmp->nr_v-1, hmmp->nr_v)); if(t_res2 > 1.0) { t_res2 = 1.0; } res += t_res1 * t_res2; wp++; }#ifdef DEBUG_FW printf("res = %f\n", res);#endif (cur_rowp + hmmp->nr_v - 1)->prob = res; /* obs: no scaling performed here */#ifdef DEBUG_FW dump_forward_matrix(seq_len + 2, hmmp->nr_v, forw_mtxp); dump_scaling_array(seq_len + 1, scale_fsp);#endif /* Garbage collection and return */ free(seq); if(hmmp->nr_alphabets > 1) { free(seq_2); } if(hmmp->nr_alphabets > 2) { free(seq_3); } if(hmmp->nr_alphabets > 3) { free(seq_4); } return OK;}/**************************** the backward algorithm **********************************/int backward_multi(struct hmm_multi_s *hmmp, struct letter_s *s, struct letter_s *s_2, struct letter_s *s_3, struct letter_s *s_4, struct backward_s **ret_backw_mtxpp, double *scale_fsp, int use_labels, int multi_scoring_method){ struct backward_s *backw_mtxp, *cur_rowp, *prev_rowp; /* pointers to backward matrix */ struct letter_s *seq, *seq_2, *seq_3, *seq_4; /* pointer to the sequence */ int i; /* loop index */ int seq_len; /* length of the sequence */ int c, v, w; /* looping indices, c loops over the sequence, v and w over the vertices in the HMM */ int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */ double row_sum, res,t_res1,t_res2, t_res3; /* temporary variables to calculate results */ double t_res3_1, t_res3_2, t_res3_3, t_res3_4; struct path_element *wp; int replacement_letter_c, replacement_letter_c_2, replacement_letter_c_3, replacement_letter_c_4; /* Allocate memory for matrix + some initial setup: * Note 1: probability matrix has the sequence indices vertically * and the vertex indices horizontally meaning it will be filled row by row * Note 2: *ret_backw_mtxpp is allocated here, but must be * freed by caller * Note 3: *scale_fspp is the scaling array produced by forward() meaning backward() * must be called after forward() */ seq_len = get_seq_length(s); *ret_backw_mtxpp = (struct backward_s*)(malloc_or_die((seq_len+2) * hmmp->nr_v * sizeof(struct backward_s))); backw_mtxp = *ret_backw_mtxpp; /* Convert sequence to 1...L for easier indexing */ seq = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq+1, s, seq_len * sizeof(struct letter_s)); if(hmmp->nr_alphabets > 1) { seq_2 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_2+1, s_2, seq_len * sizeof(struct letter_s)); } if(hmmp->nr_alphabets > 2) { seq_3 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_3+1, s_3, seq_len * sizeof(struct letter_s)); } if(hmmp->nr_alphabets > 3) { seq_4 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s))); memcpy(seq_4+1, s_4, seq_len * sizeof(struct letter_s)); } /* Initialize the last row of the matrix */ (backw_mtxp + get_mtx_index(seq_len + 1, hmmp->nr_v - 1, hmmp->nr_v))->prob = 1.0; /* sets index * (seq_len+1,nr_v-1) i.e. * the lower right * corner of the matrix to * 1.0,the rest are already * 0.0 as they should be*/ /* Fill in next to last row in matrix (i.e. add prob for path to end state for all states * that have a transition path to the end state) */ prev_rowp = backw_mtxp + get_mtx_index(seq_len + 1 , 0, hmmp->nr_v); cur_rowp = prev_rowp - hmmp->nr_v; wp = *(hmmp->from_trans_array + hmmp->nr_v - 1); while(wp->vertex != END) { w = wp->vertex; t_res1 = 1.0; while(wp->next != NULL) { t_res1 = t_res1 * *(hmmp->transitions + get_mtx_index(wp->vertex, (wp + 1)->vertex, hmmp->nr_v)); wp++; } (cur_rowp + w)->prob += t_res1; if((cur_rowp + w)->prob > 1.0) { (cur_rowp + w)->prob = 1.0; } wp++; } prev_rowp = cur_rowp; cur_rowp = cur_rowp - hmmp->nr_v; /* Fill in first rows moving upwards in matrix */ for(c = seq_len; c >= 1; c--) { /* get alphabet index for c */#ifdef DEBUG_BW printf("c = %d\n", c);#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -