📄 core_algorithms_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
/************************************************************************************* ************************** the forward, backward and viterbi algorithms ************* ************************** for scoring an msa against the hmm *********************** *************************************************************************************//************************* the msa_forward algorithm **********************************/int msa_forward_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns,		      int use_gap_shares, int use_prior_shares, struct forward_s **ret_forw_mtxpp, double **ret_scale_fspp,		      int use_labels, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs,		      double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){  struct forward_s *forw_mtxp, *cur_rowp, *prev_rowp; /* pointers to forward matrix */  double *scale_fsp; /* pointer to array of scaling factors */  int seq_len; /* length of the sequence */  int c, v, w; /* looping indices, c loops over the sequence,		  v and w over the vertices in the HMM */  int i,j; /* general loop indices */  int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */  double row_sum, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */  struct path_element *wp; /* for traversing the paths from v to w */  int nr_v;#ifdef DEBUG_FW  printf("entering msa_forward\n");#endif  /* Allocate memory for matrix and scaling factors + some initial setup:   * Note 1: forward probability matrix has the sequence indices vertically   * and the vertex indices horizontally meaning it will be filled row by row   * Note 2: *ret_forw_mtxpp and *ret_scale_fspp are allocated here, but must be   * freed by caller */  nr_v = hmmp->nr_v;  if(use_lead_columns == YES) {    seq_len = msa_seq_infop->nr_lead_columns;  }  else {    seq_len = msa_seq_infop->msa_seq_length;  }  *ret_forw_mtxpp = (struct forward_s*)(malloc_or_die((seq_len+2) *						      hmmp->nr_v *						      sizeof(struct forward_s)));  forw_mtxp = *ret_forw_mtxpp;  *ret_scale_fspp = (double*)(malloc_or_die((seq_len+2) * sizeof(double)));  scale_fsp = *ret_scale_fspp;  /* Initialize the first row of the matrix */  forw_mtxp->prob = 1.0; /* sets index (0,0) to 1.0,			    the rest are already 0.0 as they should be */  *scale_fsp = 1.0;    /* Fill in middle rows */  prev_rowp = forw_mtxp;  cur_rowp = forw_mtxp + hmmp->nr_v;  j = 0;  if(use_lead_columns == YES) {    c = *(msa_seq_infop->lead_columns_start + j);  }  else {    c = 0;  }  while(c != END && c < msa_seq_infop->msa_seq_length)  {    a_index_2 = -1;    a_index_3 = -1;    a_index_4 = -1;        if(hmmp->alphabet_type == DISCRETE) {            a_index = get_alphabet_index((msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->query_letter, hmmp->alphabet, hmmp->a_size);      if(a_index < 0) {	a_index = hmmp->a_size; /* if letter is wild card, use default column in subst matrix */      }    }    if(hmmp->nr_alphabets > 1) {      if(hmmp->alphabet_type_2 == DISCRETE) {	a_index_2 = get_alphabet_index((msa_seq_infop->msa_seq_2 + (c * (hmmp->a_size_2+1)))->query_letter,				       hmmp->alphabet_2, hmmp->a_size_2);	if(a_index_2 < 0) {	  a_index_2 = hmmp->a_size_2; /* if letter is wild card, use default column in subst matrix */	}      }    }    if(hmmp->nr_alphabets > 2) {      if(hmmp->alphabet_type_3 == DISCRETE) {	a_index_3 = get_alphabet_index((msa_seq_infop->msa_seq_3 + (c * (hmmp->a_size_3+1)))->query_letter,				       hmmp->alphabet_3, hmmp->a_size_3);	if(a_index_3 < 0) {	  a_index_3 = hmmp->a_size_3; /* if letter is wild card, use default column in subst matrix */	}      }    }    if(hmmp->nr_alphabets > 3) {      if(hmmp->alphabet_type_4 == DISCRETE) {	a_index_4 = get_alphabet_index((msa_seq_infop->msa_seq_4 + (c * (hmmp->a_size_4+1)))->query_letter,				       hmmp->alphabet_4, hmmp->a_size_4);	if(a_index_4 < 0) {	  a_index_4 = hmmp->a_size_4; /* if letter is wild card, use default column in subst matrix */	}      }    }    /* calculate sum of probabilities */    row_sum = 0;    for(v = 1; v < hmmp->nr_v - 1; v++) /* v = to-vertex */ {#ifdef DEBUG_FW      printf("prob to vertex %d:\n", v);#endif      res = 0;      wp = *(hmmp->tot_from_trans_array + v);      while((w = wp->vertex) != END) /* w = from-vertex */ {	/* calculate intermediate results */	res += (prev_rowp + w)->prob * *((hmmp->tot_transitions) + (w * nr_v + v));	if(*((hmmp->tot_transitions) + (w * nr_v + v)) < 0) {	  printf("found model transition prob from %d to %d < 0.0\n", w, v);	  exit(0);	}	wp++;#ifdef DEBUG_FW	printf("prev = %f: ",(prev_rowp + w)->prob );	printf("trans = %f\n",*((hmmp->tot_transitions) + (w * nr_v + v)) );#endif      }            /* calculate the prob of producing letters l in v*/      t_res3 = get_msa_emission_score_multi(msa_seq_infop, hmmp, c, v, use_labels, normalize, a_index, a_index_2,					    a_index_3, a_index_4, scoring_method, use_gap_shares,					    use_prior_shares, multi_scoring_method, aa_freqs, aa_freqs_2,					    aa_freqs_3, aa_freqs_4);            res = res * t_res3;      row_sum += res;      /* save result in matrices */      (cur_rowp + v)->prob = res;#ifdef DEBUG_FW      printf("letter = %f\n", t_res3);      printf("res = %f\n", res);#endif    }        /* scale the results, row_sum = the total probability of      * having produced the sequence up to and including character c */     if(use_lead_columns == YES) {      scale_fsp[j+1] = row_sum;    }    else {      scale_fsp[c+1] = row_sum;    }    #ifdef DEBUG_FW    printf("rowsum for row %d = %f\n", c, row_sum);    printf("scaling set to: %f\n", scale_fsp[c+1]);#endif    if(row_sum == 0.0) {      printf("Probability for this msa = 0.0, pos = %d\n", c);      printf("in forward\n");      exit(0);      return NOPROB;    }    for(v = 0; v < hmmp->nr_v; v++) {      if((cur_rowp + v)->prob != 0){	(cur_rowp + v)->prob = ((cur_rowp + v)->prob)/row_sum; /* scaling */      }    }        /* move row pointers one row forward */    prev_rowp = cur_rowp;    cur_rowp = cur_rowp + hmmp->nr_v;    /* update current column */    if(use_lead_columns == YES) {      j++;      c = *(msa_seq_infop->lead_columns_start + j);    }    else {      c++;    }  }  /* Fill in transition to end state */#ifdef DEBUG_FW  printf("\ntransition to end state:\n");#endif  res = 0;  wp = *(hmmp->tot_from_trans_array + hmmp->nr_v - 1);  while(wp->vertex != END) {    t_res1 = (prev_rowp + wp->vertex)->prob;    t_res2 = *((hmmp->tot_transitions) + get_mtx_index(wp->vertex, hmmp->nr_v - 1, hmmp->nr_v));    if(t_res2 > 1.0) {      t_res2 = 1.0;    }    res += t_res1 * t_res2;       wp++;  }#ifdef DEBUG_FW  printf("res = %f\n", res);#endif  (cur_rowp + hmmp->nr_v - 1)->prob = res; /* obs: no scaling performed here */#ifdef DEBUG_FW  dump_forward_matrix(seq_len + 2, hmmp->nr_v, forw_mtxp);  dump_scaling_array(seq_len + 1, scale_fsp);  printf("exiting msa_forward\n\n\n");#endif  /* Garbage collection and return */  return OK;}/**************************** the msa_backward algorithm **********************************/int msa_backward_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns,		       int use_gap_shares, struct backward_s **ret_backw_mtxpp, double *scale_fsp, int use_labels, int normalize,		       int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3,		       double *aa_freqs_4){  struct backward_s *backw_mtxp, *cur_rowp, *prev_rowp; /* pointers to backward matrix */  int seq_len; /* length of the sequence */  int c, v, w; /* looping indices, c loops over the sequence,		  v and w over the vertices in the HMM */  int i,j; /* general loop indices */  int all_columns; /* boolean for checking whether all columns have been looped over */  int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */  double row_sum, res,t_res1,t_res2, t_res3; /* temporary variables to calculate results */  struct path_element *wp;    /* Allocate memory for matrix + some initial setup:   * Note 1: probability matrix has the sequence indices vertically   * and the vertex indices horizontally meaning it will be filled row by row   * Note 2: *ret_backw_mtxpp is allocated here, but must be   * freed by caller   * Note 3: *scale_fspp is the scaling array produced by forward() meaning backward()   * must be called after forward() */  if(use_lead_columns == YES) {    seq_len = msa_seq_infop->nr_lead_columns;  }  else {    seq_len = msa_seq_infop->msa_seq_length;  }  *ret_backw_mtxpp = (struct backward_s*)(malloc_or_die((seq_len+2) *							hmmp->nr_v * 							sizeof(struct backward_s)));  backw_mtxp = *ret_backw_mtxpp;    /* Initialize the last row of the matrix */  (backw_mtxp + get_mtx_index(seq_len + 1, hmmp->nr_v - 1,			      hmmp->nr_v))->prob = 1.0; /* sets index							 * (seq_len+1,nr_v-1) i.e.							 * the lower right							 * corner of the matrix to 							 * 1.0,the rest are already							 * 0.0 as they should be*/   /* Fill in next to last row in matrix (i.e. add prob for path to end state for all states   * that have a transition path to the end state) */  prev_rowp = backw_mtxp + get_mtx_index(seq_len + 1, 0, hmmp->nr_v);  cur_rowp = prev_rowp - hmmp->nr_v;  wp = *(hmmp->from_trans_array + hmmp->nr_v - 1);  while(wp->vertex != END) {    w = wp->vertex;    t_res1 = 1.0;    while(wp->next != NULL) {      t_res1 = t_res1 * *(hmmp->transitions +			  get_mtx_index(wp->vertex, (wp + 1)->vertex, hmmp->nr_v));      wp++;    }    (cur_rowp + w)->prob += t_res1;    if((cur_rowp + w)->prob > 1.0) {      (cur_rowp + w)->prob = 1.0;    }    wp++;  }  prev_rowp = cur_rowp;  cur_rowp = cur_rowp - hmmp->nr_v;  /* Fill in first rows moving upwards in matrix */  all_columns = NO;  j = 1;  if(use_lead_columns == YES) {    c = *(msa_seq_infop->lead_columns_end - j);  }  else {    c = seq_len - 1;  }  while(c >= 0 && all_columns == NO) {    a_index_2 = -1;    a_index_3 = -1;    a_index_4 = -1;    if(hmmp->alphabet_type == DISCRETE) {      a_index = get_alphabet_index((msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->query_letter, hmmp->alphabet, hmmp->a_size);      if(a_index < 0) {	a_index = hmmp->a_size; /* if letter is wild card, use default column in subst matrix */      }    }    if(hmmp->nr_alphabets > 1) {      if(hmmp->alphabet_type_2 == DISCRETE) {	a_index_2 = get_alphabet_index((msa_seq_infop->msa_seq_2 + (c * (hmmp->a_size_2+1)))->query_letter,				       hmmp->alphabet_2, hmmp->a_size_2);	if(a_index_2 < 0) {	  a_index_2 = hmmp->a_size_2; /* if letter is wild card, use default column in subst matrix */	}      }    }    if(hmmp->nr_alphabets > 2) {      if(hmmp->alphabet_type_3 == DISCRETE) {	a_index_3 = get_alphabet_index((msa_seq_infop->msa_seq_3 + (c * (hmmp->a_size_3+1)))->query_letter,				       hmmp->alphabet_3, hmmp->a_size_3);	if(a_index_3 < 0) {	  a_index_3 = hmmp->a_size_3; /* if letter is wild card, use default column in subst matrix */	}      }    }    if(hmmp->nr_alphabets > 3) {      if(hmmp->alphabet_type_4 == DISCRETE) {	a_index_4 = get_alphabet_index((msa_seq_infop->msa_seq_4 + (c * (hmmp->a_size_4+1)))->query_letter,				       hmmp->alphabet_4, hmmp->a_size_4);	if(a_index_4 < 0) {	  a_index_4 = hmmp->a_size_4; /* if letter is wild card, use default column in subst matrix */	}      }    }    /* calculate sum of probabilities */    for(v = 0; v < hmmp->nr_v - 1; v++) /* v = from-vertex */{#ifdef DEBUG_BW      printf("prob passing through vertex %d:\n", v);#endif            res = 0;      wp = *(hmmp->tot_to_trans_array + v);      while(wp->vertex != END) 	/* w = to-vertex */ {	/* probability of transiting from v to w on this particular path */	t_res2 = *((hmmp->tot_transitions) + get_mtx_index(v, wp->vertex, hmmp->nr_v));	if(t_res2 < 0) {	  printf("found model transition prob from %d to %d < 0.0\n", v, wp->vertex);	  exit(0);	}	t_res1 = (prev_rowp + wp->vertex)->prob; /* probability of having produced the						  * sequence after c passing through						  * vertex w */		/* calculate the prob of producing letters l in v*/	t_res3 = get_msa_emission_score_multi(msa_seq_infop, hmmp, c, wp->vertex, use_labels, normalize, a_index, a_index_2,					      a_index_3, a_index_4, scoring_method, use_gap_shares,					      NO, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4);			res += t_res1 * t_res2 * t_res3; /* multiply the probabilities and					  * add to previous sum */	wp++;      }#ifdef DEBUG_BW      printf("prev = %f: ", t_res1);      printf("trans = %f\n", t_res2);#endif                 /* save result in matrices */      (cur_rowp + v)->prob = res;#ifdef DEBUG_BW      printf("res = %f\n", res);#endif    }        /* scale the results using the scaling factors produced by forward() */#ifdef DEBUG_BW    printf("c = %d\n", c);    dump_scaling_array(seq_len + 1, scale_fsp);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -