📄 core_algorithms_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
#endif    for(v = 0; v < hmmp->nr_v; v++) {      if((cur_rowp + v)->prob != 0){	if(use_lead_columns == YES) {	  if(scale_fsp[msa_seq_infop->nr_lead_columns + 1 - j] != 0.0) { 	    (cur_rowp + v)->prob =	      ((cur_rowp + v)->prob)/scale_fsp[msa_seq_infop->nr_lead_columns + 1 - j]; /* scaling */	  }	  else {	    printf("This msa cannot be produced by hmm\n");	    return NOPROB;	  }	}	else {	  if(scale_fsp[c+1] != 0.0) { 	    (cur_rowp + v)->prob = ((cur_rowp + v)->prob)/scale_fsp[c+1]; /* scaling */	  }	  else {	    printf("This msa cannot be produced by hmm\n");	    return NOPROB;	  }	}      }    }        /* move row pointers one row backward */    prev_rowp = cur_rowp;    cur_rowp = cur_rowp - hmmp->nr_v;    /* update current column */    if(use_lead_columns == YES) {      if(c == *(msa_seq_infop->lead_columns_start)) {	all_columns = YES;      }      j++;      c = *(msa_seq_infop->lead_columns_end - j);    }    else {      c = c - 1;    }  }#ifdef DEBUG_BW  dump_backward_matrix(seq_len + 2, hmmp->nr_v, backw_mtxp);  dump_scaling_array(seq_len + 1, scale_fsp);#endif  /* Garbage collection and return */  return OK;}/************************* the msa_viterbi algorithm **********************************/int msa_viterbi_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns,		      int use_gap_shares, int use_prior_shares, struct viterbi_s **ret_viterbi_mtxpp, int use_labels, int normalize,		      int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3,		      double *aa_freqs_4){  struct viterbi_s *viterbi_mtxp, *cur_rowp, *prev_rowp; /* pointers to viterbi matrix */  int seq_len; /* length of the sequence */  int c, v, w; /* looping indices, c loops over the sequence,		  v and w over the vertices in the HMM */  int i,j; /* general loop indices */  int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */  double max, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */  struct path_element *wp, *from_vp, *prevp; /* for traversing the paths from v to w */  int nr_v;  int prev;  double seq_normalizer, state_normalizer;    /* Allocate memory for matrix  + some initial setup:   * Note 1: viterbi probability matrix has the sequence indices vertically   * and the vertex indices horizontally meaning it will be filled row by row   * Note 2: *ret_viterbi_mtxpp and *ret_scale_fspp are allocated here, but must be   * freed by caller */  nr_v = hmmp->nr_v;  if(use_lead_columns == YES) {    seq_len = msa_seq_infop->nr_lead_columns;  }  else {    seq_len = msa_seq_infop->msa_seq_length;  }  *ret_viterbi_mtxpp = (struct viterbi_s*)(malloc_or_die((seq_len+2) *							 hmmp->nr_v *							 sizeof(struct viterbi_s)));  init_viterbi_s_mtx(*ret_viterbi_mtxpp, DEFAULT, (seq_len+2) * hmmp->nr_v);  viterbi_mtxp = *ret_viterbi_mtxpp;    /* Initialize the first row of the matrix */  viterbi_mtxp->prob = 0.0; /* sets index (0,0) to 0.0 (i.e. prob 1.0),			       the rest are already DEFAULT as they should be */    /* Fill in middle rows */  prev_rowp = viterbi_mtxp;  cur_rowp = viterbi_mtxp + hmmp->nr_v;  j = 0;  if(use_lead_columns == YES) {    c = *(msa_seq_infop->lead_columns_start + j);  }  else {    c = 0;  }   while(c != END && c < msa_seq_infop->msa_seq_length)  {    a_index_2 = -1;    a_index_3 = -1;    a_index_4 = -1;    if(hmmp->alphabet_type == DISCRETE) {      a_index = get_alphabet_index((msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->query_letter, hmmp->alphabet, hmmp->a_size);      if(a_index < 0) {	a_index = hmmp->a_size; /* if letter is wild card, use default column in subst matrix */      }    }    if(hmmp->nr_alphabets > 1) {      if(hmmp->alphabet_type_2 == DISCRETE) {	a_index_2 = get_alphabet_index((msa_seq_infop->msa_seq_2 + (c * (hmmp->a_size_2+1)))->query_letter,				       hmmp->alphabet_2, hmmp->a_size_2);	if(a_index_2 < 0) {	  a_index_2 = hmmp->a_size_2; /* if letter is wild card, use default column in subst matrix */	}      }    }    if(hmmp->nr_alphabets > 2) {      if(hmmp->alphabet_type_3 == DISCRETE) {	a_index_3 = get_alphabet_index((msa_seq_infop->msa_seq_3 + (c * (hmmp->a_size_3+1)))->query_letter,				       hmmp->alphabet_3, hmmp->a_size_3);	if(a_index_3 < 0) {	  a_index_3 = hmmp->a_size_3; /* if letter is wild card, use default column in subst matrix */	}      }    }    if(hmmp->nr_alphabets > 3) {      if(hmmp->alphabet_type_4 == DISCRETE) {	a_index_4 = get_alphabet_index((msa_seq_infop->msa_seq_4 + (c * (hmmp->a_size_4+1)))->query_letter,				       hmmp->alphabet_4, hmmp->a_size_4);	if(a_index_4 < 0) {	  a_index_4 = hmmp->a_size_4; /* if letter is wild card, use default column in subst matrix */	}      }    }        /* calculate sum of probabilities */#ifdef DEBUG_VI    printf("label nr %d for pos %d = %c\n", j, c, (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label);#endif        for(v = 1; v < hmmp->nr_v - 1; v++) /* v = to-vertex */ {#ifdef DEBUG_VI      printf("prob to vertex %d:\n", v);#endif      prev = NO_PREV;      max = DEFAULT;      wp = *(hmmp->tot_from_trans_array + v);      res = 0;      while((w = wp->vertex) != END) /* w = from-vertex */ {	/* calculate intermediate results */	from_vp = wp;	t_res1 = (prev_rowp + w)->prob; /* probability of having produced the					 * sequence up to the last letter ending					 * in vertex w */	/* calculate intermediate results */	t_res2 = *(hmmp->max_log_transitions + (w * hmmp->nr_v + v));	if(t_res1 != DEFAULT && t_res2 != DEFAULT) {	  res = t_res1 + t_res2;	}	else {	  res = DEFAULT;	}		if(prev == NO_PREV && res != DEFAULT &&	   (use_labels == NO || (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == *(hmmp->vertex_labels + v) ||	    (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == '.')) {	  max = res;	  prev = from_vp->vertex;	  prevp = from_vp;	  continue;	}	  	if(res > max && res != DEFAULT &&	   (use_labels == NO || (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == *(hmmp->vertex_labels + v) ||	    (msa_seq_infop->msa_seq_1 + (c * (hmmp->a_size+1)))->label == '.')) {	  max = res;	  prev = from_vp->vertex;	  prevp = from_vp;	  }	wp++;#ifdef DEBUG_VI	printf("prev in pos(%d) = %f: ",from_vp->vertex, t_res1);	printf("trans from %d to %d = %f\n", from_vp->vertex, v, t_res2);	printf("tot score = %f\n", res);#endif      }      #ifdef DEBUG_VI      printf("max before setting: %f\n", max);#endif            /* calculate the prob of producing letters l in v*/      t_res3 = get_msa_emission_score_multi(msa_seq_infop, hmmp, c, v, use_labels, normalize, a_index, a_index_2,					    a_index_3, a_index_4, scoring_method, use_gap_shares,					    use_prior_shares, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3,					    aa_freqs_4);      if(t_res3 == 0.0) {	max = DEFAULT;      }      else {	t_res3 = log10(t_res3);	if(max != DEFAULT) {	  max = max + t_res3;	}      }          /* save result in matrices */      (cur_rowp + v)->prob = max;      (cur_rowp + v)->prev = prev;      (cur_rowp + v)->prevp = prevp;#ifdef DEBUG_VI      printf("max after setting: %f\n", max);#endif    }         /* move row pointers one row viterbi */    prev_rowp = cur_rowp;    cur_rowp = cur_rowp + hmmp->nr_v;    /* update current column */    if(use_lead_columns == YES) {      j++;      c = *(msa_seq_infop->lead_columns_start + j);    }    else {      c++;    }  }    /* Fill in transition to end state */#ifdef DEBUG_VI  printf("\ntransition to end state:\n");#endif  max = DEFAULT;  prev = NO_PREV;  prevp = NULL;  wp = *(hmmp->from_trans_array + hmmp->nr_v-1);  while(wp->vertex != END) /* w = from-vertex */ {    from_vp = wp;    t_res1 = (prev_rowp + wp->vertex)->prob; /* probability of having produced the					      * sequence up to the last letter ending					      * in vertex w */    t_res2 = 0.0;    while(wp->next != NULL) {      t_res2 = t_res2 + *((hmmp->log_transitions) +			  get_mtx_index(wp->vertex, (wp+1)->vertex, hmmp->nr_v));      wp++;    }    t_res2 = t_res2 + *((hmmp->log_transitions) +			get_mtx_index(wp->vertex, hmmp->nr_v - 1, hmmp->nr_v));    #ifdef DEBUG_VI    printf("prev = %f: ", t_res1);    printf("trans = %f\n", t_res2);#endif    if(t_res1 != DEFAULT && t_res2 != DEFAULT) {      res = t_res1 + t_res2;    }    else {      res = DEFAULT;    }    if(prev == NO_PREV && res != DEFAULT) {      prev = from_vp->vertex;      prevp = from_vp;      max = res;      continue;    }    if(res > max && res != DEFAULT) {      prev = from_vp->vertex;      prevp = from_vp;      max = res;    }    wp++;  }#ifdef DEBUG_VI  printf("res = %f\n", max);#endif  if(max != DEFAULT) {    (cur_rowp + hmmp->nr_v-1)->prob = max;    (cur_rowp + hmmp->nr_v-1)->prev = prev;    (cur_rowp + hmmp->nr_v-1)->prevp = prevp;  }  else {#ifdef DEBUG_VI    dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions);    dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->log_transitions);#endif        printf("This msa cannot be produced by hmm\n");    printf("inne\n");    return NOPROB;  }  #ifdef PATH  //dump_viterbi_matrix(seq_len + 2, hmmp->nr_v, viterbi_mtxp);  //dump_viterbi_path((cur_rowp + w)->prevp);  printf("normalized log likelihood for most probable path = %f\n",	 0.0 - (((cur_rowp + hmmp->nr_v - 1)->prob) / seq_len));  printf("and most probable path is: ");  dump_viterbi_path((cur_rowp + hmmp->nr_v - 1), hmmp, viterbi_mtxp, seq_len + 1, hmmp->nr_v);  printf("%d\n",hmmp->nr_v - 1);  printf("log prob = %f\n", (cur_rowp + hmmp->nr_v-1)->prob);  printf("real prob = %f\n", pow(10,(cur_rowp + hmmp->nr_v-1)->prob));  dump_viterbi_label_path((cur_rowp + hmmp->nr_v - 1), hmmp, viterbi_mtxp, seq_len + 1, hmmp->nr_v);  printf("\n");#endif  /* Garbage collection and return */  return OK;}/************************* the msa_one_best algorithm **********************************/int msa_one_best_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int use_lead_columns,		       int use_gap_shares, int use_prior_shares, struct one_best_s **ret_one_best_mtxpp, double **ret_scale_fspp,		       int use_labels, char *best_labeling, int normalize, int scoring_method, int multi_scoring_method,		       double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){  struct one_best_s *one_best_mtxp, *cur_rowp, *prev_rowp; /* pointers to one_best matrix */  double *scale_fsp; /* pointer to array of scaling factors */  int seq_len; /* length of the sequence */  int c, v, w; /* looping indices, c loops over the sequence,		  v and w over the vertices in the HMM */  int i,j; /* general loop indices */  int a_index, a_index_2, a_index_3, a_index_4; /* holds the current letter's position in the alphabet */  double row_sum, res, t_res1, t_res2, t_res3; /* temporary variables to calculate probabilities */  struct path_element *wp; /* for traversing the paths from v to w */  int nr_v;  double scaled_result;  int *sorted_v_list;  int v_list_index;    /* Allocate memory for matrix and scaling factors + some initial setup:   * Note 1: one_best probability matrix has the sequence indices vertically   * and the vertex indices horizontally meaning it will be filled row by row   * Note 2: *ret_one_best_mtxpp and *ret_scale_fspp are allocated here, but must be   * freed by caller */  nr_v = hmmp->nr_v;  if(use_lead_columns == YES) {    seq_len = msa_seq_infop->nr_lead_columns;  }  else {    seq_len = msa_seq_infop->msa_seq_length;  }  *ret_one_best_mtxpp = (struct one_best_s*)(malloc_or_die((seq_len+2) *						      hmmp->nr_v *						      sizeof(struct one_best_s)));  one_best_mtxp = *ret_one_best_mtxpp;  *ret_scale_fspp = (double*)(malloc_or_die((seq_len+2) * sizeof(double)));  scale_fsp = *ret_scale_fspp;  sorted_v_list = (int*)(malloc_or_die((hmmp->nr_v * 2 + 1) * sizeof(int)));    /* Initialize the first row of the matrix */  one_best_mtxp->prob = 1.0; /* sets index (0,0) to 1.0,			    the rest are already 0.0 as they should be */  one_best_mtxp->labeling = (char*)(malloc_or_die(1 * sizeof(char)));  for(i = 1; i < hmmp->nr_v; i++) {    (one_best_mtxp+i)->labeling = NULL;  }  *scale_fsp = 1.0;    /* create initial sorted v-list*/  *(sorted_v_list) = 0; /* 0 is always the number of the start st
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -