📄 training_algorithms_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
      #ifdef DEBUG_PRIORS      printf("Starting emission matrix update\n");#endif            /* update emission matrix using Dirichlet prior files if they exist*/      priorp = *(hmmp->ed_ps + k);      if(priorp != NULL && use_prior == YES && hmmp->alphabet_type == DISCRETE) {#ifdef DEBUG_PRIORS	printf("k = %d\n", k);	printf("value = %x\n", priorp);#endif	update_emiss_mtx_prior_multi(hmmp, E, k, priorp, 1);      }      else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type == DISCRETE)	/* update emissions matrix "normally" when dirichlet file is missing */ {	update_emiss_mtx_pseudocount_multi(hmmp, E, k, 1);      }      else if(hmmp->alphabet_type == DISCRETE) {	update_emiss_mtx_std_multi(hmmp, E, k, 1);      }      else {	update_emiss_mtx_std_continuous_multi(hmmp, E, k, 1);      }            if(hmmp->nr_alphabets > 1) {	priorp = *(hmmp->ed_ps_2 + k);	if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_2 == DISCRETE) {	  update_emiss_mtx_prior_multi(hmmp, E_2, k, priorp, 2);	}	else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_2 == DISCRETE)	  /* update emissions matrix "normally" when dirichlet file is missing */ {	  update_emiss_mtx_pseudocount_multi(hmmp, E_2, k, 2);	}	else if(hmmp->alphabet_type_2 == DISCRETE) {	  update_emiss_mtx_std_multi(hmmp, E_2, k, 2);	}	else {	  update_emiss_mtx_std_continuous_multi(hmmp, E_2, k, 2);	}      }      if(hmmp->nr_alphabets > 2) {	priorp = *(hmmp->ed_ps_3 + k);	if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_3 == DISCRETE) {	  update_emiss_mtx_prior_multi(hmmp, E_3, k, priorp, 3);	}	else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_3 == DISCRETE)	  /* update emissions matrix "normally" when dirichlet file is missing */ {	  update_emiss_mtx_pseudocount_multi(hmmp, E_3, k, 3);	}	else if(hmmp->alphabet_type_3 == DISCRETE) {	  update_emiss_mtx_std_multi(hmmp, E_3, k, 3);	}	else {	  update_emiss_mtx_std_continuous_multi(hmmp, E_3, k, 3);	}      }      if(hmmp->nr_alphabets > 3) {	priorp = *(hmmp->ed_ps_4 + k);	if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_4 == DISCRETE) {	  update_emiss_mtx_prior_multi(hmmp, E_4, k, priorp, 4);	}	else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_4 == DISCRETE) 	  /* update emissions matrix "normally" when dirichlet file is missing */ {	  update_emiss_mtx_pseudocount_multi(hmmp, E_4, k, 4);	}	else if(hmmp->alphabet_type_4 == DISCRETE) {	  update_emiss_mtx_std_multi(hmmp, E_4, k, 4);	}	else {	  update_emiss_mtx_std_continuous_multi(hmmp, E_4, k, 4);	}      }    }   #ifdef DEBUG_BW    dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions);    dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);#endif            /* some garbage collection */    free(E);    if(hmmp->nr_alphabets > 1) {      free(E_2);    }    if(hmmp->nr_alphabets > 2) {      free(E_3);    }    if(hmmp->nr_alphabets > 3) {      free(E_4);    }    free(T);    max_nr_iterations--;    iteration++;  }  while(max_nr_iterations > 0); /* break condition is also when log_likelihood_difference is				 * smaller than THRESHOLD, checked inside the loop for				 * better efficiency */    #ifdef DEBUG_BW  dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions);  dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);#endif      }/* implementation of the baum-welch training algorithm using dirichlet prior mixture to * calculate update of emission (and transition) matrices and using a multiple sequence * alignment as the training sequence */void msa_baum_welch_dirichlet_multi(struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int nr_seqs,				    int annealing, int use_gap_shares, int use_lead_columns, int use_labels,				    int use_transition_pseudo_counts, int use_emission_pseudo_counts, int normalize,				    int scoring_method, int use_nr_occ, int multi_scoring_method, double *aa_freqs,				    double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4, int use_prior){  struct msa_sequences_multi_s *msa_seq_infop_start;  double *T, *E, *E_2, *E_3, *E_4; /* matrices for the estimated number of times				    * each transition (T) and emission (E) is used */  struct forward_s *forw_mtx; /* forward matrix */  struct backward_s *backw_mtx; /* backward matrix */  double *forw_scale; /* scaling array */  int s,p,k,l,a,d,i; /* loop counters, s loops over the sequences, p over the		      * positions in the sequence, k and l over states, a over the alphabet,		      * d over the distribution groups and i is a slush variable  */  struct path_element *lp;  double t_res, t_res_1, t_res_2, t_res_3; /* for temporary results */  double t_res_4, t_res_5, t_res_6; /* for temporary results */  double e_res, e_res_1, e_res_2, e_res_3; /* for temporary results */  int seq_len; /* length of the sequences */  int a_index; /* holds current letters index in the alphabet */  double old_log_likelihood, new_log_likelihood; /* to calculate when to stop */  double likelihood; /* temporary variable for calculating likelihood of a sequence */  int max_nr_iterations, iteration;  double Eka_base;  int query_index; /* index of query seq */  /* dirichlet prior variables */  struct emission_dirichlet_s *priorp;  struct emission_dirichlet_s *priorp_2;  struct emission_dirichlet_s *priorp_3;  struct emission_dirichlet_s *priorp_4;  /* simulated annealing varialbles */  double temperature;  double cooling_factor;  int annealing_status;  /* help variables for add_to_E */  int alphabet_nr;  int alphabet;  int a_size;  double *E_cur;  double *subst_mtx;  struct msa_letter_s *msa_seq;  double *tmp_emissions;  int alphabet_type;  /* remember start of sequences */  msa_seq_infop_start = msa_seq_infop;   old_log_likelihood = 9999.0;  new_log_likelihood = 9999.0;  max_nr_iterations = 20;  iteration = 1;  if(annealing == YES) {    temperature = INIT_TEMP;    cooling_factor = INIT_COOL;    annealing_status = ACTIVE;  }  else {    annealing_status = DONE;  }  #ifdef DEBUG_BW2  check_for_corrupt_values(hmmp->nr_v, hmmp->a_size, hmmp->emissions , "emiss");  check_for_corrupt_values(hmmp->nr_v, hmmp->nr_v, hmmp->transitions , "trans");#endif    do {#ifdef DEBUG_BW2    printf("starting baum-welch loop\n");#endif    /* initialize matrices */    T = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * 			       sizeof(double)));    if(hmmp->alphabet_type == DISCRETE) {      E = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * 				  sizeof(double)));    }    else {      E = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size + 1) * 				  sizeof(double)));    }    if(hmmp->nr_alphabets > 1) {      if(hmmp->alphabet_type_2 == DISCRETE) {	E_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * 				      sizeof(double)));      }      else {	E_2 = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size_2 + 1) * 				      sizeof(double)));      }    }    if(hmmp->nr_alphabets > 2) {      if(hmmp->alphabet_type_3 == DISCRETE) {	E_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * 				      sizeof(double)));      }      else {	E_3 = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size_3 + 1) * 				      sizeof(double)));      }    }    if(hmmp->nr_alphabets > 3) {      if(hmmp->alphabet_type_4 == DISCRETE) {	E_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * 				      sizeof(double)));      }      else {	E_4 = (double*)(malloc_or_die(hmmp->nr_v * (hmmp->a_size_4 + 1) * 				      sizeof(double)));      }    }    /* reset sequence pointer */    msa_seq_infop = msa_seq_infop_start;        old_log_likelihood = new_log_likelihood;    new_log_likelihood = 0.0;        for(s = 0; s < nr_seqs; s++) {      if(use_lead_columns == YES) {	seq_len = msa_seq_infop->nr_lead_columns;      }      else {	seq_len = msa_seq_infop->msa_seq_length;      }            /* calculate forward and backward matrices       * memory for forw_mtx, scale_mtx and       * backw_mtx is allocated in the functions */#ifdef DEBUG_BW2      printf("running forward for seq %d\n", s + 1);#endif      msa_forward_multi(hmmp, msa_seq_infop, use_lead_columns, use_gap_shares, NO, &forw_mtx, &forw_scale, use_labels, normalize,			scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4);#ifdef DEBUG_BW2      dump_forward_matrix(seq_len + 2, hmmp->nr_v, forw_mtx);      printf("running backward\n");#endif      msa_backward_multi(hmmp, msa_seq_infop, use_lead_columns, use_gap_shares, &backw_mtx, forw_scale, use_labels, normalize,			 scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4);#ifdef DEBUG_BW2      check_for_corrupt_values(seq_len + 2, hmmp->nr_v, forw_mtx , "F");      check_for_corrupt_values(seq_len + 2, hmmp->nr_v, backw_mtx , "B");      printf("done with backward\n");#endif      /* update new_log_likelihood */      likelihood = log10((forw_mtx +			  get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob);      for(k = 0; k <= seq_len; k++) {	likelihood = likelihood + log10(*(forw_scale + k));      }#ifdef DEBUG_BW2      dump_scaling_array(k-1,forw_scale);      printf("likelihood = %f\n", likelihood);#endif            new_log_likelihood += likelihood;            for(k = 0; k < hmmp->nr_v-1; k++) /* k = from vertex */ {	lp = *(hmmp->to_trans_array + k);	while(lp->vertex != END) /* l = to-vertex */ {	  i = 0;	  while(1) {	    if(use_lead_columns == NO) {	      p = i;	    }	    else {	      p = *(msa_seq_infop->lead_columns_start + i);	    }	    /* add T[k][l] contribution for the msa-sequence */	    add_Tkl_contribution_msa_multi(hmmp, msa_seq_infop, forw_mtx, backw_mtx,					   forw_scale, p, k, lp, T, use_gap_shares, use_lead_columns, i, use_labels, scoring_method,					   normalize, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4);	    i++;	    if(use_lead_columns == NO) {	      if(i >= seq_len) {		break;	      }	    }	    else {	      if(*(msa_seq_infop->lead_columns_start + i) == END) {		break;	      }	    }	  }	  /* move on to next path */	  while(lp->next != NULL) {	    lp++;	  }	  lp++;	}		/* calculate E[k][a] contribution from this sequence */	if(silent_state_multi(k, hmmp) != 0) {	  i = 0;	  while(1) {	    	    /* get correct index for this letter column */	    if(use_lead_columns == NO) {	      p = i;	    }	    else {	      p = *(msa_seq_infop->lead_columns_start + i);	    }	    /* get basic scoring result */	    Eka_base = add_Eka_contribution_msa_multi(hmmp, msa_seq_infop, forw_mtx, backw_mtx, p, k,						      i, use_lead_columns);	    	    /* loop over the alphabets */	    for(alphabet_nr = 1; alphabet_nr <= hmmp->nr_alphabets; alphabet_nr++) {	      if(alphabet_nr == 1) {		alphabet = hmmp->alphabet;		subst_mtx = hmmp->subst_mtx;		a_size = hmmp->a_size;		E_cur = E;		msa_seq = msa_seq_infop->msa_seq_1;		alphabet_type = hmmp->alphabet_type;		tmp_emissions = hmmp->emissions;	      }	      else if(alphabet_nr == 2) {		alphabet = hmmp->alphabet_2;		subst_mtx = hmmp->subst_mtx_2;		a_size = hmmp->a_size_2;		E_cur = E_2;		msa_seq = msa_seq_infop->msa_seq_2;		alphabet_type = hmmp->alphabet_type_2;		tmp_emissions = hmmp->emissions_2;	      }	      else if(alphabet_nr == 3) {		alphabet = hmmp->alphabet_3;		subst_mtx = hmmp->subst_mtx_3;		a_size = hmmp->a_size_3;		E_cur = E_3;		msa_seq = msa_seq_infop->msa_seq_3;		alphabet_type = hmmp->alphabet_type_3;		tmp_emissions = hmmp->emissions_3;	      }	      else if(alphabet_nr == 4) {		alphabet = hmmp->alphabet_4;		subst_mtx = hmmp->subst_mtx_4;		a_size = hmmp->a_size_4;		E_cur = E_4;		msa_seq = msa_seq_infop->msa_seq_4;		alphabet_type = hmmp->alphabet_type_4;		tmp_emissions = hmmp->emissions_4;	      }	      	      /* get result and add to matrix according to scoring method */	      add_to_E_multi(E_cur, Eka_base, msa_seq, p, k, a_size, normalize, subst_mtx,			     alphabet, scoring_method, use_nr_occ, alphabet_type, tmp_emissions);	    }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -