📄 training_algorithms_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
	    /* update loop index, check if we are done */	    i++;	    if(use_lead_columns == NO) {	      if(i >= seq_len) {		break;	      }	    }	    else {	      if(*(msa_seq_infop->lead_columns_start + i) == END) {		break;	      }	    }	  }	}      }            /* some garbage collection */      free(forw_mtx);      free(backw_mtx);      free(forw_scale);      msa_seq_infop++;    }	    if(verbose == YES) {      printf("log likelihood rd %d: %f\n", iteration, new_log_likelihood);    }         #ifdef DEBUG_BW2    dump_T_matrix(hmmp->nr_v, hmmp->nr_v, T);    dump_E_matrix(hmmp->nr_v, hmmp->a_size, E);    if(hmmp->nr_alphabets > 1) {      dump_E_matrix(hmmp->nr_v, hmmp->a_size_2 + 1, E_2);    }#endif    /* check if likelihood change is small enough, then we are done */    if(fabs(new_log_likelihood - old_log_likelihood) < INNER_BW_THRESHOLD && annealing_status == DONE) {      break;    }        /* if simulated annealing is used, scramble results in E and T matrices */    if(annealing == YES && temperature > ANNEAL_THRESHOLD) {      anneal_E_matrix_multi(temperature, E, hmmp, 1);      if(hmmp->nr_alphabets > 1) {	anneal_E_matrix_multi(temperature, E_2, hmmp, 2);      }      if(hmmp->nr_alphabets > 2) {	anneal_E_matrix_multi(temperature, E_3, hmmp, 3);      }      if(hmmp->nr_alphabets > 3) {	anneal_E_matrix_multi(temperature, E_4, hmmp, 4);      }      anneal_T_matrix_multi(temperature, T, hmmp);      temperature = temperature * cooling_factor;    }    if(temperature < ANNEAL_THRESHOLD) {      annealing_status = DONE;    }    /* recalculate emission expectations according to distribution groups      * by simply taking the mean of the expected emissions within this group     * for each letter in the alphabet and replacing each expectation for the     * letter with this value for every member of the distribution group */    recalculate_emiss_expectations_multi(hmmp, E, 1);    if(hmmp->nr_alphabets > 1) {      recalculate_emiss_expectations_multi(hmmp, E_2, 2);    }    if(hmmp->nr_alphabets > 2) {      recalculate_emiss_expectations_multi(hmmp, E_3, 3);    }    if(hmmp->nr_alphabets > 3) {      recalculate_emiss_expectations_multi(hmmp, E_4, 4);    }        /* recalculate transition expectations for tied transitions according     * to the same scheme as for emission distribution groups */    recalculate_trans_expectations_multi(hmmp, T);    for(k = 0; k < hmmp->nr_v-1; k++) /* k = from-vertex */ {      /* update transition matrix */      if(use_transition_pseudo_counts == YES) {	update_trans_mtx_pseudocount_multi(hmmp, T, k);      }      else {	update_trans_mtx_std_multi(hmmp, T, k);      }            #ifdef DEBUG_PRIORS      printf("Starting emission matrix update\n");#endif            /* update emission matrix using Dirichlet prior files if they exist*/      priorp = *(hmmp->ed_ps + k);      if(priorp != NULL && use_prior == YES && hmmp->alphabet_type == DISCRETE) {#ifdef DEBUG_PRIORS		printf("k = %d\n", k);	printf("value = %x\n", priorp);#endif	update_emiss_mtx_prior_multi(hmmp, E, k, priorp, 1);      }      else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type == DISCRETE)	/* update emissions matrix "normally" when dirichlet file is missing */ {	update_emiss_mtx_pseudocount_multi(hmmp, E, k, 1);      }      else if(hmmp->alphabet_type == DISCRETE) {	update_emiss_mtx_std_multi(hmmp, E, k, 1);      }      else {	update_emiss_mtx_std_continuous_multi(hmmp, E, k, 1);      }            if(hmmp->nr_alphabets > 1) {	priorp = *(hmmp->ed_ps_2 + k);	if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_2 == DISCRETE) {	  update_emiss_mtx_prior_multi(hmmp, E_2, k, priorp, 2);	}	else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_2 == DISCRETE)	  /* update emissions matrix "normally" when dirichlet file is missing */ {	  update_emiss_mtx_pseudocount_multi(hmmp, E_2, k, 2);	}	else if(hmmp->alphabet_type_2 == DISCRETE) {	  update_emiss_mtx_std_multi(hmmp, E_2, k, 2);	}	else {	  update_emiss_mtx_std_continuous_multi(hmmp, E_2, k, 2);	}      }      if(hmmp->nr_alphabets > 2) {	priorp = *(hmmp->ed_ps_3 + k);	if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_3 == DISCRETE) {	  update_emiss_mtx_prior_multi(hmmp, E_3, k, priorp, 3);	}	else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_3 == DISCRETE)	  /* update emissions matrix "normally" when dirichlet file is missing */ {	  update_emiss_mtx_pseudocount_multi(hmmp, E_3, k, 3);	}	else if(hmmp->alphabet_type_3 == DISCRETE) {	  update_emiss_mtx_std_multi(hmmp, E_3, k, 3);	}	else {	  update_emiss_mtx_std_continuous_multi(hmmp, E_3, k, 3);	}      }      if(hmmp->nr_alphabets > 3) {	priorp = *(hmmp->ed_ps_4 + k);	if(priorp != NULL && use_prior == YES && hmmp->alphabet_type_4 == DISCRETE) {	  update_emiss_mtx_prior_multi(hmmp, E_4, k, priorp, 4);	}	else if(use_emission_pseudo_counts == YES && hmmp->alphabet_type_4 == DISCRETE) 	  /* update emissions matrix "normally" when dirichlet file is missing */ {	  update_emiss_mtx_pseudocount_multi(hmmp, E_4, k, 4);	}	else if(hmmp->alphabet_type_4 == DISCRETE) {	  update_emiss_mtx_std_multi(hmmp, E_4, k, 4);	}	else {	  update_emiss_mtx_std_continuous_multi(hmmp, E_4, k, 4);	}      }    }    #ifdef DEBUG_BW2    dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions);    dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);    if(hmmp->nr_alphabets > 1) {      dump_emiss_matrix(hmmp->nr_v, hmmp->a_size_2, hmmp->emissions_2);    }#endif            /* some garbage collection */    free(E);    if(hmmp->nr_alphabets > 1) {      free(E_2);    }    if(hmmp->nr_alphabets > 2) {      free(E_3);    }    if(hmmp->nr_alphabets > 3) {      free(E_4);    }    free(T);    max_nr_iterations--;    iteration++;#ifdef DEBIG_BW2    printf("end of baum-welch-loop\n");#endif  }  while(max_nr_iterations > 0); /* break condition is also when log_likelihood_difference is				 * smaller than THRESHOLD, checked inside the loop for				 * better efficiency */    #ifdef DEBUG_BW  dump_trans_matrix(hmmp->nr_v, hmmp->nr_v, hmmp->transitions);  dump_emiss_matrix(hmmp->nr_v, hmmp->a_size, hmmp->emissions);#endif    }/* implementation of the conditional maximum likelihood version of the * baum-welch training algorithm using dirichlet prior mixture to * calculate update of emission (and transition) matrices */void extended_baum_welch_dirichlet_multi(struct hmm_multi_s *hmmp, struct sequence_multi_s *seqsp, int nr_seqs,					 int annealing, int use_labels,					 int use_transition_pseudo_counts, int use_emission_pseudo_counts,					 int multi_scoring_method, int use_prior){  double *T, *E, *E_2, *E_3, *E_4; /* matrices for the estimated number of times		 * each transition (T) and emission (E) is used */  double *T_lab, *E_lab, *E_lab_2, *E_lab_3, *E_lab_4, *T_ulab, *E_ulab, *E_ulab_2, *E_ulab_3, *E_ulab_4;  struct forward_s *forw_mtx; /* forward matrix */  struct backward_s *backw_mtx; /* backward matrix */  double *forw_scale; /* scaling array */  int s,p,k,l,a,d; /* loop counters, s loops over the sequences, p over the			* positions in the sequence, k and l over states, a over the alphabet			* and d over the distribution groups */  struct path_element *lp;  double t_res, t_res_1, t_res_2, t_res_3; /* for temporary results */  double t_res_4, t_res_5, t_res_6; /* for temporary results */  double e_res, e_res_1, e_res_2, e_res_3; /* for temporary results */  double t_res_ulab;  int seq_len; /* length of the seqences */  int a_index, a_index_2, a_index_3, a_index_4; /* holds current letters index in the alphabet */  struct letter_s *seq, *seq_2, *seq_3, *seq_4; /* pointer to current sequence */  double old_log_likelihood_lab, new_log_likelihood_lab;  double old_log_likelihood_ulab, new_log_likelihood_ulab; /* to calculate when to stop */  double likelihood; /* temporary variable for calculating likelihood of a sequence */  int max_nr_iterations, iteration;  /* dirichlet prior variables */  struct emission_dirichlet_s *priorp;  /* simulated annealing variables */  double temperature;  double cooling_factor;  int annealing_status;    /* some initialization */  old_log_likelihood_lab = 9999.0;  new_log_likelihood_lab = 9999.0;  old_log_likelihood_ulab = 9999.0;  new_log_likelihood_ulab = 9999.0;  max_nr_iterations = 70;  iteration = 1;  if(annealing == YES) {    temperature = INIT_TEMP;    cooling_factor = INIT_COOL;    annealing_status = ACTIVE;  }  else {    annealing_status = DONE;  }    do {    /* allocate per iteration matrices */    T = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double)));    E = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double)));    T_lab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double)));    E_lab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double)));    T_ulab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->nr_v * sizeof(double)));    E_ulab = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size * sizeof(double)));     if(hmmp->nr_alphabets > 1) {      E_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double)));      E_lab_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double)));      E_ulab_2 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_2 * sizeof(double)));    }    if(hmmp->nr_alphabets > 2) {      E_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double)));      E_lab_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double)));      E_ulab_3 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_3 * sizeof(double)));    }    if(hmmp->nr_alphabets > 3) {      E_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double)));      E_lab_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double)));      E_ulab_4 = (double*)(malloc_or_die(hmmp->nr_v * hmmp->a_size_4 * sizeof(double)));    }    old_log_likelihood_ulab = new_log_likelihood_ulab;    new_log_likelihood_ulab = 0.0;    old_log_likelihood_lab = new_log_likelihood_lab;    new_log_likelihood_lab = 0.0;    for(s = 0; s < nr_seqs; s++) {      /* Convert sequence to 1...L for easier indexing */      seq_len = (seqsp + s)->length;      seq = (struct letter_s*) (malloc_or_die((seq_len + 2) * sizeof(struct letter_s)));      memcpy(seq+1, (seqsp + s)->seq_1, seq_len * sizeof(struct letter_s));      if(hmmp->nr_alphabets > 1) {	seq_2 = (struct letter_s*) (malloc_or_die((seq_len + 2) * sizeof(struct letter_s)));	memcpy(seq_2+1, (seqsp + s)->seq_2, seq_len * sizeof(struct letter_s));      }      if(hmmp->nr_alphabets > 2) {	seq_3 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s)));	memcpy(seq_3+1, (seqsp + s)->seq_3, seq_len * sizeof(struct letter_s));      }      if(hmmp->nr_alphabets > 3) {	seq_4 = (struct letter_s*) malloc_or_die(((seq_len + 2) * sizeof(struct letter_s)));	memcpy(seq_4+1, (seqsp + s)->seq_4, seq_len * sizeof(struct letter_s));      }            /* calculate forward and backward matrices */      forward_multi(hmmp, (seqsp + s)->seq_1,(seqsp + s)->seq_2, (seqsp + s)->seq_3, (seqsp + s)->seq_4,		    &forw_mtx, &forw_scale, NO, multi_scoring_method);      backward_multi(hmmp, (seqsp + s)->seq_1, (seqsp + s)->seq_2, (seqsp + s)->seq_3, (seqsp + s)->seq_4,		     &backw_mtx, forw_scale, NO, multi_scoring_method);            /* memory for forw_mtx, scale_mtx and       * backw_mtx is allocated in the functions */            /* update new_log_likelihood */      likelihood = log10((forw_mtx +			  get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob);      for(k = 0; k <= seq_len; k++) {	likelihood = likelihood + log10(*(forw_scale + k));      }#ifdef DEBUG_BW      dump_scaling_array(k-1,forw_scale);      printf("likelihood = %f\n", likelihood);#endif            new_log_likelihood_ulab += likelihood;            for(k = 0; k < hmmp->nr_v-1; k++) /* k = from vertex */ {	lp = *(hmmp->to_trans_array + k);	while(lp->vertex != END) /* l = to-vertex */ {	  for(p = 1; p <= seq_len; p++) {	    	    /* get alphabet index for c*/	    a_index = get_alphabet_index(&seq[p], hmmp->alphabet, hmmp->a_size);	    if(hmmp->nr_alphabets > 1) {	      a_index_2 = get_alphabet_index(&seq_2[p], hmmp->alphabet_2, hmmp->a_size_2);	    }	    if(hmmp->nr_alphabets > 2) {	      a_index_3 = get_alphabet_index(&seq_3[p], hmmp->alphabet_3, hmmp->a_size_3);	    }	    if(hmmp->nr_alphabets > 3) {	      a_index_4 = get_alphabet_index(&seq_4[p], hmmp->alphabet_4, hmmp->a_size_4);	    } 	    	    /* add T[k][l] contribution for this sequence */	    add_Tkl_contribution_multi(hmmp, seq+1, seq_2+1, seq_3+1, seq_4+1, forw_mtx, backw_mtx,				       forw_scale, p, k, lp, a_index, a_index_2, a_index_3, a_index_4, T_ulab, NO,				       multi_scoring_method);	  }	  /* move on to next path */	  while(lp->next != NULL) {	    lp++;	  }	  lp++;	}	/* calculate E[k][a] contribution from this sequence */	if(silent_state_multi(k, hmmp) != 0) {	  for(p = 1; p <= seq_len; p++) {	    a_index = get_alphabet_index(&seq[p], hmmp->alphabet, hmmp->a_size);	    if(hmmp->nr_alphabets > 1) {	      a_index_2 = get_alphabet_index(&seq_2[p], hmmp->alphabet_2, hmmp->a_size_2);	    }	    if(hmmp->nr_alphabets > 2) {
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -