⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 opt_prfhmm.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 2 页
字号:
  for(i = 0; i < msa_seq_info.msa_seq_length; i++) {    tot_nr_chars = tot_nr_chars + 2;    if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'M') {      nr_mtx_columns++;    }    if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'I') {      nr_insert_columns++;    }  }    row = (char*)(malloc_or_die(tot_nr_chars * sizeof(char)));  nr_mtx_columns++;  /* allocate memory for matrices, add one (pseudocount) to each post */  transitions = (int*)malloc_or_die(8 * nr_mtx_columns * sizeof(int));  insert_emissions = (double*)malloc_or_die(nr_mtx_columns * (hmm.a_size+1) * sizeof(double));  for(i = 1; i < 8; i++) {    for(j = 0; j < nr_mtx_columns; j++) {      *(transitions + get_mtx_index(i,j,nr_mtx_columns)) = 1;    }  }    /* store indices of match columns */  cur_msa_column = 1;  cur_mtx_column = 0;  for(i = 0; i < msa_seq_info.msa_seq_length; i++) {    if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'M') {      *(transitions + cur_mtx_column) = cur_msa_column;      cur_mtx_column++;      cur_msa_column++;    }    if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'I') {      cur_msa_column++;    }  }    /* for all rows, add counts to matrix */  rewind(seq_infile);  for(cur_alphabet = 1; cur_alphabet <= 1; cur_alphabet++) {    alphabet_done = NO;    while((fgets(row, tot_nr_chars, seq_infile)) != NULL && alphabet_done == NO) {      cur_msa_column = 1;      cur_mtx_column = 0;      cur_insert_column = 0;       last_sign = NEW;      cur_sign = NEW;      for(i = cur_alphabet;;i++) {	/* read symbol */	cur_mtx_col_nr = *(transitions + cur_mtx_column);	cur = row[i];	if(row[0] == '/') {	  break;	}	if(cur == '<') {	  alphabet_done = YES;	  break;	}	else if(cur == '>') {	  break;	}	else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') {	  last_sign = cur_sign;	  if(cur_msa_column == cur_mtx_col_nr) /* gap */{	    if(last_sign == ACID || last_sign == NEW) {	      cur_sign = GAP;	      *(transitions + get_mtx_index(2,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    else if(last_sign == GAP) {	      cur_sign = GAP;	      *(transitions + get_mtx_index(4,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    else {	      	    }	    if(is_gap == YES) {	      is_gap = NO;	      cur_insert_column++;	    }	    cur_msa_column++;	    cur_mtx_column++;	  }	  else if(cur_msa_column < cur_mtx_col_nr) /* insert gap*/{	    if(last_sign == ACID || last_sign == NEW) {	      cur_sign = ACID;	    }	    else if(last_sign == INSERT_ACID) {	      cur_sign = INSERT_ACID;	    }	    else if(last_sign == GAP) {	      cur_sign = GAP;	    }	    is_gap = YES;	    cur_msa_column++;	    	  }	}	else if(cur == ';') {	  	}	else /* ACID */{	  last_sign = cur_sign;	  if(cur_msa_column == cur_mtx_col_nr) /* match */{	    if(last_sign == ACID || last_sign == NEW) {	      cur_sign = ACID;	      *(transitions + get_mtx_index(1,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    else if(last_sign == GAP) {	      cur_sign = ACID;	      *(transitions + get_mtx_index(5,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    else if(last_sign == INSERT_ACID) {	      cur_sign = ACID;	      *(transitions + get_mtx_index(7,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    if(is_gap == YES) {	      is_gap = NO;	      cur_insert_column++;	    }	    cur_msa_column++;	    cur_mtx_column++;	  }	  else if(cur_msa_column < cur_mtx_col_nr) /* insert */{	    if(last_sign == ACID || last_sign == NEW) {	      cur_sign = INSERT_ACID;	      *(transitions + get_mtx_index(3,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    else if(last_sign == INSERT_ACID) {	      cur_sign = INSERT_ACID;	      *(transitions + get_mtx_index(6,cur_mtx_column,nr_mtx_columns)) += 1;	    }	    else if(last_sign == GAP) {	      cur_sign = INSERT_ACID;	    }	    is_gap = YES;	    cur_msa_column++;	  }	}      }    }  }    /* check that msa length and nr columns match */  if((nr_mtx_columns-1) * 3 + 6 != hmm.nr_v) {    printf("Warning: length of hmm is not equal to length of profile\n");  }  else {    /* start-transitions */    s_tot = *(transitions + get_mtx_index(1,0,nr_mtx_columns)) + *(transitions + get_mtx_index(2,0,nr_mtx_columns));    sd = *(transitions + get_mtx_index(2,0,nr_mtx_columns));    sm = *(transitions + get_mtx_index(1,0,nr_mtx_columns));    *(hmm.transitions + get_mtx_index(1, 2, hmm.nr_v)) = SI_PROB;    *(hmm.transitions + get_mtx_index(1, 3, hmm.nr_v)) = (double)sd / (double)s_tot *(1.0 - SI_PROB);    *(hmm.transitions + get_mtx_index(1, 4, hmm.nr_v)) = (double)sm / (double)s_tot *(1.0 - SI_PROB);    if(local == YES) {      tot_local_im_prob =  *(hmm.transitions + get_mtx_index(1, 3, hmm.nr_v)) + *(hmm.transitions + get_mtx_index(1, 4, hmm.nr_v));      local_im_prob = tot_local_im_prob / nr_mtx_columns;      local_me_prob = local_im_prob;      *(hmm.transitions + get_mtx_index(1, 3, hmm.nr_v)) = 0.0;      *(hmm.transitions + get_mtx_index(1, 4, hmm.nr_v)) = local_im_prob * 2;      for(i = 2; i < nr_mtx_columns; i++) {	*(hmm.transitions + get_mtx_index(1, 3*i + 1, hmm.nr_v)) = local_im_prob;      }    }        /* for all dd-trans and dm-trans, calculate values */    for(i = 1; i < nr_mtx_columns - 1; i++) {      d_tot = *(transitions + get_mtx_index(4,i,nr_mtx_columns)) + *(transitions + get_mtx_index(5,i,nr_mtx_columns));      dd = *(transitions + get_mtx_index(4,i,nr_mtx_columns));      dm = *(transitions + get_mtx_index(5,i,nr_mtx_columns));      *(hmm.transitions + get_mtx_index(3 * i, 3 * (i+1), hmm.nr_v)) = (double)dd / (double)d_tot;      *(hmm.transitions + get_mtx_index(3 * i, 3 * (i+1) + 1, hmm.nr_v)) = (double)dm / (double)d_tot;    }        /* for all mm-trans, calculate values */    /* for all md-trans, calculate values */    /* for all mi-trans, calculate values */    for(i = 1; i < nr_mtx_columns - 1; i++) {      m_tot = *(transitions + get_mtx_index(1,i,nr_mtx_columns)) + *(transitions + get_mtx_index(2,i,nr_mtx_columns)) +	*(transitions + get_mtx_index(3,i,nr_mtx_columns));      mm = *(transitions + get_mtx_index(1,i,nr_mtx_columns));      md = *(transitions + get_mtx_index(2,i,nr_mtx_columns));      mi = *(transitions + get_mtx_index(3,i,nr_mtx_columns));      //printf("mm = %d, md = %d, mi = %d\n", mm, md, mi);      *(hmm.transitions + get_mtx_index(3 * i + 1, 3 * (i+1) + 1, hmm.nr_v)) = (double)mm / (double)m_tot * (1.0 - local_me_prob);      *(hmm.transitions + get_mtx_index(3 * i + 1, 3 * (i+1), hmm.nr_v)) = (double)md / (double)m_tot * (1.0 - local_me_prob);      *(hmm.transitions + get_mtx_index(3 * i + 1, 3 * i + 2, hmm.nr_v)) = (double)mi / (double)m_tot * (1.0 - local_me_prob);      // if local, save small prob of exiting the model for all match states      if(local == YES) {	*(hmm.transitions + get_mtx_index(3 * i + 1, hmm.nr_v - 4, hmm.nr_v)) = local_me_prob;      }          }        /* for end insertions, set default values */    *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 2, (nr_mtx_columns - 1)*3 + 3, hmm.nr_v)) = EI_PROB;;    *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 2, (nr_mtx_columns - 1)*3 + 4, hmm.nr_v)) = 1.0 - EI_PROB;        /* for all ii-trans, calculate values     * for all im-trans, calculate values */    for(i = 1; i < nr_mtx_columns - 1; i++) {      i_tot = *(transitions + get_mtx_index(6,i,nr_mtx_columns)) + *(transitions + get_mtx_index(7,i,nr_mtx_columns));      ii = *(transitions + get_mtx_index(6,i,nr_mtx_columns));      im = *(transitions + get_mtx_index(7,i,nr_mtx_columns));      *(hmm.transitions + get_mtx_index(3 * i + 2, 3 * i + 2 , hmm.nr_v)) = (double)ii / (double)i_tot;      *(hmm.transitions + get_mtx_index(3 * i + 2, 3 * (i+1) + 1, hmm.nr_v)) = (double)im / (double)i_tot;    }        /* start and end insertion lengths */    transprob_ii = (double)(INSERT_SIZE) / (double)(INSERT_SIZE + 1);    *(hmm.transitions + get_mtx_index(2, 1, hmm.nr_v)) = 1.0 - transprob_ii;    *(hmm.transitions + get_mtx_index(2, 2, hmm.nr_v)) = transprob_ii;    *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 3, (nr_mtx_columns - 1)*3 + 2, hmm.nr_v)) = 1.0 - transprob_ii;    *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 3, (nr_mtx_columns - 1)*3 + 3, hmm.nr_v)) = transprob_ii;      }  /* cleanup and return */  free(transitions);  free(insert_emissions);  free(row);  return;}void label_msa_fast(){  int i;  for(i = 0; i < msa_seq_info.msa_seq_length; i++) {    if(*((msa_seq_info.gap_shares) + i) < FAST_MATCH_LIMIT) {      ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'M';    }    else {      ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'I';    }  }}void label_msa_query(FILE *seq_infile){  char cur;  int i;  rewind(seq_infile);    cur = (char)fgetc(seq_infile);  if(cur != '<') {      }  i = 0;  while(1) {    /* read symbol */    cur = (char)fgetc(seq_infile);    if(cur == '>') {      break;    }    else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') {      ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'I';    }    else {      ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'M';    }    i++;    /* read ';' */    cur = (char)fgetc(seq_infile);    if(cur != ';') {          }  }  rewind(seq_infile);}void henikoff_weighting(struct msa_sequences_multi_s *msa_seq_infop, FILE *seq_infile, struct hmm_multi_s *hmmp){  struct sequences_multi_s seq_info;  double *m, *m_2, *m_3, *m_4;  double *weights, *total_weights, *total_weights_1, *total_weights_2, *total_weights_3, *total_weights_4, *total_weights_all;  int i, j, k;  int nr_alphabets;  struct letter_s *seq;  int a_index;  double divider;  char *alphabet;  int alphabet_nr;  struct msa_letter_s *msa_seq;  int a_size;  double weight_sum, *occ_sums;  nr_alphabets = hmmp->nr_alphabets;  /* read seqfile */  rewind(seq_infile);  //get_sequences_std_multi(seq_infile, &seq_info, &hmm);    total_weights_1 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double)));  total_weights_2 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double)));  total_weights_3 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double)));  total_weights_4 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double)));  for(i = 0; i < msa_seq_infop->nr_seqs; i++) {    *(total_weights_1 + i) = 0.0;    *(total_weights_2 + i) = 0.0;    *(total_weights_3 + i) = 0.0;    *(total_weights_4 + i) = 0.0;  }    for(alphabet_nr = 1; alphabet_nr <= nr_alphabets; alphabet_nr++) {    switch(alphabet_nr) {    case 1:      a_size = hmmp->a_size;      msa_seq = msa_seq_infop->msa_seq_1;      alphabet = hmmp->alphabet;      total_weights = total_weights_1;      break;    case 2:      a_size = hmmp->a_size_2;      msa_seq = msa_seq_infop->msa_seq_2;      alphabet = hmmp->alphabet_2;      total_weights = total_weights_2;      break;    case 3:      a_size = hmmp->a_size_3;      msa_seq = msa_seq_infop->msa_seq_3;      alphabet = hmmp->alphabet_3;      total_weights = total_weights_3;      break;    case 4:      a_size = hmmp->a_size_4;      msa_seq = msa_seq_infop->msa_seq_4;      alphabet = hmmp->alphabet_4;      total_weights = total_weights_4;      break;    }        /* for all columns, count m_i */    m = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * sizeof(double)));    for(i = 0; i < msa_seq_infop->msa_seq_length; i++) {      *(m + i) = 0.0;    }        for(i = 0; i < msa_seq_infop->msa_seq_length; i++) {      for(j = 0; j < a_size; j++) {	if((msa_seq + get_mtx_index(i,j,a_size+1))->nr_occurences >= 1.0) {	  *(m + i) += 1.0;	}      }    }        /* for all columns, and sequences, set weigth = 1/m_i * nr_of_letter_j */    weights = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * msa_seq_infop->nr_seqs * sizeof(double)));        // loop over all sequences, over all positions    // get alphabet index for this position, set weight at this postition to 1/(nr_of_occs_of_this_aa_in_this_pos_of_msa * tot_nr_occs)    // if alphabet index < 0, set weight = 0.0 (gap or replacement letter)    for(i = 0; i < msa_seq_infop->nr_seqs; i++) {      if(alphabet_nr == 1) {	seq = (seq_info.seqs + i)->seq_1;      }      else if(alphabet_nr == 2) {	seq = (seq_info.seqs + i)->seq_2;      }      else if(alphabet_nr == 3) {	seq = (seq_info.seqs + i)->seq_3;      }      else if(alphabet_nr == 4) {	seq = (seq_info.seqs + i)->seq_4;      }      j = 0;      while((seq->letter)[0] != '\0') {	a_index = get_alphabet_index(seq->letter, alphabet, a_size);	if(a_index < 0) {	  *(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)) = 0.0;	}	else {	  *(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)) =	    1.0 / (*(m + j) * (msa_seq + get_mtx_index(j, a_index, a_size + 1))->nr_occurences);	}	j++;	seq++;      }    }    /* set total sequence weights to average over whole sequence */    for(i = 0; i < msa_seq_infop->nr_seqs; i++) {      *(total_weights + i) = 0.0;      divider = 0.0;      for(j = 0; j < msa_seq_infop->msa_seq_length; j++) {	if(*(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)) != 0.0) {	  divider += 1.0;	  *(total_weights + i) += *(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length));	}      }      if(divider != 0.0) {	*(total_weights + i) =  *(total_weights + i) / divider;      }          }    /* normalize weights */    weight_sum = 0.0;    for(i = 0; i < msa_seq_infop->nr_seqs; i++) {      weight_sum += *(total_weights + i);    }    for(i = 0; i < msa_seq_infop->nr_seqs; i++) {      *(total_weights + i) = *(total_weights + i) / weight_sum;    }        //dump_weights(weights, msa_seq_info.nr_seqs * msa_seq_info.msa_seq_length);    //dump_weights(total_weights, msa_seq_info.nr_seqs);    free(m);    free(weights);  }    /* get average weight over all alphabets + normalize with number of sequences */  total_weights_all = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double)));  for(i = 0; i < msa_seq_infop->nr_seqs; i++) {    *(total_weights_all + i) = 0.0;    *(total_weights_all + i) += *(total_weights_1 + i);    *(total_weights_all + i) += *(total_weights_2 + i);    *(total_weights_all + i) += *(total_weights_3 + i);    *(total_weights_all + i) += *(total_weights_4 + i);    *(total_weights_all + i) = *(total_weights_all + i)/(double)(nr_alphabets);    *(total_weights_all + i) = *(total_weights_all + i) * (double)(msa_seq_infop->nr_seqs);  }  /* insert weighted shares and nr_occs in msa_seq_info struct */  for(alphabet_nr = 1; alphabet_nr <= nr_alphabets; alphabet_nr++) {    switch(alphabet_nr) {    case 1:      a_size = hmmp->a_size;      msa_seq = msa_seq_infop->msa_seq_1;      alphabet = hmmp->alphabet;      break;    case 2:      a_size = hmmp->a_size_2;      msa_seq = msa_seq_infop->msa_seq_2;      alphabet = hmmp->alphabet_2;      break;    case 3:      a_size = hmmp->a_size_3;      msa_seq = msa_seq_infop->msa_seq_3;      alphabet = hmmp->alphabet_3;      break;    case 4:      a_size = hmmp->a_size_4;      msa_seq = msa_seq_infop->msa_seq_4;      alphabet = hmmp->alphabet_4;      break;    }    for(i = 0; i < msa_seq_infop->msa_seq_length; i++) {      for(j = 0; j < a_size + 1; j++) {	(msa_seq + get_mtx_index(i,j,a_size + 1))->share = 0.0;	(msa_seq + get_mtx_index(i,j,a_size + 1))->prior_share = 0.0;	(msa_seq + get_mtx_index(i,j,a_size + 1))->nr_occurences = 0.0;      }    }    occ_sums = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * sizeof(double)));        for(i = 0; i < msa_seq_infop->nr_seqs; i++) {      if(alphabet_nr == 1) {	seq = (seq_info.seqs + i)->seq_1;      }      else if(alphabet_nr == 2) {	seq = (seq_info.seqs + i)->seq_2;      }      else if(alphabet_nr == 3) {	seq = (seq_info.seqs + i)->seq_3;      }      else if(alphabet_nr == 4) {	seq = (seq_info.seqs + i)->seq_4;      }      j = 0;      while((seq->letter)[0] != '\0') {	a_index = get_alphabet_index(seq->letter, alphabet, a_size);	if(a_index < 0) {	  if(seq->letter[0] == '-' || seq->letter[0] == ' ' || seq->letter[0] == '_' || seq->letter[0] == '.') {	    (msa_seq + get_mtx_index(j, a_size, a_size + 1))->nr_occurences += 1.0 * *(total_weights_all + i);	    *(occ_sums + j) += 1.0 * *(total_weights_all + i);	  }	  else {	    for(k = 0; k < hmm.a_size; k++) {	      (msa_seq + get_mtx_index(j, k, a_size + 1))->nr_occurences += 1.0/(double)(a_size) * *(total_weights_all + i);	      *(occ_sums + j) += 1.0 * *(total_weights_all + i);	    }	  }	}	else {	  (msa_seq + get_mtx_index(j, a_index, a_size + 1))->nr_occurences += 1.0 * *(total_weights_all + i);	  *(occ_sums + j) += 1.0 * *(total_weights_all + i);	}	j++;	seq++;      }    }        for(i = 0; i < msa_seq_infop->msa_seq_length; i++) {      for(j = 0; j < a_size + 1; j++) {	if(*(occ_sums + i) != 0.0) {	  (msa_seq + get_mtx_index(i, j, a_size + 1))->share = (msa_seq + get_mtx_index(i, j, a_size + 1))->nr_occurences /	    *(occ_sums + i);	}      }    }        free(occ_sums);  }    free(seq_info.seqs);  free(total_weights_1);  free(total_weights_2);  free(total_weights_3);  free(total_weights_4);  free(total_weights_all);    //dump_msa_seqs_multi(&msa_seq_info, &hmm);  //exit(0);}void printhelp(){  printhelp_opt();}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -