⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 readseqs_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
	    seq_pos++;	  }	}	break;      }      else if(c == '>') {	inside_seq = NO;	get_query_letters = NO;	m++;	last = '!';      }      else if(inside_seq == YES) {	/* reading one letter */	letter_pos = 0;	cur_letter.letter[letter_pos] = c;	letter_pos++;	while((c = (char)fgetc(seqfile)) != ';') {	  cur_letter.letter[letter_pos] = c;	  letter_pos++;	}	if(letter_pos > 4) {	  printf("max letter size = 4 characters\n");	  exit(0);	}	cur_letter.letter[letter_pos] = '\0';	if(seq_pos + 1 > msa_length) {	  msa_length = seq_pos + 1; /* update nr of positions */	}	if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' ||	   cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ {	  (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos, hmmp->a_size_2, hmmp->a_size_2+1))->nr_occurences	    += 1;	  seq_pos++;	}	else /* non gap character */ {	  if(use_replacement_letters == YES &&	     replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,2) == YES){	    /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */	  }	  else /* add occurence of this letter */{	    a_index = get_alphabet_index(&cur_letter, hmmp->alphabet_2, hmmp->a_size_2);	    if(a_index < 0) {	      printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter);	      exit(0);	    }	    else {	      (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos,a_index, hmmp->a_size_2+1))->nr_occurences += 1;	    }	  }	  seq_pos++;	}	if(get_query_letters == YES) {	  memcpy((msa_seq_infop->msa_seq_2 + l * (hmmp->a_size_2 + 1))->query_letter, &(cur_letter.letter),		 sizeof(char) * 5); /* query letter */	  l++;	}      }    }  }  /* read third alphabet, save nr of occurences of each letter in each position,   * including nr of gaps */  if(hmmp->nr_alphabets > 2) {    seq_pos = 0;    inside_seq = NO;    seq_index = 0;    get_letter_columns = NO;    k = 0;    l = 0;    nr_lead_columns = 0;    last = '!';    for(m = 0; m < nr_seqs;) {      i = fgetc(seqfile);      if(i == EOF) {	break;      }      c = (char)i;      //printf("alpha3:c = %c\n",c);            if(c == '<' && last != '<') {	seq_pos = 0;	inside_seq = YES;	seq_index++;	if(seq_index == lead_seq) {	  get_query_letters = YES;	}	last = c;      }      else if(c == '<') {	last = c;      }      else if(c == '#') {	fgets(line, MAX_LINE, seqfile);	cur_line = line;	while(*cur_line == '#') {	  cur_line++;	}	while(*cur_line != '+') {	  if(*cur_line == '_' || *cur_line == ' ' ||	     (*cur_line == '-' && *(cur_line + 1) == ';') || *cur_line == '.') /* gap */ {	    (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size_3+1))->share = 0.0;	    (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size_3+1))->nr_occurences = 0.0;	    seq_pos++;	    cur_line++;	  }	  else if(*cur_line == 'X') {	    (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size+3))->share = 0.0;	    (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size+3))->nr_occurences = -1.0;	    seq_pos++;	    cur_line++;	  }	  else {	    letter_val = strtod(cur_line, &cur_line);	    (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,0, hmmp->a_size_3+1))->share = letter_val;	    (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,0, hmmp->a_size_3+1))->nr_occurences = 1.0;	  }	  if(*cur_line != ';') {	    printf("Strange continuous sequence file format\n");	    exit(0);	  }	  else {	    cur_line = cur_line + 1;	    seq_pos++;	  }	}	break;      }      else if(c == '>') {	inside_seq = NO;	get_query_letters = NO;	m++;	last = '!';      }      else if(inside_seq == YES) {	/* reading one letter */	letter_pos = 0;	cur_letter.letter[letter_pos] = c;	letter_pos++;	while((c = (char)fgetc(seqfile)) != ';') {	  cur_letter.letter[letter_pos] = c;	  letter_pos++;	}	if(letter_pos > 4) {	  printf("max letter size = 4 characters\n");	  exit(0);	}	cur_letter.letter[letter_pos] = '\0';	if(seq_pos + 1 > msa_length) {	  msa_length = seq_pos + 1; /* update nr of positions */	}	if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' ||	   cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ {	  (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, hmmp->a_size_3, hmmp->a_size_3+1))->nr_occurences	    += 1;	  seq_pos++;	}	else /* non gap character */ {	  if(use_replacement_letters == YES &&	     replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,3) == YES){	    /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */	  }	  else /* add occurence of this letter */{	    a_index = get_alphabet_index(&cur_letter, hmmp->alphabet_3, hmmp->a_size_3);	    if(a_index < 0) {	      printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter);	      exit(0);	    }	    else {	      (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,a_index, hmmp->a_size_3+1))->nr_occurences += 1;	    }	  }	  seq_pos++;	}	if(get_query_letters == YES) {	  memcpy((msa_seq_infop->msa_seq_3 + l * (hmmp->a_size_3 + 1))->query_letter, &(cur_letter.letter),		 sizeof(char) * 5); /* query letter */	  l++;	}      }    }  }  /* read fourth alphabet, save nr of occurences of each letter in each position,   * including nr of gaps */  if(hmmp->nr_alphabets > 3) {    seq_pos = 0;    inside_seq = NO;    seq_index = 0;    k = 0;    l = 0;    nr_lead_columns = 0;    last = '!';    for(m = 0; m < nr_seqs;) {      i = fgetc(seqfile);      if(i == EOF) {	break;      }      c = (char)i;      //printf("alpha4:c = %c\n",c);            if(c == '<' && last != '<') {	seq_pos = 0;	inside_seq = YES;	seq_index++;	if(seq_index == lead_seq) {	  get_query_letters = YES;	}	last = c;      }      else if(c == '<') {	last = c;      }       else if(c == '#') {	fgets(line, MAX_LINE, seqfile);	cur_line = line;	while(*cur_line == '#') {	  cur_line++;	}	while(*cur_line != '+') {	  if(*cur_line == '_' || *cur_line == ' ' ||	     (*cur_line == '-' && *(cur_line + 1) == ';') || *cur_line == '.') /* gap */ {	    (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->share = 0.0;	    (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->nr_occurences = 0.0;	    seq_pos++;	    cur_line++;	  }	  else if(*cur_line == 'X') {	    (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->share = 0.0;	    (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->nr_occurences = -1.0;	    seq_pos++;	    cur_line++;	  }	  else {	    letter_val = strtod(cur_line, &cur_line);	    (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,0, hmmp->a_size_4+1))->share = letter_val;	    (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,0, hmmp->a_size_4+1))->nr_occurences = 1.0;	  }	  if(*cur_line != ';') {	    printf("Strange continuous sequence file format\n");	    exit(0);	  }	  else {	    cur_line = cur_line + 1;	    seq_pos++;	  }	}	break;      }      else if(c == '>') {	inside_seq = NO;	get_query_letters = NO;	m++;	last = '!';      }      else if(inside_seq == YES) {	/* reading one letter */	letter_pos = 0;	cur_letter.letter[letter_pos] = c;	letter_pos++;	while((c = (char)fgetc(seqfile)) != ';') {	  cur_letter.letter[letter_pos] = c;	  letter_pos++;	}	if(letter_pos > 4) {	  printf("max letter size = 4 characters\n");	  exit(0);	}	cur_letter.letter[letter_pos] = '\0';	if(seq_pos + 1 > msa_length) {	  msa_length = seq_pos + 1; /* update nr of positions */	}	if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' ||	   cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ {	  (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, hmmp->a_size_4, hmmp->a_size_4+1))->nr_occurences	    += 1;	  seq_pos++;	}	else /* non gap character */ {	  if(use_replacement_letters == YES &&	     replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,4) == YES){	    /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */	  }	  else /* add occurence of this letter */{	    a_index = get_alphabet_index(&cur_letter, hmmp->alphabet_4, hmmp->a_size_4);	    if(a_index < 0) {	      printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter);	      exit(0);	    }	    else {	      (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,a_index, hmmp->a_size_4+1))->nr_occurences += 1;	    }	  }	  seq_pos++;	}	if(get_query_letters == YES) {	  memcpy((msa_seq_infop->msa_seq_4 + l * (hmmp->a_size_4 + 1))->query_letter, &(cur_letter.letter),		 sizeof(char) * 5); /* query letter */	  l++;	}      }    }  }  #ifdef DEBUG_MSA_SEQ_STD  printf("reached checkpoint 2\n");  printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif  *(msa_seq_infop->lead_columns_start + k) = END;  msa_seq_infop->lead_columns_end = msa_seq_infop->lead_columns_start + k;  msa_seq_infop->nr_lead_columns = nr_lead_columns;  //msa_seq_infop->nr_seqs = nr_seqs;  msa_seq_infop->msa_seq_length = msa_length;  msa_seq_infop->gap_shares = (double*)malloc_or_die(msa_length * sizeof(double));   tot_nr_gaps = 0;  gaps_per_column = 0;  /* go through each position and calculate distribution for all letters in alphabet 1 */  for(i = 0; i < msa_length; i++) {    occurences_per_column = 0;    gaps_per_column = 0;    for(j = 0; j < hmmp->a_size+1; j++) {      occurences_per_column += (msa_seq_infop->msa_seq_1 +				      get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences;      if(j == hmmp->a_size) {	tot_nr_gaps += (int)(msa_seq_infop->msa_seq_1 +			     get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences;	gaps_per_column = (int)(msa_seq_infop->msa_seq_1 +				get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences;      }    }    if(use_priordistribution_1 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ {      update_shares_prior_multi(&em_di_1, hmmp, msa_seq_infop, i,1);          }    /* simple share update just using the standard quotient */     for(j = 0; j < hmmp->a_size+1; j++) {      if(hmmp->alphabet_type == DISCRETE) {	(msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->share =	  (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences / 	  occurences_per_column;	(msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->label = '.';      }    }            /* calculate gap_share */    if(hmmp->alphabet_type == DISCRETE) {      *(msa_seq_infop->gap_shares + i) = (double)(gaps_per_column) / occurences_per_column;    }  }    /* go through each position and calculate distribution for all letters in alphabet 2 */  if(hmmp->nr_alphabets > 1) {    for(i = 0; i < msa_length; i++) {      occurences_per_column = 0;      gaps_per_column = 0;      for(j = 0; j < hmmp->a_size_2+1; j++) {	occurences_per_column += (msa_seq_infop->msa_seq_2 +				      get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences;      }      if(use_priordistribution_2 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ {	update_shares_prior_multi(&em_di_2, hmmp, msa_seq_infop, i,2);	//printf("inne if\n");      }      /* simple share update just using the standard quotient */       for(j = 0; j < hmmp->a_size_2+1; j++) {	if(hmmp->alphabet_type_2 == DISCRETE) {	  (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->share =	    (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences / 	    occurences_per_column;	  (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->label = '.';	}      }        }  }  /* go through each position and calculate distribution for all letters in alphabet 3 */  if(hmmp->nr_alphabets > 2) {    for(i = 0; i < msa_length; i++) {      occurences_per_column = 0;      gaps_per_column = 0;      for(j = 0; j < hmmp->a_size_3+1; j++) {	occurences_per_column += (msa_seq_infop->msa_seq_3 +				  get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences;      }      if(use_priordistribution_3 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ {	update_shares_prior_multi(&em_di_3, hmmp, msa_seq_infop, i,3);      }      /* simple share update just using the standard quotient */       for(j = 0; j < hmmp->a_size_3+1; j++) {	if(hmmp->alphabet_type_3 == DISCRETE) {	  (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->share =	    (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences / 	    occurences_per_column;	  (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->label = '.';	}          }    }  }    /* go through each position and calculate distribution for all letters in alphabet 4 */  if(hmmp->nr_alphabets > 3) {    for(i = 0; i < msa_length; i++) {      occurences_per_column = 0;      gaps_per_column = 0;      for(j = 0; j < hmmp->a_size_4+1; j++) {	occurences_per_column += (msa_seq_infop->msa_seq_4 +				  get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences;      }      if(use_priordistribution_4 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ {	update_shares_prior_multi(&em_di_4, hmmp, msa_seq_infop, i,4);      }      /* simple share update just using the standard quotient */       for(j = 0; j < hmmp->a_size_4+1; j++) {	if(hmmp->alphabet_type_4 == DISCRETE) {	  (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->share =	    (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences / 	    occurences_per_column;	  (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->label = '.';	}      }        }  }  #ifdef DEBUG_MSA_SEQ_STD

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -