⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 readseqs_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
  printf("reached checkpoint 3\n");  printf("total_nr_gaps = %d\n", tot_nr_gaps); #endif/* allocate memory for gaps array and set initial pointers for each position*/  msa_seq_infop->gaps = (int**)malloc_or_die(msa_length * sizeof(int*) +					     (tot_nr_gaps + msa_length) * sizeof(int));  nr_gaps = 0;  for(i = 0; i < msa_length; i++) {    *(msa_seq_infop->gaps + i) = (int*)(msa_seq_infop->gaps + msa_length) + nr_gaps;    nr_gaps += (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i, hmmp->a_size, hmmp->a_size+1))->nr_occurences;    nr_gaps += 1;  }#ifdef DEBUG_MSA_SEQ_STD  printf("%x\n", msa_seq_infop->gaps);  printf("%x\n", *(msa_seq_infop->gaps));  printf("%x\n", *(msa_seq_infop->gaps + 1));  printf("%x\n", *(msa_seq_infop->gaps + 2));  printf("%x\n", *(msa_seq_infop->gaps + 3));  printf("nr_seqs: %d\n", nr_seqs);#endif  /********** verify that this is done correctly even if there are multiple alphabets */   /* go through every sequence and store which sequences have gaps at what   * positions in gaps array */  rewind(seqfile);  inside_seq = NO;  seq_pos = 0;  cur_posp = msa_seq_infop->gaps;  last = '!';  for(i = 0; i < nr_seqs;) {    c = (char)fgetc(seqfile);    if(c == '<' && last == '<') {      break;    }    else if(c == '<') {      inside_seq = YES;      cur_posp = msa_seq_infop->gaps;      last = '<';    }    else if(c == '>') {      inside_seq = NO;      i++;      last = '!';    }    else if(inside_seq == YES && (c == '_' || c == ' ' || c == '-' || c == '.')) {      (**cur_posp) = i+1;      #ifdef DEBUG_MSA_SEQ_STD      printf("posp = %x\n", cur_posp);      printf("*posp = %x\n", *cur_posp);      printf("**posp = %x\n", **cur_posp);#endif      (*cur_posp)++;      cur_posp++;      while((c = (char)fgetc(seqfile)) != ';') {	      }      last = '!';    }    else if(inside_seq == YES) {      while((c = (char)fgetc(seqfile)) != ';') {	      }      cur_posp++;      last = '!';    }  }  cur_posp = msa_seq_infop->gaps;  for(i = 0; i < msa_seq_infop->msa_seq_length; i++) {     (**cur_posp) = END;     cur_posp++;  }  nr_gaps = 0;  for(i = 0; i < msa_length; i++) {    *(msa_seq_infop->gaps + i) = (int*)(msa_seq_infop->gaps + msa_length) + nr_gaps;    nr_gaps += (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i, hmmp->a_size, hmmp->a_size+1))->nr_occurences;    nr_gaps += 1;  }  #ifdef DEBUG_MSA_SEQ_STD  dump_msa_seqs_multi(msa_seq_infop, hmmp);  printf("reached checkpoint 4\n");  exit(0);#endif  /* cleanup and return */  if(use_priordistribution_1 == DIRICHLET) {    if(em_di_1.nr_components > 0) {      free(em_di_1.q_values);      free(em_di_1.alpha_sums);      free(em_di_1.logbeta_values);      free(em_di_1.prior_values);    }  }  if(use_priordistribution_2 == DIRICHLET) {    if(em_di_2.nr_components > 0) {      free(em_di_2.q_values);      free(em_di_2.alpha_sums);      free(em_di_2.logbeta_values);      free(em_di_2.prior_values);    }  }  if(use_priordistribution_3 == DIRICHLET) {    if(em_di_3.nr_components > 0) {      free(em_di_3.q_values);      free(em_di_3.alpha_sums);      free(em_di_3.logbeta_values);      free(em_di_3.prior_values);    }  }  if(use_priordistribution_4 == DIRICHLET) {    if(em_di_4.nr_components > 0) {      free(em_di_4.q_values);      free(em_di_4.alpha_sums);      free(em_di_4.logbeta_values);      free(em_di_4.prior_values);    }  }  return;}/* Note: msa_seq_infop->seq and msa_seq_infop->gaps will be allocated here but must be * freed by caller */void get_sequences_msa_prf_multi(FILE *seqfile, FILE *priorfile, struct msa_sequences_multi_s *msa_seq_infop,				 struct hmm_multi_s *hmmp){    int i,j,k,l,m;  int done, inside_seq, seq_pos, letter_pos, a_index, nr_seqs, cur_pos, cur_seq;  int msa_length, seq_length, longest_seq, longest_seq_length;  int seq_index, nr_lead_columns;  int gaps_per_column, tot_nr_gaps, nr_gaps;  double occurences_per_column;  int get_letter_columns, get_query_letters;  int use_priordistribution,use_priordistribution_2, use_priordistribution_3, use_priordistribution_4;   char c;  char s[MAX_LINE];  struct letter_s cur_letter;  int **cur_posp;  struct emission_dirichlet_s em_di,em_di_2, em_di_3, em_di_4;  int is_first;  int use_replacement_letters;  int is_empty;  int col_pos;  char *seq_ptr;  char *endptr;  double nr_occurences;  int label_pos, cur_letter_pos;  char label, letter;  int read_priorfile;      /* find out length of alignment and allocate memory for probability matrix by   * reading all sequence rows and remembering the longest */  done = NO;  inside_seq = NO;  msa_length = 0;  seq_length = 0;  seq_index = 0;if(priorfile == NULL) {    use_priordistribution = NONE;    use_priordistribution_2 = NONE;    use_priordistribution_3 = NONE;    use_priordistribution_4 = NONE;  }  else {    read_priorfile = read_multi_prior_file_multi(&em_di, hmmp, priorfile, 1);    if(read_priorfile > 0) {      use_priordistribution = DIRICHLET;    }    else if(read_priorfile < 0) {      printf("Error: Incorrect priorfile format\n");      exit(0);    }    else {      use_priordistribution = NONE;    }    if(hmmp->nr_alphabets > 1) {      read_priorfile = read_multi_prior_file_multi(&em_di_2, hmmp, priorfile, 2);      if(read_priorfile > 0) {	use_priordistribution_2 = DIRICHLET;      }      else if(read_priorfile < 0) {	printf("Error: Incorrect priorfile format\n");	exit(0);      }      else {	use_priordistribution_2 = NONE;      }    }    if(hmmp->nr_alphabets > 2) {      read_priorfile = read_multi_prior_file_multi(&em_di_3, hmmp, priorfile, 3);      if(read_priorfile > 0) {	use_priordistribution_3 = DIRICHLET;      }      else if(read_priorfile < 0) {	printf("Error: Incorrect priorfile format\n");	exit(0);      }      else {	use_priordistribution_3 = NONE;      }    }    if(hmmp->nr_alphabets > 3) {      read_priorfile = read_multi_prior_file_multi(&em_di_4, hmmp, priorfile, 4);      if(read_priorfile > 0) {	use_priordistribution_4 = DIRICHLET;      }      else if(read_priorfile < 0) {	printf("Error: Incorrect priorfile format\n");	exit(0);      }      else {	use_priordistribution_4 = NONE;      }    }  }      /* check if file is empty */  is_empty = YES;  while(done != YES) {    c = (char)fgetc(seqfile);    if((int)c == EOF) {      break;    }    else {      is_empty = NO;      break;    }  }  if(is_empty == YES) {    if(verbose == YES) {      printf("File is empty\n");    }   }  else {  }  rewind(seqfile);  while(fgets(s, MAX_LINE, seqfile) != NULL) {    if(strncmp(s,"COL",3) == 0) {      msa_length = atoi(&s[4]);    }  }#ifdef DEBUG_MSA_SEQ_PRF  printf("reached checkpoint 1\n");  printf("msa_length = %d\n", msa_length);#endif    msa_seq_infop->lead_columns_start = (int*)malloc_or_die((msa_length+1) * sizeof(int));  msa_seq_infop->msa_seq_1 = (struct msa_letter_s*)    malloc_or_die(msa_length * (hmmp->a_size+1) * sizeof(struct msa_letter_s));#ifdef DEBUG_MSA_SEQ_PRF  printf("malloced ok\n");#endif  /* read alphabet 1, save nr of occurences of each letter in each position,   * including nr of gaps */  rewind(seqfile);  seq_pos = 0;  nr_lead_columns = 0;  k = 0;  l = 0;  inside_seq = NO;  while(fgets(s, MAX_LINE, seqfile) != NULL) {    if(strncmp(s,"END 1",5 ) == 0) {      break;    }    if(strncmp(s,"START 1",7) == 0) {      inside_seq = YES;    }    else if(strncmp(s,"NR of aligned sequences",23) == 0) {      nr_seqs = strtol(s + 24, NULL, 10);    }    else if(strncmp(s,"COL",3) == 0 && inside_seq == YES) {      /* split into columns depending on a_size, add '-' */      col_pos = 11;      seq_ptr = s + col_pos;      for(m = 0; m < hmmp->a_size + 1; m++) {	nr_occurences = strtod(seq_ptr, &endptr);	if(endptr == seq_ptr) {	  printf("Error reading column: no frequency was read\n");	  exit(0);	}	else {	  /* add nr of occurences to datastructure */	  seq_ptr = endptr;	  (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos,m, hmmp->a_size+1))->nr_occurences = nr_occurences;	}      }      /* read space */      strtod(seq_ptr, &endptr);      seq_ptr = endptr;      /* read label */      label_pos = 0;      for(label_pos = 0;seq_ptr[label_pos] != '\n';label_pos++) {	if(seq_ptr[label_pos] == ' ') {	  	}	else {	  label = seq_ptr[label_pos];	  seq_ptr = seq_ptr + label_pos + 1;	  break;	}      }          /* read query letter */      letter_pos = 0;      cur_letter_pos = 0;      for(letter_pos = 0;seq_ptr[letter_pos] != '\n';letter_pos++) {	if(seq_ptr[letter_pos] == ' ') {	  	}	else {	  letter = seq_ptr[letter_pos];	  cur_letter.letter[cur_letter_pos] = letter;	  cur_letter_pos++;	  seq_ptr = seq_ptr + letter_pos + 1;	  break;	}      }          if(cur_letter_pos >= 5) {	printf("Maximum of four characters for one letter\n");	exit(0);      }      else {	cur_letter.letter[cur_letter_pos] = '\0';      }      /* store lead seq and query columns + label */      if(cur_letter.letter[0] != '-') {	*(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */	if(label == '-') {	  label = '.';	}	/* only add label to first msa_letter of the first alphabet */	(msa_seq_infop->msa_seq_1 + (*(msa_seq_infop->lead_columns_start + k)) * (hmmp->a_size+1))->label = label;	k++;	nr_lead_columns++;      }      if(1 == 1) {	memcpy((msa_seq_infop->msa_seq_1 + l * (hmmp->a_size + 1))->query_letter, &(cur_letter.letter),	       sizeof(char) * 5); /* query letter */	l++;      }      seq_pos++;    }  }  #ifdef DEBUG_MSA_SEQ_PRF  printf("reached checkpoint 2\n");  printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif        /* read alphabet 2, save nr of occurences of each letter in each position,   * including nr of gaps */  if(hmmp->nr_alphabets > 1) {    msa_seq_infop->msa_seq_2 = (struct msa_letter_s*)    malloc_or_die(msa_length * (hmmp->a_size_2+1) * sizeof(struct msa_letter_s));    rewind(seqfile);    seq_pos = 0;    k = 0;    l = 0;    inside_seq = NO;    while(fgets(s, MAX_LINE, seqfile) != NULL) {      if(strncmp(s,"END 2",5 ) == 0) {	break;      }      if(strncmp(s,"START 2",7) == 0) {	inside_seq = YES;      }      else if(strncmp(s,"NR of aligned sequences",23) == 0) {	nr_seqs = strtol(s + 24, NULL, 10);      }      else if(strncmp(s,"COL",3) == 0 && inside_seq == YES) {	/* split into columns depending on a_size, add '-' */	col_pos = 11;	seq_ptr = s + col_pos;	for(m = 0; m < hmmp->a_size_2 + 1; m++) {	  nr_occurences = strtod(seq_ptr, &endptr);	  if(endptr == seq_ptr) {	    printf("Error reading column: no frequency was read\n");	    exit(0);	  }	  else {	    /* add nr of occurences to datastructure */	    seq_ptr = endptr;	    (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos,m, hmmp->a_size_2+1))->nr_occurences = nr_occurences;	  }	}	/* read space */	strtod(seq_ptr, &endptr);	seq_ptr = endptr;		/* read label */	label_pos = 0;	for(label_pos = 0;seq_ptr[label_pos] != '\n';label_pos++) {	  if(seq_ptr[label_pos] == ' ') {	    	  }	  else {	    label = seq_ptr[label_pos];	    seq_ptr = seq_ptr + label_pos + 1;	    break;	  }	}		/* read query letter */	letter_pos = 0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -