📄 readseqs_multialpha.c
字号:
seq_pos++; } } break; } else if(c == '>') { inside_seq = NO; get_query_letters = NO; m++; last = '!'; } else if(inside_seq == YES) { /* reading one letter */ letter_pos = 0; cur_letter.letter[letter_pos] = c; letter_pos++; while((c = (char)fgetc(seqfile)) != ';') { cur_letter.letter[letter_pos] = c; letter_pos++; } if(letter_pos > 4) { printf("max letter size = 4 characters\n"); exit(0); } cur_letter.letter[letter_pos] = '\0'; if(seq_pos + 1 > msa_length) { msa_length = seq_pos + 1; /* update nr of positions */ } if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' || cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ { (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos, hmmp->a_size_2, hmmp->a_size_2+1))->nr_occurences += 1; seq_pos++; } else /* non gap character */ { if(use_replacement_letters == YES && replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,2) == YES){ /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */ } else /* add occurence of this letter */{ a_index = get_alphabet_index(&cur_letter, hmmp->alphabet_2, hmmp->a_size_2); if(a_index < 0) { printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter); exit(0); } else { (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos,a_index, hmmp->a_size_2+1))->nr_occurences += 1; } } seq_pos++; } if(get_query_letters == YES) { memcpy((msa_seq_infop->msa_seq_2 + l * (hmmp->a_size_2 + 1))->query_letter, &(cur_letter.letter), sizeof(char) * 5); /* query letter */ l++; } } } } /* read third alphabet, save nr of occurences of each letter in each position, * including nr of gaps */ if(hmmp->nr_alphabets > 2) { seq_pos = 0; inside_seq = NO; seq_index = 0; get_letter_columns = NO; k = 0; l = 0; nr_lead_columns = 0; last = '!'; for(m = 0; m < nr_seqs;) { i = fgetc(seqfile); if(i == EOF) { break; } c = (char)i; //printf("alpha3:c = %c\n",c); if(c == '<' && last != '<') { seq_pos = 0; inside_seq = YES; seq_index++; if(seq_index == lead_seq) { get_query_letters = YES; } last = c; } else if(c == '<') { last = c; } else if(c == '#') { fgets(line, MAX_LINE, seqfile); cur_line = line; while(*cur_line == '#') { cur_line++; } while(*cur_line != '+') { if(*cur_line == '_' || *cur_line == ' ' || (*cur_line == '-' && *(cur_line + 1) == ';') || *cur_line == '.') /* gap */ { (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size_3+1))->share = 0.0; (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size_3+1))->nr_occurences = 0.0; seq_pos++; cur_line++; } else if(*cur_line == 'X') { (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size+3))->share = 0.0; (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, 0, hmmp->a_size+3))->nr_occurences = -1.0; seq_pos++; cur_line++; } else { letter_val = strtod(cur_line, &cur_line); (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,0, hmmp->a_size_3+1))->share = letter_val; (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,0, hmmp->a_size_3+1))->nr_occurences = 1.0; } if(*cur_line != ';') { printf("Strange continuous sequence file format\n"); exit(0); } else { cur_line = cur_line + 1; seq_pos++; } } break; } else if(c == '>') { inside_seq = NO; get_query_letters = NO; m++; last = '!'; } else if(inside_seq == YES) { /* reading one letter */ letter_pos = 0; cur_letter.letter[letter_pos] = c; letter_pos++; while((c = (char)fgetc(seqfile)) != ';') { cur_letter.letter[letter_pos] = c; letter_pos++; } if(letter_pos > 4) { printf("max letter size = 4 characters\n"); exit(0); } cur_letter.letter[letter_pos] = '\0'; if(seq_pos + 1 > msa_length) { msa_length = seq_pos + 1; /* update nr of positions */ } if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' || cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ { (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos, hmmp->a_size_3, hmmp->a_size_3+1))->nr_occurences += 1; seq_pos++; } else /* non gap character */ { if(use_replacement_letters == YES && replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,3) == YES){ /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */ } else /* add occurence of this letter */{ a_index = get_alphabet_index(&cur_letter, hmmp->alphabet_3, hmmp->a_size_3); if(a_index < 0) { printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter); exit(0); } else { (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,a_index, hmmp->a_size_3+1))->nr_occurences += 1; } } seq_pos++; } if(get_query_letters == YES) { memcpy((msa_seq_infop->msa_seq_3 + l * (hmmp->a_size_3 + 1))->query_letter, &(cur_letter.letter), sizeof(char) * 5); /* query letter */ l++; } } } } /* read fourth alphabet, save nr of occurences of each letter in each position, * including nr of gaps */ if(hmmp->nr_alphabets > 3) { seq_pos = 0; inside_seq = NO; seq_index = 0; k = 0; l = 0; nr_lead_columns = 0; last = '!'; for(m = 0; m < nr_seqs;) { i = fgetc(seqfile); if(i == EOF) { break; } c = (char)i; //printf("alpha4:c = %c\n",c); if(c == '<' && last != '<') { seq_pos = 0; inside_seq = YES; seq_index++; if(seq_index == lead_seq) { get_query_letters = YES; } last = c; } else if(c == '<') { last = c; } else if(c == '#') { fgets(line, MAX_LINE, seqfile); cur_line = line; while(*cur_line == '#') { cur_line++; } while(*cur_line != '+') { if(*cur_line == '_' || *cur_line == ' ' || (*cur_line == '-' && *(cur_line + 1) == ';') || *cur_line == '.') /* gap */ { (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->share = 0.0; (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->nr_occurences = 0.0; seq_pos++; cur_line++; } else if(*cur_line == 'X') { (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->share = 0.0; (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, 0, hmmp->a_size_4+1))->nr_occurences = -1.0; seq_pos++; cur_line++; } else { letter_val = strtod(cur_line, &cur_line); (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,0, hmmp->a_size_4+1))->share = letter_val; (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,0, hmmp->a_size_4+1))->nr_occurences = 1.0; } if(*cur_line != ';') { printf("Strange continuous sequence file format\n"); exit(0); } else { cur_line = cur_line + 1; seq_pos++; } } break; } else if(c == '>') { inside_seq = NO; get_query_letters = NO; m++; last = '!'; } else if(inside_seq == YES) { /* reading one letter */ letter_pos = 0; cur_letter.letter[letter_pos] = c; letter_pos++; while((c = (char)fgetc(seqfile)) != ';') { cur_letter.letter[letter_pos] = c; letter_pos++; } if(letter_pos > 4) { printf("max letter size = 4 characters\n"); exit(0); } cur_letter.letter[letter_pos] = '\0'; if(seq_pos + 1 > msa_length) { msa_length = seq_pos + 1; /* update nr of positions */ } if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' || cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ { (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos, hmmp->a_size_4, hmmp->a_size_4+1))->nr_occurences += 1; seq_pos++; } else /* non gap character */ { if(use_replacement_letters == YES && replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,4) == YES){ /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */ } else /* add occurence of this letter */{ a_index = get_alphabet_index(&cur_letter, hmmp->alphabet_4, hmmp->a_size_4); if(a_index < 0) { printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter); exit(0); } else { (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,a_index, hmmp->a_size_4+1))->nr_occurences += 1; } } seq_pos++; } if(get_query_letters == YES) { memcpy((msa_seq_infop->msa_seq_4 + l * (hmmp->a_size_4 + 1))->query_letter, &(cur_letter.letter), sizeof(char) * 5); /* query letter */ l++; } } } } #ifdef DEBUG_MSA_SEQ_STD printf("reached checkpoint 2\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif *(msa_seq_infop->lead_columns_start + k) = END; msa_seq_infop->lead_columns_end = msa_seq_infop->lead_columns_start + k; msa_seq_infop->nr_lead_columns = nr_lead_columns; //msa_seq_infop->nr_seqs = nr_seqs; msa_seq_infop->msa_seq_length = msa_length; msa_seq_infop->gap_shares = (double*)malloc_or_die(msa_length * sizeof(double)); tot_nr_gaps = 0; gaps_per_column = 0; /* go through each position and calculate distribution for all letters in alphabet 1 */ for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences; if(j == hmmp->a_size) { tot_nr_gaps += (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences; gaps_per_column = (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences; } } if(use_priordistribution_1 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_1, hmmp, msa_seq_infop, i,1); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size+1; j++) { if(hmmp->alphabet_type == DISCRETE) { (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->share = (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences / occurences_per_column; (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->label = '.'; } } /* calculate gap_share */ if(hmmp->alphabet_type == DISCRETE) { *(msa_seq_infop->gap_shares + i) = (double)(gaps_per_column) / occurences_per_column; } } /* go through each position and calculate distribution for all letters in alphabet 2 */ if(hmmp->nr_alphabets > 1) { for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size_2+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences; } if(use_priordistribution_2 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_2, hmmp, msa_seq_infop, i,2); //printf("inne if\n"); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size_2+1; j++) { if(hmmp->alphabet_type_2 == DISCRETE) { (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->share = (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences / occurences_per_column; (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->label = '.'; } } } } /* go through each position and calculate distribution for all letters in alphabet 3 */ if(hmmp->nr_alphabets > 2) { for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size_3+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences; } if(use_priordistribution_3 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_3, hmmp, msa_seq_infop, i,3); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size_3+1; j++) { if(hmmp->alphabet_type_3 == DISCRETE) { (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->share = (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences / occurences_per_column; (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->label = '.'; } } } } /* go through each position and calculate distribution for all letters in alphabet 4 */ if(hmmp->nr_alphabets > 3) { for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size_4+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences; } if(use_priordistribution_4 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_4, hmmp, msa_seq_infop, i,4); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size_4+1; j++) { if(hmmp->alphabet_type_4 == DISCRETE) { (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->share = (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences / occurences_per_column; (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->label = '.'; } } } } #ifdef DEBUG_MSA_SEQ_STD
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -