📄 readseqs_multialpha.c
字号:
cur_letter_pos = 0; for(letter_pos = 0;seq_ptr[letter_pos] != '\n';letter_pos++) { if(seq_ptr[letter_pos] == ' ') { } else { letter = seq_ptr[letter_pos]; cur_letter.letter[cur_letter_pos] = letter; cur_letter_pos++; seq_ptr = seq_ptr + letter_pos + 1; break; } } if(cur_letter_pos >= 5) { printf("Maximum of four characters for one letter\n"); exit(0); } else { cur_letter.letter[cur_letter_pos] = '\0'; } /* store lead seq and query columns + label */ if(cur_letter.letter[0] != '-') { //*(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */ //(msa_seq_infop->msa_seq_2 + (*(msa_seq_infop->lead_columns_start + k)) * (hmmp->a_size_2+1))->label = label; k++; } if(1 == 1) { //memcpy((msa_seq_infop->msa_seq_2 + l * (hmmp->a_size_2 + 1))->query_letter, &(cur_letter.letter), //sizeof(char) * 5); /* query letter */ l++; } seq_pos++; } } #ifdef DEBUG_MSA_SEQ_PRF printf("reached checkpoint 2.1\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif } if(hmmp->nr_alphabets > 2) { msa_seq_infop->msa_seq_3 = (struct msa_letter_s*) malloc_or_die(msa_length * (hmmp->a_size_3+1) * sizeof(struct msa_letter_s)); /* read alphabet 1, save nr of occurences of each letter in each position, * including nr of gaps */ rewind(seqfile); seq_pos = 0; k = 0; l = 0; inside_seq = NO; while(fgets(s, MAX_LINE, seqfile) != NULL) { if(strncmp(s,"END 3",5 ) == 0) { break; } if(strncmp(s,"START 3",7) == 0) { inside_seq = YES; } else if(strncmp(s,"NR of aligned sequences",23) == 0) { nr_seqs = strtol(s + 24, NULL, 10); } else if(strncmp(s,"COL",3) == 0 && inside_seq == YES) { /* split into columns depending on a_size, add '-' */ col_pos = 11; seq_ptr = s + col_pos; for(m = 0; m < hmmp->a_size_3 + 1; m++) { nr_occurences = strtod(seq_ptr, &endptr); if(endptr == seq_ptr) { printf("Error reading column: no frequency was read\n"); exit(0); } else { /* add nr of occurences to datastructure */ seq_ptr = endptr; (msa_seq_infop->msa_seq_3 + get_mtx_index(seq_pos,m, hmmp->a_size_3+1))->nr_occurences = nr_occurences; } } /* read space */ strtod(seq_ptr, &endptr); seq_ptr = endptr; /* read label */ label_pos = 0; for(label_pos = 0;seq_ptr[label_pos] != '\n';label_pos++) { if(seq_ptr[label_pos] == ' ') { } else { label = seq_ptr[label_pos]; seq_ptr = seq_ptr + label_pos + 1; break; } } /* read query letter */ letter_pos = 0; cur_letter_pos = 0; for(letter_pos = 0;seq_ptr[letter_pos] != '\n';letter_pos++) { if(seq_ptr[letter_pos] == ' ') { } else { letter = seq_ptr[letter_pos]; cur_letter.letter[cur_letter_pos] = letter; cur_letter_pos++; seq_ptr = seq_ptr + letter_pos + 1; break; } } if(cur_letter_pos >= 5) { printf("Maximum of four characters for one letter\n"); exit(0); } else { cur_letter.letter[cur_letter_pos] = '\0'; } /* store lead seq and query columns + label */ if(cur_letter.letter[0] != '-') { //*(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */ //(msa_seq_infop->msa_seq_3 + (*(msa_seq_infop->lead_columns_start + k)) * (hmmp->a_size_3+1))->label = label; k++; } if(1 == 1) { //memcpy((msa_seq_infop->msa_seq_3 + l * (hmmp->a_size_3 + 1))->query_letter, &(cur_letter.letter), // sizeof(char) * 5); /* query letter */ l++; } seq_pos++; } } #ifdef DEBUG_MSA_SEQ_PRF printf("reached checkpoint 2.2\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif } if(hmmp->nr_alphabets > 3) { msa_seq_infop->msa_seq_4 = (struct msa_letter_s*) malloc_or_die(msa_length * (hmmp->a_size_4+1) * sizeof(struct msa_letter_s)); /* read alphabet 4, save nr of occurences of each letter in each position, * including nr of gaps */ rewind(seqfile); seq_pos = 0; k = 0; l = 0; inside_seq = NO; while(fgets(s, MAX_LINE, seqfile) != NULL) { if(strncmp(s,"END 4",5 ) == 0) { break; } if(strncmp(s,"START 4",7) == 0) { inside_seq = YES; } else if(strncmp(s,"NR of aligned sequences",23) == 0) { nr_seqs = strtol(s + 24, NULL, 10); } else if(strncmp(s,"COL",3) == 0 && inside_seq == YES) { /* split into columns depending on a_size, add '-' */ col_pos = 11; seq_ptr = s + col_pos; for(m = 0; m < hmmp->a_size_4 + 1; m++) { nr_occurences = strtod(seq_ptr, &endptr); if(endptr == seq_ptr) { printf("Error reading column: no frequency was read\n"); exit(0); } else { /* add nr of occurences to datastructure */ seq_ptr = endptr; (msa_seq_infop->msa_seq_4 + get_mtx_index(seq_pos,m, hmmp->a_size_4+1))->nr_occurences = nr_occurences; } } /* read space */ strtod(seq_ptr, &endptr); seq_ptr = endptr; /* read label */ label_pos = 0; for(label_pos = 0;seq_ptr[label_pos] != '\n';label_pos++) { if(seq_ptr[label_pos] == ' ') { } else { label = seq_ptr[label_pos]; seq_ptr = seq_ptr + label_pos + 1; break; } } /* read query letter */ letter_pos = 0; cur_letter_pos = 0; for(letter_pos = 0;seq_ptr[letter_pos] != '\n';letter_pos++) { if(seq_ptr[letter_pos] == ' ') { } else { letter = seq_ptr[letter_pos]; cur_letter.letter[cur_letter_pos] = letter; cur_letter_pos++; seq_ptr = seq_ptr + letter_pos + 1; break; } } if(cur_letter_pos >= 5) { printf("Maximum of four characters for one letter\n"); exit(0); } else { cur_letter.letter[cur_letter_pos] = '\0'; } /* store lead seq and query columns + label */ if(cur_letter.letter[0] != '-') { //*(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */ //(msa_seq_infop->msa_seq_4 + (*(msa_seq_infop->lead_columns_start + k)) * (hmmp->a_size_4+1))->label = label; k++; } if(1 == 1) { //memcpy((msa_seq_infop->msa_seq_4 + l * (hmmp->a_size_4 + 1))->query_letter, &(cur_letter.letter), //sizeof(char) * 5); /* query letter */ l++; } seq_pos++; } } #ifdef DEBUG_MSA_SEQ_PRF printf("reached checkpoint 2.3\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif } /* from everything should be untouched */ *(msa_seq_infop->lead_columns_start + k) = END; msa_seq_infop->lead_columns_end = msa_seq_infop->lead_columns_start + k; msa_seq_infop->nr_lead_columns = nr_lead_columns; msa_seq_infop->nr_seqs = nr_seqs; msa_seq_infop->msa_seq_length = msa_length; msa_seq_infop->gap_shares = (double*)malloc_or_die(msa_length * sizeof(double)); tot_nr_gaps = 0; gaps_per_column = 0; /* go through each position and calculate distribution for all letters in alphabet */ for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences; if(j == hmmp->a_size) { tot_nr_gaps += (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences; gaps_per_column = (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences; } } if(use_priordistribution == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di, hmmp, msa_seq_infop, i,1); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size+1; j++) { (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->share = (msa_seq_infop->msa_seq_1 + get_mtx_index(i,j, hmmp->a_size+1))->nr_occurences / occurences_per_column; } /* calculate gap_share */ *(msa_seq_infop->gap_shares + i) = (double)(gaps_per_column) / occurences_per_column; } /* go through each position and calculate distribution for all letters in alphabet 2 */ if(hmmp->nr_alphabets > 1) { for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size_2+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences; if(j == hmmp->a_size_2) { tot_nr_gaps += (int)(msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences; gaps_per_column = (int)(msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences; } } if(use_priordistribution_2 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_2, hmmp, msa_seq_infop, i,2); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size_2+1; j++) { (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->share = (msa_seq_infop->msa_seq_2 + get_mtx_index(i,j, hmmp->a_size_2+1))->nr_occurences / occurences_per_column; } /* calculate gap_share */ *(msa_seq_infop->gap_shares + i) = (double)(gaps_per_column) / occurences_per_column; } } /* go through each position and calculate distribution for all letters in alphabet 3 */ if(hmmp->nr_alphabets > 2) { for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size_3+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences; if(j == hmmp->a_size_3) { tot_nr_gaps += (int)(msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences; gaps_per_column = (int)(msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences; } } if(use_priordistribution_3 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_3, hmmp, msa_seq_infop, i,3); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size_3+1; j++) { (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->share = (msa_seq_infop->msa_seq_3 + get_mtx_index(i,j, hmmp->a_size_3+1))->nr_occurences / occurences_per_column; } /* calculate gap_share */ *(msa_seq_infop->gap_shares + i) = (double)(gaps_per_column) / occurences_per_column; } } /* go through each position and calculate distribution for all letters in alphabet 4 */ if(hmmp->nr_alphabets > 3) { for(i = 0; i < msa_length; i++) { occurences_per_column = 0; gaps_per_column = 0; for(j = 0; j < hmmp->a_size_4+1; j++) { occurences_per_column += (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences; if(j == hmmp->a_size_4) { tot_nr_gaps += (int)(msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences; gaps_per_column = (int)(msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences; } } if(use_priordistribution_4 == DIRICHLET) /* calculate share update using dirichlet prior mixture */ { update_shares_prior_multi(&em_di_4, hmmp, msa_seq_infop, i,4); } /* simple share update just using the standard quotient */ for(j = 0; j < hmmp->a_size_4+1; j++) { (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->share = (msa_seq_infop->msa_seq_4 + get_mtx_index(i,j, hmmp->a_size_4+1))->nr_occurences / occurences_per_column; } /* calculate gap_share */ *(msa_seq_infop->gap_shares + i) = (double)(gaps_per_column) / occurences_per_column; } } #ifdef DEBUG_MSA_SEQ_PRF printf("reached checkpoint 3\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif/* allocate memory for gaps array and set initial pointers for each position*/ msa_seq_infop->gaps = (int**)malloc_or_die(msa_length * sizeof(int*) + (tot_nr_gaps + msa_length) * sizeof(int)); nr_gaps = 0; for(i = 0; i < msa_length; i++) { *(msa_seq_infop->gaps + i) = (int*)(msa_seq_infop->gaps + msa_length) + i; **(msa_seq_infop->gaps + i) = END; }#ifdef DEBUG_MSA_SEQ_PRF printf("%x\n", msa_seq_infop->gaps); printf("%x\n", *(msa_seq_infop->gaps)); printf("%x\n", *(msa_seq_infop->gaps + 1)); printf("%x\n", *(msa_seq_infop->gaps + 2)); printf("%x\n", *(msa_seq_infop->gaps + 3)); printf("nr_seqs: %d\n", nr_seqs);#endif#ifdef DEBUG_MSA_SEQ_PRF dump_msa_seqs_multi(msa_seq_infop, hmmp); exit(0);#endif /* cleanup and return */ if(use_priordistribution == DIRICHLET) { free(em_di.q_values); free(em_di.alpha_sums); free(em_di.logbeta_values); free(em_di.prior_values); } if(use_priordistribution == DIRICHLET && hmmp->nr_alphabets > 1) { free(em_di_2.q_values); free(em_di_2.alpha_sums); free(em_di_2.logbeta_values); free(em_di_2.prior_values); } if(use_priordistribution == DIRICHLET && hmmp->nr_alphabets > 2) { free(em_di_3.q_values); free(em_di_3.alpha_sums); free(em_di_3.logbeta_values); free(em_di_3.prior_values); } if(use_priordistribution == DIRICHLET && hmmp->nr_alphabets > 3) { free(em_di_4.q_values); free(em_di_4.alpha_sums); free(em_di_4.logbeta_values); free(em_di_4.prior_values); } return;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -