📄 readseqs_multialpha.c
字号:
printf("reached checkpoint 3\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps); #endif/* allocate memory for gaps array and set initial pointers for each position*/ msa_seq_infop->gaps = (int**)malloc_or_die(msa_length * sizeof(int*) + (tot_nr_gaps + msa_length) * sizeof(int)); nr_gaps = 0; for(i = 0; i < msa_length; i++) { *(msa_seq_infop->gaps + i) = (int*)(msa_seq_infop->gaps + msa_length) + nr_gaps; nr_gaps += (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i, hmmp->a_size, hmmp->a_size+1))->nr_occurences; nr_gaps += 1; }#ifdef DEBUG_MSA_SEQ_STD printf("%x\n", msa_seq_infop->gaps); printf("%x\n", *(msa_seq_infop->gaps)); printf("%x\n", *(msa_seq_infop->gaps + 1)); printf("%x\n", *(msa_seq_infop->gaps + 2)); printf("%x\n", *(msa_seq_infop->gaps + 3)); printf("nr_seqs: %d\n", nr_seqs);#endif /********** verify that this is done correctly even if there are multiple alphabets */ /* go through every sequence and store which sequences have gaps at what * positions in gaps array */ rewind(seqfile); inside_seq = NO; seq_pos = 0; cur_posp = msa_seq_infop->gaps; last = '!'; for(i = 0; i < nr_seqs;) { c = (char)fgetc(seqfile); if(c == '<' && last == '<') { break; } else if(c == '<') { inside_seq = YES; cur_posp = msa_seq_infop->gaps; last = '<'; } else if(c == '>') { inside_seq = NO; i++; last = '!'; } else if(inside_seq == YES && (c == '_' || c == ' ' || c == '-' || c == '.')) { (**cur_posp) = i+1; #ifdef DEBUG_MSA_SEQ_STD printf("posp = %x\n", cur_posp); printf("*posp = %x\n", *cur_posp); printf("**posp = %x\n", **cur_posp);#endif (*cur_posp)++; cur_posp++; while((c = (char)fgetc(seqfile)) != ';') { } last = '!'; } else if(inside_seq == YES) { while((c = (char)fgetc(seqfile)) != ';') { } cur_posp++; last = '!'; } } cur_posp = msa_seq_infop->gaps; for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { (**cur_posp) = END; cur_posp++; } nr_gaps = 0; for(i = 0; i < msa_length; i++) { *(msa_seq_infop->gaps + i) = (int*)(msa_seq_infop->gaps + msa_length) + nr_gaps; nr_gaps += (int)(msa_seq_infop->msa_seq_1 + get_mtx_index(i, hmmp->a_size, hmmp->a_size+1))->nr_occurences; nr_gaps += 1; } #ifdef DEBUG_MSA_SEQ_STD dump_msa_seqs_multi(msa_seq_infop, hmmp); printf("reached checkpoint 4\n"); exit(0);#endif /* cleanup and return */ if(use_priordistribution_1 == DIRICHLET) { if(em_di_1.nr_components > 0) { free(em_di_1.q_values); free(em_di_1.alpha_sums); free(em_di_1.logbeta_values); free(em_di_1.prior_values); } } if(use_priordistribution_2 == DIRICHLET) { if(em_di_2.nr_components > 0) { free(em_di_2.q_values); free(em_di_2.alpha_sums); free(em_di_2.logbeta_values); free(em_di_2.prior_values); } } if(use_priordistribution_3 == DIRICHLET) { if(em_di_3.nr_components > 0) { free(em_di_3.q_values); free(em_di_3.alpha_sums); free(em_di_3.logbeta_values); free(em_di_3.prior_values); } } if(use_priordistribution_4 == DIRICHLET) { if(em_di_4.nr_components > 0) { free(em_di_4.q_values); free(em_di_4.alpha_sums); free(em_di_4.logbeta_values); free(em_di_4.prior_values); } } return;}/* Note: msa_seq_infop->seq and msa_seq_infop->gaps will be allocated here but must be * freed by caller */void get_sequences_msa_prf_multi(FILE *seqfile, FILE *priorfile, struct msa_sequences_multi_s *msa_seq_infop, struct hmm_multi_s *hmmp){ int i,j,k,l,m; int done, inside_seq, seq_pos, letter_pos, a_index, nr_seqs, cur_pos, cur_seq; int msa_length, seq_length, longest_seq, longest_seq_length; int seq_index, nr_lead_columns; int gaps_per_column, tot_nr_gaps, nr_gaps; double occurences_per_column; int get_letter_columns, get_query_letters; int use_priordistribution,use_priordistribution_2, use_priordistribution_3, use_priordistribution_4; char c; char s[MAX_LINE]; struct letter_s cur_letter; int **cur_posp; struct emission_dirichlet_s em_di,em_di_2, em_di_3, em_di_4; int is_first; int use_replacement_letters; int is_empty; int col_pos; char *seq_ptr; char *endptr; double nr_occurences; int label_pos, cur_letter_pos; char label, letter; int read_priorfile; /* find out length of alignment and allocate memory for probability matrix by * reading all sequence rows and remembering the longest */ done = NO; inside_seq = NO; msa_length = 0; seq_length = 0; seq_index = 0;if(priorfile == NULL) { use_priordistribution = NONE; use_priordistribution_2 = NONE; use_priordistribution_3 = NONE; use_priordistribution_4 = NONE; } else { read_priorfile = read_multi_prior_file_multi(&em_di, hmmp, priorfile, 1); if(read_priorfile > 0) { use_priordistribution = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution = NONE; } if(hmmp->nr_alphabets > 1) { read_priorfile = read_multi_prior_file_multi(&em_di_2, hmmp, priorfile, 2); if(read_priorfile > 0) { use_priordistribution_2 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_2 = NONE; } } if(hmmp->nr_alphabets > 2) { read_priorfile = read_multi_prior_file_multi(&em_di_3, hmmp, priorfile, 3); if(read_priorfile > 0) { use_priordistribution_3 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_3 = NONE; } } if(hmmp->nr_alphabets > 3) { read_priorfile = read_multi_prior_file_multi(&em_di_4, hmmp, priorfile, 4); if(read_priorfile > 0) { use_priordistribution_4 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_4 = NONE; } } } /* check if file is empty */ is_empty = YES; while(done != YES) { c = (char)fgetc(seqfile); if((int)c == EOF) { break; } else { is_empty = NO; break; } } if(is_empty == YES) { if(verbose == YES) { printf("File is empty\n"); } } else { } rewind(seqfile); while(fgets(s, MAX_LINE, seqfile) != NULL) { if(strncmp(s,"COL",3) == 0) { msa_length = atoi(&s[4]); } }#ifdef DEBUG_MSA_SEQ_PRF printf("reached checkpoint 1\n"); printf("msa_length = %d\n", msa_length);#endif msa_seq_infop->lead_columns_start = (int*)malloc_or_die((msa_length+1) * sizeof(int)); msa_seq_infop->msa_seq_1 = (struct msa_letter_s*) malloc_or_die(msa_length * (hmmp->a_size+1) * sizeof(struct msa_letter_s));#ifdef DEBUG_MSA_SEQ_PRF printf("malloced ok\n");#endif /* read alphabet 1, save nr of occurences of each letter in each position, * including nr of gaps */ rewind(seqfile); seq_pos = 0; nr_lead_columns = 0; k = 0; l = 0; inside_seq = NO; while(fgets(s, MAX_LINE, seqfile) != NULL) { if(strncmp(s,"END 1",5 ) == 0) { break; } if(strncmp(s,"START 1",7) == 0) { inside_seq = YES; } else if(strncmp(s,"NR of aligned sequences",23) == 0) { nr_seqs = strtol(s + 24, NULL, 10); } else if(strncmp(s,"COL",3) == 0 && inside_seq == YES) { /* split into columns depending on a_size, add '-' */ col_pos = 11; seq_ptr = s + col_pos; for(m = 0; m < hmmp->a_size + 1; m++) { nr_occurences = strtod(seq_ptr, &endptr); if(endptr == seq_ptr) { printf("Error reading column: no frequency was read\n"); exit(0); } else { /* add nr of occurences to datastructure */ seq_ptr = endptr; (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos,m, hmmp->a_size+1))->nr_occurences = nr_occurences; } } /* read space */ strtod(seq_ptr, &endptr); seq_ptr = endptr; /* read label */ label_pos = 0; for(label_pos = 0;seq_ptr[label_pos] != '\n';label_pos++) { if(seq_ptr[label_pos] == ' ') { } else { label = seq_ptr[label_pos]; seq_ptr = seq_ptr + label_pos + 1; break; } } /* read query letter */ letter_pos = 0; cur_letter_pos = 0; for(letter_pos = 0;seq_ptr[letter_pos] != '\n';letter_pos++) { if(seq_ptr[letter_pos] == ' ') { } else { letter = seq_ptr[letter_pos]; cur_letter.letter[cur_letter_pos] = letter; cur_letter_pos++; seq_ptr = seq_ptr + letter_pos + 1; break; } } if(cur_letter_pos >= 5) { printf("Maximum of four characters for one letter\n"); exit(0); } else { cur_letter.letter[cur_letter_pos] = '\0'; } /* store lead seq and query columns + label */ if(cur_letter.letter[0] != '-') { *(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */ if(label == '-') { label = '.'; } /* only add label to first msa_letter of the first alphabet */ (msa_seq_infop->msa_seq_1 + (*(msa_seq_infop->lead_columns_start + k)) * (hmmp->a_size+1))->label = label; k++; nr_lead_columns++; } if(1 == 1) { memcpy((msa_seq_infop->msa_seq_1 + l * (hmmp->a_size + 1))->query_letter, &(cur_letter.letter), sizeof(char) * 5); /* query letter */ l++; } seq_pos++; } } #ifdef DEBUG_MSA_SEQ_PRF printf("reached checkpoint 2\n"); printf("total_nr_gaps = %d\n", tot_nr_gaps);#endif /* read alphabet 2, save nr of occurences of each letter in each position, * including nr of gaps */ if(hmmp->nr_alphabets > 1) { msa_seq_infop->msa_seq_2 = (struct msa_letter_s*) malloc_or_die(msa_length * (hmmp->a_size_2+1) * sizeof(struct msa_letter_s)); rewind(seqfile); seq_pos = 0; k = 0; l = 0; inside_seq = NO; while(fgets(s, MAX_LINE, seqfile) != NULL) { if(strncmp(s,"END 2",5 ) == 0) { break; } if(strncmp(s,"START 2",7) == 0) { inside_seq = YES; } else if(strncmp(s,"NR of aligned sequences",23) == 0) { nr_seqs = strtol(s + 24, NULL, 10); } else if(strncmp(s,"COL",3) == 0 && inside_seq == YES) { /* split into columns depending on a_size, add '-' */ col_pos = 11; seq_ptr = s + col_pos; for(m = 0; m < hmmp->a_size_2 + 1; m++) { nr_occurences = strtod(seq_ptr, &endptr); if(endptr == seq_ptr) { printf("Error reading column: no frequency was read\n"); exit(0); } else { /* add nr of occurences to datastructure */ seq_ptr = endptr; (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos,m, hmmp->a_size_2+1))->nr_occurences = nr_occurences; } } /* read space */ strtod(seq_ptr, &endptr); seq_ptr = endptr; /* read label */ label_pos = 0; for(label_pos = 0;seq_ptr[label_pos] != '\n';label_pos++) { if(seq_ptr[label_pos] == ' ') { } else { label = seq_ptr[label_pos]; seq_ptr = seq_ptr + label_pos + 1; break; } } /* read query letter */ letter_pos = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -