📄 opt_prfhmm.c
字号:
for(i = 0; i < msa_seq_info.msa_seq_length; i++) { tot_nr_chars = tot_nr_chars + 2; if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'M') { nr_mtx_columns++; } if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'I') { nr_insert_columns++; } } row = (char*)(malloc_or_die(tot_nr_chars * sizeof(char))); nr_mtx_columns++; /* allocate memory for matrices, add one (pseudocount) to each post */ transitions = (int*)malloc_or_die(8 * nr_mtx_columns * sizeof(int)); insert_emissions = (double*)malloc_or_die(nr_mtx_columns * (hmm.a_size+1) * sizeof(double)); for(i = 1; i < 8; i++) { for(j = 0; j < nr_mtx_columns; j++) { *(transitions + get_mtx_index(i,j,nr_mtx_columns)) = 1; } } /* store indices of match columns */ cur_msa_column = 1; cur_mtx_column = 0; for(i = 0; i < msa_seq_info.msa_seq_length; i++) { if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'M') { *(transitions + cur_mtx_column) = cur_msa_column; cur_mtx_column++; cur_msa_column++; } if(((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label == 'I') { cur_msa_column++; } } /* for all rows, add counts to matrix */ rewind(seq_infile); for(cur_alphabet = 1; cur_alphabet <= 1; cur_alphabet++) { alphabet_done = NO; while((fgets(row, tot_nr_chars, seq_infile)) != NULL && alphabet_done == NO) { cur_msa_column = 1; cur_mtx_column = 0; cur_insert_column = 0; last_sign = NEW; cur_sign = NEW; for(i = cur_alphabet;;i++) { /* read symbol */ cur_mtx_col_nr = *(transitions + cur_mtx_column); cur = row[i]; if(row[0] == '/') { break; } if(cur == '<') { alphabet_done = YES; break; } else if(cur == '>') { break; } else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') { last_sign = cur_sign; if(cur_msa_column == cur_mtx_col_nr) /* gap */{ if(last_sign == ACID || last_sign == NEW) { cur_sign = GAP; *(transitions + get_mtx_index(2,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == GAP) { cur_sign = GAP; *(transitions + get_mtx_index(4,cur_mtx_column,nr_mtx_columns)) += 1; } else { } if(is_gap == YES) { is_gap = NO; cur_insert_column++; } cur_msa_column++; cur_mtx_column++; } else if(cur_msa_column < cur_mtx_col_nr) /* insert gap*/{ if(last_sign == ACID || last_sign == NEW) { cur_sign = ACID; } else if(last_sign == INSERT_ACID) { cur_sign = INSERT_ACID; } else if(last_sign == GAP) { cur_sign = GAP; } is_gap = YES; cur_msa_column++; } } else if(cur == ';') { } else /* ACID */{ last_sign = cur_sign; if(cur_msa_column == cur_mtx_col_nr) /* match */{ if(last_sign == ACID || last_sign == NEW) { cur_sign = ACID; *(transitions + get_mtx_index(1,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == GAP) { cur_sign = ACID; *(transitions + get_mtx_index(5,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == INSERT_ACID) { cur_sign = ACID; *(transitions + get_mtx_index(7,cur_mtx_column,nr_mtx_columns)) += 1; } if(is_gap == YES) { is_gap = NO; cur_insert_column++; } cur_msa_column++; cur_mtx_column++; } else if(cur_msa_column < cur_mtx_col_nr) /* insert */{ if(last_sign == ACID || last_sign == NEW) { cur_sign = INSERT_ACID; *(transitions + get_mtx_index(3,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == INSERT_ACID) { cur_sign = INSERT_ACID; *(transitions + get_mtx_index(6,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == GAP) { cur_sign = INSERT_ACID; } is_gap = YES; cur_msa_column++; } } } } } /* check that msa length and nr columns match */ if((nr_mtx_columns-1) * 3 + 6 != hmm.nr_v) { printf("Warning: length of hmm is not equal to length of profile\n"); } else { /* start-transitions */ s_tot = *(transitions + get_mtx_index(1,0,nr_mtx_columns)) + *(transitions + get_mtx_index(2,0,nr_mtx_columns)); sd = *(transitions + get_mtx_index(2,0,nr_mtx_columns)); sm = *(transitions + get_mtx_index(1,0,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(1, 2, hmm.nr_v)) = SI_PROB; *(hmm.transitions + get_mtx_index(1, 3, hmm.nr_v)) = (double)sd / (double)s_tot *(1.0 - SI_PROB); *(hmm.transitions + get_mtx_index(1, 4, hmm.nr_v)) = (double)sm / (double)s_tot *(1.0 - SI_PROB); if(local == YES) { tot_local_im_prob = *(hmm.transitions + get_mtx_index(1, 3, hmm.nr_v)) + *(hmm.transitions + get_mtx_index(1, 4, hmm.nr_v)); local_im_prob = tot_local_im_prob / nr_mtx_columns; local_me_prob = local_im_prob; *(hmm.transitions + get_mtx_index(1, 3, hmm.nr_v)) = 0.0; *(hmm.transitions + get_mtx_index(1, 4, hmm.nr_v)) = local_im_prob * 2; for(i = 2; i < nr_mtx_columns; i++) { *(hmm.transitions + get_mtx_index(1, 3*i + 1, hmm.nr_v)) = local_im_prob; } } /* for all dd-trans and dm-trans, calculate values */ for(i = 1; i < nr_mtx_columns - 1; i++) { d_tot = *(transitions + get_mtx_index(4,i,nr_mtx_columns)) + *(transitions + get_mtx_index(5,i,nr_mtx_columns)); dd = *(transitions + get_mtx_index(4,i,nr_mtx_columns)); dm = *(transitions + get_mtx_index(5,i,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(3 * i, 3 * (i+1), hmm.nr_v)) = (double)dd / (double)d_tot; *(hmm.transitions + get_mtx_index(3 * i, 3 * (i+1) + 1, hmm.nr_v)) = (double)dm / (double)d_tot; } /* for all mm-trans, calculate values */ /* for all md-trans, calculate values */ /* for all mi-trans, calculate values */ for(i = 1; i < nr_mtx_columns - 1; i++) { m_tot = *(transitions + get_mtx_index(1,i,nr_mtx_columns)) + *(transitions + get_mtx_index(2,i,nr_mtx_columns)) + *(transitions + get_mtx_index(3,i,nr_mtx_columns)); mm = *(transitions + get_mtx_index(1,i,nr_mtx_columns)); md = *(transitions + get_mtx_index(2,i,nr_mtx_columns)); mi = *(transitions + get_mtx_index(3,i,nr_mtx_columns)); //printf("mm = %d, md = %d, mi = %d\n", mm, md, mi); *(hmm.transitions + get_mtx_index(3 * i + 1, 3 * (i+1) + 1, hmm.nr_v)) = (double)mm / (double)m_tot * (1.0 - local_me_prob); *(hmm.transitions + get_mtx_index(3 * i + 1, 3 * (i+1), hmm.nr_v)) = (double)md / (double)m_tot * (1.0 - local_me_prob); *(hmm.transitions + get_mtx_index(3 * i + 1, 3 * i + 2, hmm.nr_v)) = (double)mi / (double)m_tot * (1.0 - local_me_prob); // if local, save small prob of exiting the model for all match states if(local == YES) { *(hmm.transitions + get_mtx_index(3 * i + 1, hmm.nr_v - 4, hmm.nr_v)) = local_me_prob; } } /* for end insertions, set default values */ *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 2, (nr_mtx_columns - 1)*3 + 3, hmm.nr_v)) = EI_PROB;; *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 2, (nr_mtx_columns - 1)*3 + 4, hmm.nr_v)) = 1.0 - EI_PROB; /* for all ii-trans, calculate values * for all im-trans, calculate values */ for(i = 1; i < nr_mtx_columns - 1; i++) { i_tot = *(transitions + get_mtx_index(6,i,nr_mtx_columns)) + *(transitions + get_mtx_index(7,i,nr_mtx_columns)); ii = *(transitions + get_mtx_index(6,i,nr_mtx_columns)); im = *(transitions + get_mtx_index(7,i,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(3 * i + 2, 3 * i + 2 , hmm.nr_v)) = (double)ii / (double)i_tot; *(hmm.transitions + get_mtx_index(3 * i + 2, 3 * (i+1) + 1, hmm.nr_v)) = (double)im / (double)i_tot; } /* start and end insertion lengths */ transprob_ii = (double)(INSERT_SIZE) / (double)(INSERT_SIZE + 1); *(hmm.transitions + get_mtx_index(2, 1, hmm.nr_v)) = 1.0 - transprob_ii; *(hmm.transitions + get_mtx_index(2, 2, hmm.nr_v)) = transprob_ii; *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 3, (nr_mtx_columns - 1)*3 + 2, hmm.nr_v)) = 1.0 - transprob_ii; *(hmm.transitions + get_mtx_index((nr_mtx_columns - 1)*3 + 3, (nr_mtx_columns - 1)*3 + 3, hmm.nr_v)) = transprob_ii; } /* cleanup and return */ free(transitions); free(insert_emissions); free(row); return;}void label_msa_fast(){ int i; for(i = 0; i < msa_seq_info.msa_seq_length; i++) { if(*((msa_seq_info.gap_shares) + i) < FAST_MATCH_LIMIT) { ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'M'; } else { ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'I'; } }}void label_msa_query(FILE *seq_infile){ char cur; int i; rewind(seq_infile); cur = (char)fgetc(seq_infile); if(cur != '<') { } i = 0; while(1) { /* read symbol */ cur = (char)fgetc(seq_infile); if(cur == '>') { break; } else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') { ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'I'; } else { ((msa_seq_info.msa_seq_1) + i*(hmm.a_size+1))->label = 'M'; } i++; /* read ';' */ cur = (char)fgetc(seq_infile); if(cur != ';') { } } rewind(seq_infile);}void henikoff_weighting(struct msa_sequences_multi_s *msa_seq_infop, FILE *seq_infile, struct hmm_multi_s *hmmp){ struct sequences_multi_s seq_info; double *m, *m_2, *m_3, *m_4; double *weights, *total_weights, *total_weights_1, *total_weights_2, *total_weights_3, *total_weights_4, *total_weights_all; int i, j, k; int nr_alphabets; struct letter_s *seq; int a_index; double divider; char *alphabet; int alphabet_nr; struct msa_letter_s *msa_seq; int a_size; double weight_sum, *occ_sums; nr_alphabets = hmmp->nr_alphabets; /* read seqfile */ rewind(seq_infile); //get_sequences_std_multi(seq_infile, &seq_info, &hmm); total_weights_1 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double))); total_weights_2 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double))); total_weights_3 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double))); total_weights_4 = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double))); for(i = 0; i < msa_seq_infop->nr_seqs; i++) { *(total_weights_1 + i) = 0.0; *(total_weights_2 + i) = 0.0; *(total_weights_3 + i) = 0.0; *(total_weights_4 + i) = 0.0; } for(alphabet_nr = 1; alphabet_nr <= nr_alphabets; alphabet_nr++) { switch(alphabet_nr) { case 1: a_size = hmmp->a_size; msa_seq = msa_seq_infop->msa_seq_1; alphabet = hmmp->alphabet; total_weights = total_weights_1; break; case 2: a_size = hmmp->a_size_2; msa_seq = msa_seq_infop->msa_seq_2; alphabet = hmmp->alphabet_2; total_weights = total_weights_2; break; case 3: a_size = hmmp->a_size_3; msa_seq = msa_seq_infop->msa_seq_3; alphabet = hmmp->alphabet_3; total_weights = total_weights_3; break; case 4: a_size = hmmp->a_size_4; msa_seq = msa_seq_infop->msa_seq_4; alphabet = hmmp->alphabet_4; total_weights = total_weights_4; break; } /* for all columns, count m_i */ m = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * sizeof(double))); for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { *(m + i) = 0.0; } for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { for(j = 0; j < a_size; j++) { if((msa_seq + get_mtx_index(i,j,a_size+1))->nr_occurences >= 1.0) { *(m + i) += 1.0; } } } /* for all columns, and sequences, set weigth = 1/m_i * nr_of_letter_j */ weights = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * msa_seq_infop->nr_seqs * sizeof(double))); // loop over all sequences, over all positions // get alphabet index for this position, set weight at this postition to 1/(nr_of_occs_of_this_aa_in_this_pos_of_msa * tot_nr_occs) // if alphabet index < 0, set weight = 0.0 (gap or replacement letter) for(i = 0; i < msa_seq_infop->nr_seqs; i++) { if(alphabet_nr == 1) { seq = (seq_info.seqs + i)->seq_1; } else if(alphabet_nr == 2) { seq = (seq_info.seqs + i)->seq_2; } else if(alphabet_nr == 3) { seq = (seq_info.seqs + i)->seq_3; } else if(alphabet_nr == 4) { seq = (seq_info.seqs + i)->seq_4; } j = 0; while((seq->letter)[0] != '\0') { a_index = get_alphabet_index(seq->letter, alphabet, a_size); if(a_index < 0) { *(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)) = 0.0; } else { *(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)) = 1.0 / (*(m + j) * (msa_seq + get_mtx_index(j, a_index, a_size + 1))->nr_occurences); } j++; seq++; } } /* set total sequence weights to average over whole sequence */ for(i = 0; i < msa_seq_infop->nr_seqs; i++) { *(total_weights + i) = 0.0; divider = 0.0; for(j = 0; j < msa_seq_infop->msa_seq_length; j++) { if(*(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)) != 0.0) { divider += 1.0; *(total_weights + i) += *(weights + get_mtx_index(i, j, msa_seq_infop->msa_seq_length)); } } if(divider != 0.0) { *(total_weights + i) = *(total_weights + i) / divider; } } /* normalize weights */ weight_sum = 0.0; for(i = 0; i < msa_seq_infop->nr_seqs; i++) { weight_sum += *(total_weights + i); } for(i = 0; i < msa_seq_infop->nr_seqs; i++) { *(total_weights + i) = *(total_weights + i) / weight_sum; } //dump_weights(weights, msa_seq_info.nr_seqs * msa_seq_info.msa_seq_length); //dump_weights(total_weights, msa_seq_info.nr_seqs); free(m); free(weights); } /* get average weight over all alphabets + normalize with number of sequences */ total_weights_all = (double*)(malloc_or_die(msa_seq_infop->nr_seqs * sizeof(double))); for(i = 0; i < msa_seq_infop->nr_seqs; i++) { *(total_weights_all + i) = 0.0; *(total_weights_all + i) += *(total_weights_1 + i); *(total_weights_all + i) += *(total_weights_2 + i); *(total_weights_all + i) += *(total_weights_3 + i); *(total_weights_all + i) += *(total_weights_4 + i); *(total_weights_all + i) = *(total_weights_all + i)/(double)(nr_alphabets); *(total_weights_all + i) = *(total_weights_all + i) * (double)(msa_seq_infop->nr_seqs); } /* insert weighted shares and nr_occs in msa_seq_info struct */ for(alphabet_nr = 1; alphabet_nr <= nr_alphabets; alphabet_nr++) { switch(alphabet_nr) { case 1: a_size = hmmp->a_size; msa_seq = msa_seq_infop->msa_seq_1; alphabet = hmmp->alphabet; break; case 2: a_size = hmmp->a_size_2; msa_seq = msa_seq_infop->msa_seq_2; alphabet = hmmp->alphabet_2; break; case 3: a_size = hmmp->a_size_3; msa_seq = msa_seq_infop->msa_seq_3; alphabet = hmmp->alphabet_3; break; case 4: a_size = hmmp->a_size_4; msa_seq = msa_seq_infop->msa_seq_4; alphabet = hmmp->alphabet_4; break; } for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { for(j = 0; j < a_size + 1; j++) { (msa_seq + get_mtx_index(i,j,a_size + 1))->share = 0.0; (msa_seq + get_mtx_index(i,j,a_size + 1))->prior_share = 0.0; (msa_seq + get_mtx_index(i,j,a_size + 1))->nr_occurences = 0.0; } } occ_sums = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * sizeof(double))); for(i = 0; i < msa_seq_infop->nr_seqs; i++) { if(alphabet_nr == 1) { seq = (seq_info.seqs + i)->seq_1; } else if(alphabet_nr == 2) { seq = (seq_info.seqs + i)->seq_2; } else if(alphabet_nr == 3) { seq = (seq_info.seqs + i)->seq_3; } else if(alphabet_nr == 4) { seq = (seq_info.seqs + i)->seq_4; } j = 0; while((seq->letter)[0] != '\0') { a_index = get_alphabet_index(seq->letter, alphabet, a_size); if(a_index < 0) { if(seq->letter[0] == '-' || seq->letter[0] == ' ' || seq->letter[0] == '_' || seq->letter[0] == '.') { (msa_seq + get_mtx_index(j, a_size, a_size + 1))->nr_occurences += 1.0 * *(total_weights_all + i); *(occ_sums + j) += 1.0 * *(total_weights_all + i); } else { for(k = 0; k < hmm.a_size; k++) { (msa_seq + get_mtx_index(j, k, a_size + 1))->nr_occurences += 1.0/(double)(a_size) * *(total_weights_all + i); *(occ_sums + j) += 1.0 * *(total_weights_all + i); } } } else { (msa_seq + get_mtx_index(j, a_index, a_size + 1))->nr_occurences += 1.0 * *(total_weights_all + i); *(occ_sums + j) += 1.0 * *(total_weights_all + i); } j++; seq++; } } for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { for(j = 0; j < a_size + 1; j++) { if(*(occ_sums + i) != 0.0) { (msa_seq + get_mtx_index(i, j, a_size + 1))->share = (msa_seq + get_mtx_index(i, j, a_size + 1))->nr_occurences / *(occ_sums + i); } } } free(occ_sums); } free(seq_info.seqs); free(total_weights_1); free(total_weights_2); free(total_weights_3); free(total_weights_4); free(total_weights_all); //dump_msa_seqs_multi(&msa_seq_info, &hmm); //exit(0);}void printhelp(){ printhelp_opt();}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -