📄 std_funcs.c
字号:
/* get sequence data */ j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->msa_seq + (i * (a_size + 1)), msa_seq_infop->msa_seq + (j * (a_size + 1)), sizeof(struct msa_letter_s) * (a_size+1)); j++; } /* get gap data (not implemented) */ for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { *(reverse_msa_seq_infop->gaps + (msa_seq_infop->msa_seq_length + i)) = END; } for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { *(reverse_msa_seq_infop->gaps + i) = (int*)(reverse_msa_seq_infop->gaps + (msa_seq_infop->msa_seq_length + i)); } /* get gap shares */ j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->gap_shares + i, msa_seq_infop->gap_shares + j, 1 * sizeof(double)); j++; } /* get lead_columns */ j = 0; for(i = msa_seq_infop->nr_lead_columns - 1; i >= 0; i--) { *(reverse_msa_seq_infop->lead_columns_start + i) = (msa_seq_infop->msa_seq_length - 1) - *(msa_seq_infop->lead_columns_start + j); j++; } *(reverse_msa_seq_infop->lead_columns_start + msa_seq_infop->nr_lead_columns) = END;#ifdef DEBUG_REVERSE_SEQ dump_msa_seqs(msa_seq_infop, a_size); dump_msa_seqs(reverse_msa_seq_infop, a_size);#endif}void get_reverse_msa_seq_multi(struct msa_sequences_multi_s *msa_seq_infop, struct msa_sequences_multi_s *reverse_msa_seq_infop, struct hmm_multi_s *hmmp){ /* note that this function does not implement the gaps-pointing, everything points to END */ int i,j; int nr_alphabets; int a_size, a_size_2, a_size_3, a_size_4; nr_alphabets = hmmp->nr_alphabets; a_size = hmmp->a_size; a_size_2 = hmmp->a_size_2; a_size_3 = hmmp->a_size_3; a_size_4 = hmmp->a_size_4; reverse_msa_seq_infop->nr_seqs = msa_seq_infop->nr_seqs; reverse_msa_seq_infop->msa_seq_length = msa_seq_infop->msa_seq_length; reverse_msa_seq_infop->nr_lead_columns = msa_seq_infop->nr_lead_columns; reverse_msa_seq_infop->msa_seq_1 = (struct msa_letter_s*)(malloc_or_die(msa_seq_infop->msa_seq_length * (a_size + 1) * sizeof(struct msa_letter_s))); if(nr_alphabets > 1) { reverse_msa_seq_infop->msa_seq_2 = (struct msa_letter_s*)(malloc_or_die(msa_seq_infop->msa_seq_length * (a_size_2 + 1) * sizeof(struct msa_letter_s))); } if(nr_alphabets > 2) { reverse_msa_seq_infop->msa_seq_3 = (struct msa_letter_s*)(malloc_or_die(msa_seq_infop->msa_seq_length * (a_size_3 + 1) * sizeof(struct msa_letter_s))); } if(nr_alphabets > 3) { reverse_msa_seq_infop->msa_seq_4 = (struct msa_letter_s*)(malloc_or_die(msa_seq_infop->msa_seq_length * (a_size_4 + 1) * sizeof(struct msa_letter_s))); } reverse_msa_seq_infop->gaps = (int**)(malloc_or_die(msa_seq_infop->msa_seq_length * sizeof(int*) + msa_seq_infop->msa_seq_length * sizeof(int))); reverse_msa_seq_infop->lead_columns_start = (int*)(malloc_or_die((msa_seq_infop->nr_lead_columns +1)* sizeof(int))); reverse_msa_seq_infop->lead_columns_end = reverse_msa_seq_infop->lead_columns_start + (msa_seq_infop->nr_lead_columns); reverse_msa_seq_infop->gap_shares = (double*)(malloc_or_die(msa_seq_infop->msa_seq_length * sizeof(double))); /* get sequence data */ j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->msa_seq_1 + (i * (a_size + 1)), msa_seq_infop->msa_seq_1 + (j * (a_size + 1)), sizeof(struct msa_letter_s) * (a_size+1)); j++; } if(nr_alphabets > 1) { j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->msa_seq_2 + (i * (a_size_2 + 1)), msa_seq_infop->msa_seq_2 + (j * (a_size_2 + 1)), sizeof(struct msa_letter_s) * (a_size_2+1)); j++; } } if(nr_alphabets > 2) { j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->msa_seq_3 + (i * (a_size_3 + 1)), msa_seq_infop->msa_seq_3 + (j * (a_size_3 + 1)), sizeof(struct msa_letter_s) * (a_size_3+1)); j++; } } if(nr_alphabets > 3) { j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->msa_seq_4 + (i * (a_size_4 + 1)), msa_seq_infop->msa_seq_4 + (j * (a_size_4 + 1)), sizeof(struct msa_letter_s) * (a_size_4+1)); j++; } } /* get gap data (not implemented) */ for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { *(reverse_msa_seq_infop->gaps + (msa_seq_infop->msa_seq_length + i)) = END; } for(i = 0; i < msa_seq_infop->msa_seq_length; i++) { *(reverse_msa_seq_infop->gaps + i) = (int*)(reverse_msa_seq_infop->gaps + (msa_seq_infop->msa_seq_length + i)); } /* get gap shares */ j = 0; for(i = msa_seq_infop->msa_seq_length - 1; i >= 0; i--) { memcpy(reverse_msa_seq_infop->gap_shares + i, msa_seq_infop->gap_shares + j, 1 * sizeof(double)); j++; } /* get lead_columns */ j = 0; for(i = msa_seq_infop->nr_lead_columns - 1; i >= 0; i--) { *(reverse_msa_seq_infop->lead_columns_start + i) = (msa_seq_infop->msa_seq_length - 1) - *(msa_seq_infop->lead_columns_start + j); j++; } *(reverse_msa_seq_infop->lead_columns_start + msa_seq_infop->nr_lead_columns) = END;#ifdef DEBUG_REVERSE_SEQ dump_msa_seqs(msa_seq_infop, a_size); dump_msa_seqs(reverse_msa_seq_infop, a_size);#endif}void get_msa_labels(FILE *labelfile, struct msa_sequences_s *msa_seq_infop, struct hmm_s *hmmp) { char row[30000]; int i,j; rewind(labelfile); while(1) { if(fgets(row,30000,labelfile) != NULL) { if(row[0] == '/') { for(i = 1; row[i] != '/';i++) { (msa_seq_infop->msa_seq + (*(msa_seq_infop->lead_columns_start + i-1)) * (hmmp->a_size+1))->label = row[i]; } } } else { break; } }}void get_msa_labels_all_columns(FILE *labelfile, struct msa_sequences_s *msa_seq_infop, struct hmm_s *hmmp) { char row[30000]; int i,j; rewind(labelfile); while(1) { if(fgets(row,30000,labelfile) != NULL) { if(row[0] == '/') { for(i = 1; row[i] != '/';i++) { (msa_seq_infop->msa_seq + (i-1) * (hmmp->a_size+1))->label = row[i]; } } } else { break; } }}int update_shares_prior(struct emission_dirichlet_s *em_di, struct hmm_s *hmmp, struct msa_sequences_s *msa_seq_infop, int l){ int nr_components, comps, a_index; double occ_sums; double q_value, scaling_factor, X_sum, *X_values, ed_res1, *logbeta_an_values; double exponent, prior_prob, tot_prior_prob; /************the update part ********************/ nr_components = em_di->nr_components; logbeta_an_values = malloc_or_die(nr_components * sizeof(double)); scaling_factor = -FLT_MAX; X_sum = 0.0; X_values = malloc_or_die(hmmp->a_size * sizeof(double)); /* calculate logB(alpha + n) for all components + * calculate scaling factor for logB(alpha + n) - logB(alpha) */ for(comps = 0; comps < nr_components; comps++) { ed_res1 = 0; occ_sums = 0; for(a_index = 0; a_index < hmmp->a_size; a_index++) { ed_res1 += lgamma(*(em_di->prior_values + get_mtx_index(comps, a_index, hmmp->a_size)) + (double)((msa_seq_infop->msa_seq + get_mtx_index(l,a_index,hmmp->a_size+1))->nr_occurences)); occ_sums += (msa_seq_infop->msa_seq + get_mtx_index(l,a_index, hmmp->a_size+1))->nr_occurences; } ed_res1 = ed_res1 - lgamma(*(em_di->alpha_sums + comps) + (double)(occ_sums)); *(logbeta_an_values + comps) = ed_res1; if((ed_res1 = ed_res1 - *(em_di->logbeta_values + comps)) > scaling_factor) { scaling_factor = ed_res1; } } /* calculate all the Xi's */ for(a_index = 0; a_index < hmmp->a_size; a_index++) { *(X_values + a_index) = 0; for(comps = 0; comps < nr_components; comps++) { q_value = *(em_di->q_values + comps); exponent = (*(logbeta_an_values + comps) - *(em_di->logbeta_values + comps) - scaling_factor); prior_prob = (*(em_di->prior_values + get_mtx_index(comps,a_index, hmmp->a_size)) + (double)((msa_seq_infop->msa_seq + get_mtx_index(l,a_index,hmmp->a_size+1))->nr_occurences)); tot_prior_prob = (*(em_di->alpha_sums + comps) + (double)(occ_sums)); *(X_values + a_index) += q_value * exp(exponent) * prior_prob / tot_prior_prob;#ifdef DEBUG_PRI printf("\nscaling factor = %f\n", scaling_factor); printf("a_index = %d\n", a_index); printf("q_value = %f\n", q_value); printf("exponent = %f\n", exponent); printf("prior_prob = %f\n", prior_prob); printf("tot_prior_prob = %f\n\n", tot_prior_prob);#endif } X_sum += *(X_values + a_index); } /* update share values */ for(a_index = 0; a_index < hmmp->a_size; a_index++) { ed_res1 = *(X_values + a_index) / X_sum; if(ed_res1 != 0.0) { (msa_seq_infop->msa_seq + get_mtx_index(l, a_index, hmmp->a_size+1))->prior_share = ed_res1; } else { (msa_seq_infop->msa_seq + get_mtx_index(l, a_index, hmmp->a_size+1))->prior_share = ed_res1; } } /* cleanup */ free(logbeta_an_values); free(X_values);}int replacement_letter(struct letter_s *cur_letterp, struct replacement_letter_s *replacement_letters, struct msa_sequences_s *msa_seq_infop, struct hmm_s *hmmp, int seq_pos){ int i,j,k; struct letter_s *repl_letter; int same_letter; /* find out if letter in cur_letterp is a replacement_letter */ for(i = 0; i < replacement_letters->nr_rl; i++) { repl_letter = replacement_letters->letters + i; same_letter = YES; j = 0; while(*(repl_letter->letter + j) != '\0' && *(cur_letterp->letter + j) != '\0') { if(*(repl_letter->letter + j) == *(cur_letterp->letter + j)) { } else { same_letter = NO; } j++; } if(*(repl_letter->letter + j) != '\0' || *(cur_letterp->letter + j) != '\0') { same_letter = NO; } else if(same_letter == YES) { break; } } if(same_letter == NO) { return NO; } else { /* k represents the regular letter, i represents which repl_letter this is */ for(k = 0; k < hmmp->a_size; k++) { (msa_seq_infop->msa_seq + get_mtx_index(seq_pos,k, hmmp->a_size+1))->nr_occurences += *(replacement_letters->probs + get_mtx_index(i,k,hmmp->a_size)); } return YES; } }void get_labels_multi(FILE *labelfile, struct sequences_multi_s *seq_infop, struct hmm_multi_s *hmmp, int seq_nr) { char row[30000]; int i,j; rewind(labelfile); while(1) { if(fgets(row,30000,labelfile) != NULL) { if(row[0] == '/') { for(i = 1; row[i] != '/';i++) { ((seq_infop->seqs + seq_nr)->seq_1 + (i-1))->label = row[i]; if(hmmp->nr_alphabets > 1) { ((seq_infop->seqs + seq_nr)->seq_2 + (i-1))->label = row[i]; } if(hmmp->nr_alphabets > 2) { ((seq_infop->seqs + seq_nr)->seq_3 + (i-1))->label = row[i]; } if(hmmp->nr_alphabets > 3) { ((seq_infop->seqs + seq_nr)->seq_4 + (i-1))->label = row[i]; } } } } else { break; } } rewind(labelfile);}void get_msa_labels_multi(FILE *labelfile, struct msa_sequences_multi_s *msa_seq_infop, struct hmm_multi_s *hmmp) { char row[30000]; int i,j; rewind(labelfile); while(1) { if(fgets(row,30000,labelfile) != NULL) { if(row[0] == '/') { for(i = 1; row[i] != '/';i++) { (msa_seq_infop->msa_seq_1 + (*(msa_seq_infop->lead_columns_start + i-1)) * (hmmp->a_size+1))->label = row[i]; if(hmmp->nr_alphabets > 1) { (msa_seq_infop->msa_seq_2 + (*(msa_seq_infop->lead_columns_start + i-1)) * (hmmp->a_size_2+1))->label = row[i]; } if(hmmp->nr_alphabets > 2) { (msa_seq_infop->msa_seq_3 + (*(msa_seq_infop->lead_columns_start + i-1)) * (hmmp->a_size_3+1))->label = row[i]; } if(hmmp->nr_alphabets > 3) { (msa_seq_infop->msa_seq_4 + (*(msa_seq_infop->lead_columns_start + i-1)) * (hmmp->a_size_4+1))->label = row[i]; } } } } else { break; } }}void get_msa_labels_all_columns_multi(FILE *labelfile, struct msa_sequences_multi_s *msa_seq_infop, struct hmm_multi_s *hmmp) { char row[30000]; int i,j; rewind(labelfile); while(1) { if(fgets(row,30000,labelfile) != NULL) { if(row[0] == '/') { for(i = 1; row[i] != '/';i++) { (msa_seq_infop->msa_seq_1 + (i-1) * (hmmp->a_size+1))->label = row[i]; if(hmmp->nr_alphabets > 1) { (msa_seq_infop->msa_seq_2 + (i-1) * (hmmp->a_size_2+1))->label = row[i]; } if(hmmp->nr_alphabets > 2) { (msa_seq_infop->msa_seq_3 + (i-1) * (hmmp->a_size_3+1))->label = row[i]; } if(hmmp->nr_alphabets > 3) { (msa_seq_infop->msa_seq_4 + (i-1) * (hmmp->a_size_4+1))->label = row[i]; } } } } else { break; } }}int update_shares_prior_multi(struct emission_dirichlet_s *em_di, struct hmm_multi_s *hmmp, struct msa_sequences_multi_s *msa_seq_infop, int l, int alphabet){ int nr_components, comps, a_index; double occ_sums; double q_value, scaling_factor, X_sum, *X_values, ed_res1, *logbeta_an_values; double exponent, prior_prob, tot_prior_prob; int a_size; struct msa_letter_s *msa_seq; /* check if this alphabet has a prior */ if(em_di->nr_components <= 0) { //printf("em_di nr comps = %d\n",em_di->nr_components); //printf("alphabet = %d\n", alphabet); return; } /* set a_size and msa_seq according to alphabet */ if(alphabet == 1) { a_size = hmmp->a_size; msa_seq = msa_seq_infop->msa_seq_1; } if(alphabet == 2) { a_size = hmmp->a_size_2; msa_seq = msa_seq_infop->msa_seq_2; } if(alphabet == 3) { a_size = hmmp->a_size_3; msa_seq = msa_seq_infop->msa_seq_3; } if(alphabet == 4) { a_size = hmmp->a_size_4; msa_seq = msa_seq_infop->msa_seq_4; } /************the update part ********************/ nr_components = em_di->nr_components; logbeta_an_values = malloc_or_die(nr_components * sizeof(double)); scaling_factor = -FLT_MAX; X_sum = 0.0; X_values = malloc_or_die(a_size * sizeof(double)); /* calculate logB(alpha + n) for all components + * calculate scaling factor for logB(alpha + n) - logB(alpha) */ for(comps = 0; comps < nr_components; comps++) { ed_res1 = 0; occ_sums = 0; for(a_index = 0; a_index < a_size; a_index++) { ed_res1 += lgamma(*(em_di->prior_values + get_mtx_index(comps, a_index, a_size)) + (double)((msa_seq + get_mtx_index(l,a_index,a_size+1))->nr_occurences)); occ_sums += (msa_seq + get_mtx_index(l,a_index, a_size+1))->nr_occurences; } ed_res1 = ed_res1 - lgamma(*(em_di->alpha_sums + comps) + (double)(occ_sums)); *(logbeta_an_values + comps) = ed_res1; if((ed_res1 = ed_res1 - *(em_di->logbeta_values + comps)) > scaling_factor) { scaling_factor = ed_res1; } } /* calculate all the Xi's */ for(a_index = 0; a_index < a_size; a_index++) { *(X_values + a_index) = 0; for(comps = 0; comps < nr_components; comps++) { q_value = *(em_di->q_values + comps); exponent = (*(logbeta_an_values + comps) - *(em_di->logbeta_values + comps) - scaling_factor); prior_prob = (*(em_di->prior_values + get_mtx_index(comps,a_index, a_size)) + (double)((msa_seq + get_mtx_index(l,a_index,a_size+1))->nr_occurences)); tot_prior_prob = (*(em_di->alpha_sums + comps) + (double)(occ_sums)); *(X_values + a_index) += q_value * exp(exponent) * prior_prob / tot_prior_prob;#ifdef DEBUG_PRI printf("\nscaling factor = %f\n", scaling_factor); printf("a_index = %d\n", a_index); printf("q_value = %f\n", q_value); printf("exponent = %f\n", exponent); printf("prior_prob = %f\n", prior_prob); printf("tot_prior_prob = %f\n\n", tot_prior_prob);#endif } X_sum += *(X_values + a_index); } /* update share values */ //printf("a_size = %d\n",a_size); for(a_index = 0; a_index < a_size; a_index++) { ed_res1 = *(X_values + a_index) / X_sum; if(ed_res1 != 0.0) { (msa_seq + get_mtx_index(l, a_index, a_size+1))->prior_share = ed_res1; } else { (msa_seq + get_mtx_index(l, a_index, a_size+1))->prior_share = ed_res1; } //printf("msa_seq = %f\n",(msa_seq + get_mtx_index(l, a_index, a_size+1))->prior_share); } //printf("returning\n"); /* cleanup */ free(logbeta_an_values); free(X_values);}int replacement_letter_multi(struct letter_s *cur_letterp, struct replacement_letter_multi_s *replacement_letters, struct msa_sequences_multi_s *msa_seq_infop, struct hmm_multi_s *hmmp, int seq_pos, int alphabet){ int i,j,k; struct letter_s *repl_letter; int same_letter; /* find out if letter in cur_letterp is a replacement_letter */ if(alphabet == 1) { if(replacement_letters->nr_rl_1 <= 0) { return NO; } for(i = 0; i < replacement_letters->nr_rl_1; i++) { repl_letter = replacement_letters->letters_1 + i; same_le
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -