📄 hmmsearch.c
字号:
if(seq_format == STANDARD || seq_format == FASTA) { nullmodel_score = get_nullmodel_score_multi(seq_info.seqs->seq_1, seq_info.seqs->seq_2, seq_info.seqs->seq_3, seq_info.seqs->seq_4, seq_len, multi_scoring_method); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { nullmodel_score = get_nullmodel_score_msa_multi(seq_len, use_labels, use_prior, use_gap_shares, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); } if(run_viterbi == YES) { logodds = viterbi_score - nullmodel_score; } if(run_forward == YES) { logodds = forward_score - nullmodel_score; } /* print to outfile */ fprintf(outfile, "Logodds: %.4f\n", logodds); } /* get reversi score + print + deallocate */ if(print_reversi == YES) { if(seq_format == STANDARD || seq_format == FASTA) { get_reverse_seq_multi(seq_info.seqs, &reverse_seq, &reverse_seq_2, &reverse_seq_3, &reverse_seq_4, hmmp, seq_len); if(run_forward == YES) { forward_multi(hmmp, reverse_seq, reverse_seq_2, reverse_seq_3, reverse_seq_4, &rev_forw_mtx, &rev_forw_scale, use_labels, multi_scoring_method); rev_forward_score = log10((rev_forw_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob); for (j = seq_len; j > 0; j--) { rev_forward_score = rev_forward_score + log10(*(rev_forw_scale + j)); } } else if(run_viterbi == YES) { viterbi_multi(hmmp, reverse_seq, reverse_seq_2, reverse_seq_3, reverse_seq_4, &rev_viterbi_mtx, use_labels, multi_scoring_method); rev_viterbi_score = (rev_viterbi_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob; if(rev_viterbi_score == DEFAULT) { rev_viterbi_score = log10(0.0); } } } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { get_reverse_msa_seq_multi(msa_seq_infop, &reverse_msa_seq_info, hmmp); if(run_forward == YES) { msa_forward_multi(hmmp, &reverse_msa_seq_info, use_lead_columns, use_gap_shares, use_prior, &rev_forw_mtx, &rev_forw_scale, use_labels, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); rev_forward_score = log10((rev_forw_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob); for (j = seq_len; j > 0; j--) { rev_forward_score = rev_forward_score + log10(*(rev_forw_scale + j)); } } else if(run_viterbi == YES) { msa_viterbi_multi(hmmp, &reverse_msa_seq_info, use_lead_columns, use_gap_shares, use_prior, &rev_viterbi_mtx, use_labels, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); rev_viterbi_score = (rev_viterbi_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v))->prob; if(rev_viterbi_score == DEFAULT) { rev_viterbi_score = log10(0.0); } } } if(run_viterbi == YES) { reversi = viterbi_score - rev_viterbi_score; } if(run_forward == YES) { reversi = forward_score - rev_forward_score; } /* print to outfile */ fprintf(outfile, "Reversi: %.4f\n", reversi); /* deallocate */ if(seq_format == STANDARD || seq_format == FASTA) { free(reverse_seq); if(hmmp->nr_alphabets > 1) { free(reverse_seq_2); } if(hmmp->nr_alphabets > 2) { free(reverse_seq_3); } if(hmmp->nr_alphabets > 3) { free(reverse_seq_4); } } if(seq_format == MSA_STANDARD || seq_format == PROFILE) { free(reverse_msa_seq_info.msa_seq_1); if(hmmp->nr_alphabets > 1) { free(reverse_msa_seq_info.msa_seq_2); } if(hmmp->nr_alphabets > 2) { free(reverse_msa_seq_info.msa_seq_3); } if(hmmp->nr_alphabets > 3) { free(reverse_msa_seq_info.msa_seq_4); } free(reverse_msa_seq_info.gaps); free(reverse_msa_seq_info.gap_shares); free(reverse_msa_seq_info.lead_columns_start); } if(run_forward == YES) { free(rev_forw_mtx); free(rev_forw_scale); } if(run_viterbi == YES) { free(rev_viterbi_mtx); } } /* get labeling + print + deallocate */ if(print_labels == YES) { if(seq_format == STANDARD || seq_format == FASTA) { if(run_viterbi == YES) { a = 0; labels = (char*)malloc_or_die((seq_len + 1) * sizeof(char)); get_viterbi_label_path_multi(viterbi_mtx + get_mtx_index(seq_len+1,hmmp->nr_v-1,hmmp->nr_v), hmmp, viterbi_mtx, seq_len + 1, hmmp->nr_v, labels, &a); labels[seq_len] = '\0'; } } if(seq_format == MSA_STANDARD || seq_format == PROFILE) { if(run_viterbi == YES) { a = 0; labels = (char*)malloc_or_die((seq_len + 1) * sizeof(char)); get_viterbi_label_path_multi(viterbi_mtx + get_mtx_index(seq_len+1, hmmp->nr_v-1, hmmp->nr_v), hmmp, viterbi_mtx, seq_len + 1, hmmp->nr_v, labels, &a); labels[seq_len] = '\0'; } } /* print to outfile */ fprintf(outfile, "Labeling:\n"); j = 0; while(1) { if(labels[j] == '\0') { fprintf(outfile, "\n"); break; } else { fprintf(outfile, "%c", labels[j]); j++; } if(j%60 == 0) { fprintf(outfile, "\n"); } } /* deallocate */ free(labels); } /* get posterior probs + print + deallocate */ if(print_post_probs == YES) { get_post_prob_matrix(&post_prob_matrix, raw_forward_score, forw_mtx, backw_mtx, seq_len); //dump_post_prob_matrix(seq_len + 2, hmmp->nr_v, post_prob_matrix); post_prob_label_matrix = (double*)(malloc_or_die((seq_len+2) * hmmp->nr_labels * sizeof(double))); for(i = 0; i < seq_len + 2; i++) { for(j = 1; j < hmmp->nr_v - 1; j++) { int label_nr = -1; for(k = 0; k < hmmp->nr_labels; k++) { if(hmmp->vertex_labels[j] == hmmp->labels[k]) { label_nr = k; break; } } if(label_nr < 0) { printf("Internal error: strange label\n"); exit(0); } *(post_prob_label_matrix + get_mtx_index(i,label_nr,hmmp->nr_labels)) += *(post_prob_matrix + get_mtx_index(i,j,hmmp->nr_v)); } } //dump_post_prob_matrix(seq_len + 2, hmmp->nr_labels, post_prob_label_matrix); fprintf(outfile, "Posterior probabilities:\n"); fprintf(outfile, "pos\t"); for(i = 0; i < hmmp->nr_labels; i++) { fprintf(outfile, "%c\t", hmmp->labels[i]); } fprintf(outfile, "\n"); for(i = 1; i < seq_len + 1; i++) { fprintf(outfile, "%d\t", i); for(j = 0; j < hmmp->nr_labels; j++) { fprintf(outfile, "%.3f\t", *(post_prob_label_matrix + get_mtx_index(i,j,hmmp->nr_labels))); } fprintf(outfile, "\n"); } fprintf(outfile, "\n"); free(post_prob_label_matrix); free(post_prob_matrix); } /* get path + print + deallocate */ if(print_path == YES) { if(seq_format == STANDARD || seq_format == FASTA) { if(print_path == YES) { path = (int*)malloc_or_die((seq_len+1) * 2 * sizeof(int)); path_incrementable = path; b = 0; get_viterbi_path_multi(viterbi_mtx + get_mtx_index(seq_len+1,hmmp->nr_v-1,hmmp->nr_v), hmmp, viterbi_mtx, seq_len + 1, hmmp->nr_v, path_incrementable, &b); } } if(seq_format == MSA_STANDARD || seq_format == PROFILE) { if(print_path == YES) { path = (int*)malloc_or_die((seq_len+1) * 2 * sizeof(int)); path_incrementable = path; b = 0; get_viterbi_path_multi(viterbi_mtx + get_mtx_index(seq_len+1,hmmp->nr_v-1,hmmp->nr_v), hmmp, viterbi_mtx, seq_len + 1, hmmp->nr_v, path_incrementable, &b); } } /* print to outfile */ fprintf(outfile, "Path:\n"); if(print_path == YES) { for(j = 0; j < seq_len; j++) { if(j % 60 == 0 && j != 0) { fprintf(outfile, "\n"); } fprintf(outfile, "%d ", path[j]); } fprintf(outfile, "\n"); } /* deallocate */ free(path); } fprintf(outfile, "\n"); /* deallocate result matrix info */ if(run_forward == YES) { free(forw_mtx); free(forw_scale); } if(run_backward == YES) { free(backw_mtx); } if(run_viterbi == YES || print_path == YES) { free(viterbi_mtx); } if(run_n_best == YES) { free(one_best_mtx); free(scaling_factors); } }/*********************help functions***************************************/double get_nullmodel_score_multi(struct letter_s *seq, struct letter_s *seq_2, struct letter_s *seq_3, struct letter_s *seq_4, int seq_len, int multi_scoring_method){ int avg_seq_len; double emiss_prob, emiss_prob_2, emiss_prob_3, emiss_prob_4; double e1, e2, e3, e4; double trans_prob; double null_model_score; int letter, letter_2, letter_3, letter_4; int l,i; double t_res; /* calculate null model score for seq */ if(null_model.nr_alphabets < 0) { /* use default null model */ emiss_prob = 1.0 / (double)hmm.a_size; if(hmm.nr_alphabets > 1) { emiss_prob_2 = 1.0 / (double)hmm.a_size_2; } if(hmm.nr_alphabets > 2) { emiss_prob_3 = 1.0 / (double)hmm.a_size_3; } if(hmm.nr_alphabets > 3) { emiss_prob_4 = 1.0 / (double)hmm.a_size_4; } trans_prob = (double)(seq_len)/(double)(seq_len + 1); if(multi_scoring_method == JOINT_PROB) { if(hmm.nr_alphabets == 1) { null_model_score = seq_len * (log10(emiss_prob) + log10(trans_prob)); } else if(hmm.nr_alphabets == 2) { null_model_score = seq_len * (log10(emiss_prob) + log10(emiss_prob_2) + log10(trans_prob)); } else if(hmm.nr_alphabets == 3) { null_model_score = seq_len * (log10(emiss_prob) + log10(emiss_prob_2) + log10(emiss_prob_3) + log10(trans_prob)); } else if(hmm.nr_alphabets == 2) { null_model_score = seq_len * (log10(emiss_prob) + log10(emiss_prob_2) + log10(emiss_prob_3) + log10(emiss_prob_4) + log10(trans_prob)); } } else { /* use other multialpha scoring method, not implemented yet */ printf("Error: only JOINT_PROB scoring is implemented\n"); exit(0); } } else { /* use specified null model */ null_model_score = 0.0; for(l = 0; l < seq_len; l++) { letter = get_alphabet_index(&seq[l], hmm.alphabet, hmm.a_size); if(hmm.nr_alphabets > 1) { letter_2 = get_alphabet_index(&seq_2[l], hmm.alphabet_2, hmm.a_size_2); } if(hmm.nr_alphabets > 2) { letter_3 = get_alphabet_index(&seq_3[l], hmm.alphabet_3, hmm.a_size_3); } if(hmm.nr_alphabets > 3) { letter_4 = get_alphabet_index(&seq_4[l], hmm.alphabet_4, hmm.a_size_4); } if(letter >= 0) { e1 = null_model.emissions[letter]; //null_model_score += log10(null_model.emissions[letter]) + log10(null_model.trans_prob); } else { /* need replacement letters */ letter = get_replacement_letter_index_multi(&seq[l], &replacement_letters, 1); if(letter >= 0) { e1 = 0.0; for(i = 0; i < hmm.a_size; i++) { e1 += *(hmm.replacement_letters->probs_1 + get_mtx_index(letter, i, hmm.a_size)) * null_model.emissions[i]; } //null_model_score += log10(t_res) + log10(null_model.trans_prob); } else { printf("Could not find letter %s when scoring null model\n", &seq[l]); return DEFAULT; } } if(hmm.nr_alphabets > 1) { if(letter_2 >= 0) { e2 = null_model.emissions_2[letter_2]; } else { /* need replacement letters */ letter_2 = get_replacement_letter_index_multi(&seq_2[l], &replacement_letters, 2); if(letter_2 >= 0) { e2 = 0.0; for(i = 0; i < hmm.a_size_2; i++) { e2 += *(hmm.replacement_letters->probs_2 + get_mtx_index(letter_2, i, hmm.a_size_2)) * null_model.emissions_2[i]; } } else { printf("Could not find letter %s when scoring null model\n", &seq_2[l]); return DEFAULT; } } } if(hmm.nr_alphabets > 2) { if(letter_3 >= 0) { e3 = null_model.emissions_3[letter_3]; } else { /* need replacement letters */ letter_3 = get_replacement_letter_index_multi(&seq_3[l], &replacement_letters, 3); if(letter_3 >= 0) { e3 = 0.0; for(i = 0; i < hmm.a_size_3; i++) { e3 += *(hmm.replacement_letters->probs_3 + get_mtx_index(letter_3, i, hmm.a_size_3)) * null_model.emissions_3[i]; } } else { printf("Could not find letter %s when scoring null model\n", &seq_3[l]); return DEFAULT; } } } if(hmm.nr_alphabets > 3) { if(letter_4 >= 0) { e4 = null_model.emissions_4[letter_4]; } else { /* need replacement letters */ letter_4 = get_replacement_letter_index_multi(&seq_4[l], &replacement_letters, 4); if(letter_4 >= 0) { e4 = 0.0; for(i = 0; i < hmm.a_size_4; i++) { e4 += *(hmm.replacement_letters->probs_4 + get_mtx_index(letter_4, i, hmm.a_size_4)) * null_model.emissions_4[i]; } } else { printf("Could not find letter %s when scoring null model\n", &seq[l]); return DEFAULT; } } } if(multi_scoring_method == JOINT_PROB) { null_model_score += log10(e1) + log10(null_model.trans_prob); if(hmm.nr_alphabets > 1) { null_model_score += log10(e2); } if(hmm.nr_alphabets > 2) { null_model_score += log10(e3); } if(hmm.nr_alphabets > 3) { null_model_score += log10(e4); } } else { /* use other multialpha scoring method, not implemented yet */ printf("Error: only JOINT_PROB scoring is implemented\n"); exit(0); } } } return null_model_score;}double get_nullmodel_score_msa_multi(int seq_len, int prf_mode, int use_prior, int use_gap_shares, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4){ int avg_seq_len; double emiss_prob, emiss_prob_2, emiss_prob_3, emiss_prob_4; double trans_prob; double null_model_score; int letter; int k,l,p,m; double col_score, col_score_2, col_score_3, col_score_4; int using_default_null_model; double seq_normalizer, state_normalizer; using_default_null_model = NO; /* calculate null model score for seq */ if(null_model.nr_alphabets < 0) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -