📄 modseqalign.c
字号:
/* read hmm */ /* get hmm from file */ if(hmmfile != NULL) { hmmfiletype = readhmm_check(hmmfile); if(hmmfiletype == SINGLE_HMM) { readhmm(hmmfile, &hmm); } else if(hmmfiletype == MULTI_HMM) { readhmm_multialpha(hmmfile, &hmm); } hmm.subst_mtx = subst_mtxp; hmm.subst_mtx_2 = subst_mtxp_2; hmm.subst_mtx_3 = subst_mtxp_3; hmm.subst_mtx_4 = subst_mtxp_4; } else { /* cannot happen */ printf("Internal error: hmmfile not found, should have been detected earlier\n"); } hmm.replacement_letters = &replacement_letters; /* read template seq */ /* allocate space for msa_seq_info structs */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_seq_infop_template = (struct msa_sequences_multi_s*)(malloc_or_die(1 * sizeof(struct msa_sequences_multi_s))); } else if(seq_format == FASTA || seq_format == STANDARD) { seq_info_template.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * 1); seq_info_template.nr_alphabets = hmm.nr_alphabets; seq_info_template.nr_seqs = 1; seq_info_template.longest_seq = 0; seq_info_template.shortest_seq = INT_MAX; seq_info_template.avg_seq_len = 0; } /* check sequence file for labels + check nolabel flag */ if(seqfile_has_labels(template_seqfile) == YES) { use_labels = YES; } else { use_labels = NO; } if(args_info.nolabels_given) { use_labels = NO; } if(seq_format == FASTA) { if(hmm.nr_alphabets > 1) { printf("fasta is a one alphabet format only\n"); exit(0); } else { get_sequence_fasta_multi(template_seqfile, &seq_info_template, 0); hmm.alphabet_type = DISCRETE; if(use_labels == YES) { get_labels_multi(template_seqfile, &seq_info_template, &hmm, 0); } } } else if(seq_format == STANDARD) { get_sequence_std_multi(template_seqfile, &seq_info_template, &hmm, 0); if(use_labels == YES) { get_labels_multi(template_seqfile, &seq_info_template, &hmm, 0); } } else if(seq_format == MSA_STANDARD) { if(use_lead_columns == YES) { get_sequences_msa_std_multi(template_seqfile, NULL, msa_seq_infop_template, &hmm, lead_seq, &replacement_letters); } else { get_sequences_msa_std_multi(template_seqfile, NULL, msa_seq_infop_template, &hmm, -1, &replacement_letters); } /* get labels */ if(use_labels == YES) { if(use_lead_columns == YES) { get_msa_labels_multi(template_seqfile, msa_seq_infop_template, &hmm); } else { get_msa_labels_all_columns_multi(template_seqfile, msa_seq_infop_template, &hmm); } } } else if(seq_format == PROFILE) { get_sequences_msa_prf_multi(template_seqfile, NULL, msa_seq_infop_template, &hmm); } fclose(template_seqfile); if(seq_format == STANDARD || seq_format == FASTA) { seq_info_template.avg_seq_len = ((int)(seq_info_template.avg_seq_len / seq_info_template.nr_seqs)); //dump_labeled_seqs_multi(&seq_info_template); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { } /* score template seq, get path */ /* get seq_length */ if(seq_format == STANDARD || seq_format == FASTA) { seq_len_template = get_seq_length(seq_info_template.seqs->seq_1); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { if(use_lead_columns == YES) { seq_len_template = msa_seq_infop_template->nr_lead_columns; } else { seq_len_template = msa_seq_infop_template->msa_seq_length; } } if(seq_format == STANDARD || seq_format == FASTA) { viterbi_multi(&hmm, seq_info_template.seqs->seq_1, seq_info_template.seqs->seq_2, seq_info_template.seqs->seq_3, seq_info_template.seqs->seq_4, &viterbi_mtx, use_labels, multi_scoring_method); viterbi_score = (viterbi_mtx + get_mtx_index(seq_len_template+1, hmm.nr_v-1, hmm.nr_v))->prob; if(viterbi_score == DEFAULT) { viterbi_score = log10(0.0); } } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_viterbi_multi(&hmm, msa_seq_infop_template, use_lead_columns, use_gap_shares, use_prior, &viterbi_mtx, use_labels, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); viterbi_score = (viterbi_mtx + get_mtx_index(seq_len_template+1, hmm.nr_v-1, hmm.nr_v))->prob; if(viterbi_score == DEFAULT) { viterbi_score = log10(0.0); } } if(seq_format == STANDARD || seq_format == FASTA) { path_template = (int*)malloc_or_die((seq_len_template+1) * 2 * sizeof(int)); path_incrementable_template = path_template; b = 0; get_viterbi_path_multi(viterbi_mtx + get_mtx_index(seq_len_template+1,hmm.nr_v-1,hmm.nr_v), &hmm, viterbi_mtx, seq_len_template + 1, hmm.nr_v, path_incrementable_template, &b); } if(seq_format == MSA_STANDARD || seq_format == PROFILE) { path_template = (int*)malloc_or_die((seq_len_template+1) * 2 * sizeof(int)); path_incrementable_template = path_template; b = 0; get_viterbi_path_multi(viterbi_mtx + get_mtx_index(seq_len_template+1,hmm.nr_v-1,hmm.nr_v), &hmm, viterbi_mtx, seq_len_template + 1, hmm.nr_v, path_incrementable_template, &b); } /* print*/#ifdef DEBUG_ALIGN printf("Path:\n"); for(k = 0; k < seq_len_template; k++) { if(k % 60 == 0 && k != 0) { printf("\n"); } printf("%d ", path_template[k]); } printf("\n");#endif /* deallocate stuff */ free(viterbi_mtx); /* read seq 2 */ /* allocate space for msa_seq_info structs */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_seq_infop_target = (struct msa_sequences_multi_s*)(malloc_or_die(1 * sizeof(struct msa_sequences_multi_s))); } else if(seq_format == FASTA || seq_format == STANDARD) { seq_info_target.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * 1); seq_info_target.nr_alphabets = hmm.nr_alphabets; seq_info_target.nr_seqs = 1; seq_info_target.longest_seq = 0; seq_info_target.shortest_seq = INT_MAX; seq_info_target.avg_seq_len = 0; } /* check sequence file for labels + check nolabel flag */ if(seqfile_has_labels(target_seqfile) == YES) { use_labels = YES; } else { use_labels = NO; } if(args_info.nolabels_given) { use_labels = NO; } if(seq_format == FASTA) { if(hmm.nr_alphabets > 1) { printf("fasta is a one alphabet format only\n"); exit(0); } else { get_sequence_fasta_multi(target_seqfile, &seq_info_target, 0); hmm.alphabet_type = DISCRETE; if(use_labels == YES) { get_labels_multi(target_seqfile, &seq_info_target, &hmm, 0); } } } else if(seq_format == STANDARD) { get_sequence_std_multi(target_seqfile, &seq_info_target, &hmm, 0); if(use_labels == YES) { get_labels_multi(target_seqfile, &seq_info_target, &hmm, 0); } } else if(seq_format == MSA_STANDARD) { if(use_lead_columns == YES) { get_sequences_msa_std_multi(target_seqfile, NULL, msa_seq_infop_target, &hmm, lead_seq, &replacement_letters); } else { get_sequences_msa_std_multi(target_seqfile, NULL, msa_seq_infop_target, &hmm, -1, &replacement_letters); } /* get labels */ if(use_labels == YES) { if(use_lead_columns == YES) { get_msa_labels_multi(target_seqfile, msa_seq_infop_target, &hmm); } else { get_msa_labels_all_columns_multi(target_seqfile, msa_seq_infop_target, &hmm); } } } else if(seq_format == PROFILE) { get_sequences_msa_prf_multi(target_seqfile, NULL, msa_seq_infop_target, &hmm); } fclose(target_seqfile); if(seq_format == STANDARD || seq_format == FASTA) { seq_info_target.avg_seq_len = ((int)(seq_info_target.avg_seq_len / seq_info_target.nr_seqs)); //dump_labeled_seqs_multi(&seq_info_target); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { } /* score seq 2, get path */ /* get seq_length */ if(seq_format == STANDARD || seq_format == FASTA) { seq_len_target = get_seq_length(seq_info_target.seqs->seq_1); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { if(use_lead_columns == YES) { seq_len_target = msa_seq_infop_target->nr_lead_columns; } else { seq_len_target = msa_seq_infop_target->msa_seq_length; } } if(seq_format == STANDARD || seq_format == FASTA) { viterbi_multi(&hmm, seq_info_target.seqs->seq_1, seq_info_target.seqs->seq_2, seq_info_target.seqs->seq_3, seq_info_target.seqs->seq_4, &viterbi_mtx, use_labels, multi_scoring_method); viterbi_score = (viterbi_mtx + get_mtx_index(seq_len_target+1, hmm.nr_v-1, hmm.nr_v))->prob; if(viterbi_score == DEFAULT) { viterbi_score = log10(0.0); } } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_viterbi_multi(&hmm, msa_seq_infop_target, use_lead_columns, use_gap_shares, use_prior, &viterbi_mtx, use_labels, normalize, scoring_method, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4); viterbi_score = (viterbi_mtx + get_mtx_index(seq_len_target+1, hmm.nr_v-1, hmm.nr_v))->prob; if(viterbi_score == DEFAULT) { viterbi_score = log10(0.0); } } if(seq_format == STANDARD || seq_format == FASTA) { path_target = (int*)malloc_or_die((seq_len_target+1) * 2 * sizeof(int)); path_incrementable_target = path_target; b = 0; get_viterbi_path_multi(viterbi_mtx + get_mtx_index(seq_len_target+1,hmm.nr_v-1,hmm.nr_v), &hmm, viterbi_mtx, seq_len_target + 1, hmm.nr_v, path_incrementable_target, &b); } if(seq_format == MSA_STANDARD || seq_format == PROFILE) { path_target = (int*)malloc_or_die((seq_len_target+1) * 2 * sizeof(int)); path_incrementable_target = path_target; b = 0; get_viterbi_path_multi(viterbi_mtx + get_mtx_index(seq_len_target+1,hmm.nr_v-1,hmm.nr_v), &hmm, viterbi_mtx, seq_len_target + 1, hmm.nr_v, path_incrementable_target, &b); } /* print*/#ifdef DEBUG_ALIGN printf("Path:\n"); for(k = 0; k < seq_len_target; k++) { if(k % 60 == 0 && k != 0) { printf("\n"); } printf("%d ", path_target[k]); } printf("\n");#endif /* deallocate stuff */ free(viterbi_mtx); /* align paths so that maximum number of letters are emitted by the same state */ //get seq names and send along fprintf(outfile, "Aligment based on same-state-emission\n"); fprintf(outfile, "Target file: %s\n", args_info.target_arg); fprintf(outfile, "Template file: %s\n", args_info.template_arg); align_paths(&path_template, &path_target, seq_len_target, seq_len_template, outfile, seq_format); /* garbage collection */ if(replfile != NULL) { if(replacement_letters.nr_rl_1 > 0) { free(replacement_letters.letters_1); free(replacement_letters.probs_1); } if(replacement_letters.nr_rl_2 > 0) { free(replacement_letters.letters_2); free(replacement_letters.probs_2); } if(replacement_letters.nr_rl_3 > 0) { free(replacement_letters.letters_3); free(replacement_letters.probs_3); } if(replacement_letters.nr_rl_4 > 0) { free(replacement_letters.letters_4); free(replacement_letters.probs_4); } fclose(replfile); } /* free allocated sequence memory for msa/profile and single seqs respectively */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -