📄 hmmsearch.c
字号:
if((scoring_method == SUBST_MTX_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT_PRIOR) && read_subst_mtx == NO) { printf("Error: No substitution matrix supplied\n"); exit(0); } /* read null model */ get_null_model_multi(nullfile); if(query == SEQ) { /* get hmms and score all seqs against them */ while(fgets(hmm_name, MAX_LINE, hmmnamefile) != NULL) { /* get hmm from file */ hmm_name[strlen(hmm_name)-1] = '\0'; if((hmmfile = fopen(hmm_name, "r")) == NULL) { perror(hmm_name); continue; } hmmfiletype = readhmm_check(hmmfile); if(hmmfiletype == SINGLE_HMM) { readhmm(hmmfile, &hmm); } else if(hmmfiletype == MULTI_HMM) { readhmm_multialpha(hmmfile, &hmm); } hmm.subst_mtx = subst_mtxp; hmm.subst_mtx_2 = subst_mtxp_2; hmm.subst_mtx_3 = subst_mtxp_3; hmm.subst_mtx_4 = subst_mtxp_4; hmm.replacement_letters = &replacement_letters; /* open outfile + print init info */ if((pure_hmm_name_p = strrchr(hmm_name, '/')) != NULL) { strcpy(pure_hmm_name_a, pure_hmm_name_p+1); } else { strcpy(pure_hmm_name_a, hmm_name); } strcpy(cur_savefile_name, outfilepath_name); if(strlen(cur_savefile_name) != 0 && cur_savefile_name[strlen(cur_savefile_name) - 1] != '/') { strcat(cur_savefile_name, "/"); } strcat(cur_savefile_name, pure_hmm_name_a); strcat(cur_savefile_name, ".res"); if((outfile = fopen(cur_savefile_name, "w")) == NULL) { perror(cur_savefile_name); continue; } else { fprintf(outfile, "# Scores for HMM: '%s'\n\n", pure_hmm_name_a); } /* rewind file with the sequence names */ rewind(seqnamefile); /* loop over the sequences */ while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) { /* open seqfile */ seq_name[strlen(seq_name)-1] = '\0'; if((seqfile = fopen(seq_name, "r")) == NULL) { perror(seq_name); continue; } /* check sequence file for labels + check nolabel flag */ if(seqfile_has_labels(seqfile) == YES) { use_labels = YES; } else { use_labels = NO; } if(args_info.nolabels_given) { use_labels = NO; } /* allocate space for msa_seq_info structs */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_seq_infop = (struct msa_sequences_multi_s*)(malloc_or_die(1 * sizeof(struct msa_sequences_multi_s))); } else if(seq_format == FASTA || seq_format == STANDARD) { seq_info.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * 1); seq_info.nr_alphabets = hmm.nr_alphabets; seq_info.nr_seqs = 1; seq_info.longest_seq = 0; seq_info.shortest_seq = INT_MAX; seq_info.avg_seq_len = 0; } if(seq_format == FASTA) { if(hmm.nr_alphabets > 1) { printf("Error: fasta is a one alphabet format only\n"); exit(0); } else { get_sequence_fasta_multi(seqfile, &seq_info, 0); hmm.alphabet_type = DISCRETE; if(use_labels == YES) { get_labels_multi(seqfile, &seq_info, &hmm, 0); } } } else if(seq_format == STANDARD) { get_sequence_std_multi(seqfile, &seq_info, &hmm, 0); if(use_labels == YES) { get_labels_multi(seqfile, &seq_info, &hmm, 0); } } else if(seq_format == MSA_STANDARD) { if(prf_mode == LEAD_SEQ) { get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm, lead_seq, &replacement_letters); } else { get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm, -1, &replacement_letters); } /* get labels */ if(use_labels == YES) { if(prf_mode == LEAD_SEQ) { get_msa_labels_multi(seqfile, msa_seq_infop, &hmm); } else if(prf_mode == ALL_SEQS) { get_msa_labels_all_columns_multi(seqfile, msa_seq_infop, &hmm); } else { printf("Internal error: strange prf_mode value\n"); exit(0); } } } else if(seq_format == PROFILE) { get_sequences_msa_prf_multi(seqfile, priorfile, msa_seq_infop, &hmm); } if(seq_format == STANDARD || seq_format == FASTA) { seq_info.avg_seq_len = ((int)(seq_info.avg_seq_len / seq_info.nr_seqs)); //dump_labeled_seqs_multi(&seq_info); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { //dump_msa_seqs_multi(msa_seq_infop, &hmm); //exit(0); } fclose(seqfile); /* print seq info */ if(verbose == YES) { printf("Running seq %s on %s\n", seq_name, hmm_name); } /* retrain model if run_max_d option is given */ if(run_max_d == YES) { if(verbose == YES) { printf("retraining ...\n"); } if(seq_format == STANDARD || seq_format == FASTA) { copy_hmm_struct(&hmm, &retrain_hmm); baum_welch_dirichlet_multi(&retrain_hmm, seq_info.seqs, 1, NO, use_labels, NO, NO, multi_scoring_method, NO); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { if(prf_mode == ALL_SEQS) { use_lead_columns = NO; } else if(prf_mode == LEAD_SEQ) { use_lead_columns = YES; } copy_hmm_struct(&hmm, &retrain_hmm); msa_baum_welch_dirichlet_multi(&retrain_hmm, msa_seq_infop, 1, NO, use_gap_shares, use_lead_columns, use_labels, NO, NO, normalize, scoring_method, NO, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4, NO); } if(verbose == YES) { printf("retraining done\n"); } } if((pure_seq_name_p = strrchr(seq_name, '/')) != NULL) { strcpy(pure_seq_name_a, pure_seq_name_p+1); } else { strcpy(pure_seq_name_a, seq_name); } fprintf(outfile, "%s\n", pure_seq_name_a); /* get score info for this seq and print it to outfile */ if(verbose == YES) { printf("scoring ...\n"); } get_scores(run_max_d, run_forward, run_backward, run_viterbi, run_n_best, seq_format, prf_mode, use_gap_shares, use_prior, use_labels, normalize, scoring_method, multi_scoring_method, print_labels, print_post_probs, print_log_like, print_log_odds, print_reversi, print_path, outfile); if(verbose == YES) { printf("scoring done\n"); } /* deallocate retrain_hmm */ if(run_max_d == YES) { hmm_garbage_collection_multi_no_dirichlet(NULL, &retrain_hmm); } /* deallocate seqinfo */ if(seq_format == STANDARD || seq_format == FASTA) { free(((seq_info.seqs))->seq_1); if(hmm.nr_alphabets > 1) { free((seq_info.seqs)->seq_2); } if(hmm.nr_alphabets > 2) { free((seq_info.seqs)->seq_3); } if(hmm.nr_alphabets > 3) { free((seq_info.seqs)->seq_4); } free(seq_info.seqs); } if(seq_format == MSA_STANDARD || seq_format == PROFILE) { free((*(msa_seq_infop)).msa_seq_1); if(hmm.nr_alphabets > 1) { free((*(msa_seq_infop)).msa_seq_2); } if(hmm.nr_alphabets > 2) { free((*(msa_seq_infop)).msa_seq_3); } if(hmm.nr_alphabets > 3) { free((*(msa_seq_infop)).msa_seq_4); } free((*(msa_seq_infop)).gaps); } free(msa_seq_infop); } /* deallocate hmm_info */ hmm_garbage_collection_multi(hmmfile, &hmm); fclose(outfile); } } else if(query == HMM) { /* for each sequence * score it on all hmms and print results * done as if query = seq, but results are saved differently */ /* print init info in outfiles */ rewind(seqnamefile); while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) { /* open outfile + print init info */ seq_name[strlen(seq_name)-1] = '\0'; if((pure_seq_name_p = strrchr(seq_name, '/')) != NULL) { strcpy(pure_seq_name_a, pure_seq_name_p+1); } else { strcpy(pure_seq_name_a, seq_name); } strcpy(cur_savefile_name, outfilepath_name); if(strlen(cur_savefile_name) != 0 && cur_savefile_name[strlen(cur_savefile_name) - 1] != '/') { strcat(cur_savefile_name, "/"); } strcat(cur_savefile_name, pure_seq_name_a); strcat(cur_savefile_name, ".res"); if((outfile = fopen(cur_savefile_name, "w")) == NULL) { perror(cur_savefile_name); continue; } else { fprintf(outfile, "# Scores for sequence: '%s'\n\n", pure_seq_name_a); fclose(outfile); } } while(fgets(hmm_name, MAX_LINE, hmmnamefile) != NULL) { /* get hmm from file */ hmm_name[strlen(hmm_name)-1] = '\0'; if((hmmfile = fopen(hmm_name, "r")) == NULL) { perror(hmm_name); continue; } hmmfiletype = readhmm_check(hmmfile); if(hmmfiletype == SINGLE_HMM) { readhmm(hmmfile, &hmm); } else if(hmmfiletype == MULTI_HMM) { readhmm_multialpha(hmmfile, &hmm); } hmm.subst_mtx = subst_mtxp; hmm.subst_mtx_2 = subst_mtxp_2; hmm.subst_mtx_3 = subst_mtxp_3; hmm.subst_mtx_4 = subst_mtxp_4; hmm.replacement_letters = &replacement_letters; if((pure_hmm_name_p = strrchr(hmm_name, '/')) != NULL) { strcpy(pure_hmm_name_a, pure_hmm_name_p+1); } else { strcpy(pure_hmm_name_a, hmm_name); } /* rewind file with the sequence names */ rewind(seqnamefile); /* loop over the sequences */ while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) { /* open seqfile */ seq_name[strlen(seq_name)-1] = '\0'; if((seqfile = fopen(seq_name, "r")) == NULL) { perror(seq_name); continue; } /* open outfile + print init info */ if((pure_seq_name_p = strrchr(seq_name, '/')) != NULL) { strcpy(pure_seq_name_a, pure_seq_name_p+1); } else { strcpy(pure_seq_name_a, seq_name); } strcpy(cur_savefile_name, outfilepath_name); if(strlen(cur_savefile_name) != 0 && cur_savefile_name[strlen(cur_savefile_name) - 1] != '/') { strcat(cur_savefile_name, "/"); } strcat(cur_savefile_name, pure_seq_name_a); strcat(cur_savefile_name, ".res"); if((outfile = fopen(cur_savefile_name, "a")) == NULL) { perror(cur_savefile_name); continue; } else { } /* check sequence file for labels + check nolabel flag */ if(seqfile_has_labels(seqfile) == YES) { use_labels = YES; } else { use_labels = NO; } if(args_info.nolabels_given) { use_labels = NO; } /* allocate space for msa_seq_info structs */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_seq_infop = (struct msa_sequences_multi_s*)(malloc_or_die(1 * sizeof(struct msa_sequences_multi_s))); } else if(seq_format == FASTA || seq_format == STANDARD) { seq_info.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * 1); seq_info.nr_alphabets = hmm.nr_alphabets; seq_info.nr_seqs = 1; seq_info.longest_seq = 0; seq_info.shortest_seq = INT_MAX; seq_info.avg_seq_len = 0; } if(seq_format == FASTA) { if(hmm.nr_alphabets > 1) { printf("Error: fasta is a one alphabet format only\n"); exit(0); } else { get_sequence_fasta_multi(seqfile, &seq_info, 0); hmm.alphabet_type = DISCRETE; if(use_labels == YES) { get_labels_multi(seqfile, &seq_info, &hmm, 0); } } } else if(seq_format == STANDARD) { get_sequence_std_multi(seqfile, &seq_info, &hmm, 0); if(use_labels == YES) { get_labels_multi(seqfile, &seq_info, &hmm, 0); } } else if(seq_format == MSA_STANDARD) { if(prf_mode == LEAD_SEQ) { get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm, lead_seq, &replacement_letters); } else { get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm, -1, &replacement_letters); } /* get labels */ if(use_labels == YES) { if(prf_mode == LEAD_SEQ) { get_msa_labels_multi(seqfile, msa_seq_infop, &hmm); } else if(prf_mode == ALL_SEQS) { get_msa_labels_all_columns_multi(seqfile, msa_seq_infop, &hmm); } else { printf("Internal error: strange prf_mode value\n"); exit(0); } } } else if(seq_format == PROFILE) { get_sequences_msa_prf_multi(seqfile, priorfile, msa_seq_infop, &hmm); } if(seq_format == STANDARD || seq_format == FASTA) { seq_info.avg_seq_len = ((int)(seq_info.avg_seq_len / seq_info.nr_seqs)); //dump_labeled_seqs_multi(&seq_info); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { } fclose(seqfile); fprintf(outfile, "%s:\n", pure_hmm_name_a); if(verbose == YES) { printf("Running seq %s on %s\n", seq_name, hmm_name); } /* retrain model if run_max_d option is given */ if(run_max_d == YES) { if(verbose == YES) { printf("retraining ... \n"); } if(seq_format == STANDARD || seq_format == FASTA) { copy_hmm_struct(&hmm, &retrain_hmm); baum_welch_dirichlet_multi(&retrain_hmm, seq_info.seqs, 1, NO, use_labels, NO, NO, multi_scoring_method, NO); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { if(prf_mode == ALL_SEQS) { use_lead_columns = NO; } else if(prf_mode == LEAD_SEQ) { use_lead_columns = YES; } copy_hmm_struct(&hmm, &retrain_hmm); msa_baum_welch_dirichlet_multi(&retrain_hmm, msa_seq_infop, 1, NO, use_gap_shares, use_lead_columns, use_labels,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -