📄 hmmtrain.c
字号:
printf("Internal error: hmmfile not found, should have been detected earlier\n"); } /* check if priors are present and if noprior option is set */ if(args_info.noprior_given) { use_prior = NO; } else if(hmm.nr_ed > 0) { use_prior = YES; } /* get replacement letters */ if(replfile != NULL) { get_replacement_letters_multi(replfile, &replacement_letters); } else { replacement_letters.nr_alphabets = 0; replacement_letters.nr_rl_1 = 0; replacement_letters.nr_rl_2 = 0; replacement_letters.nr_rl_3 = 0; replacement_letters.nr_rl_4 = 0; replacement_letters.letters_1 = NULL; replacement_letters.probs_1 = NULL; replacement_letters.letters_2 = NULL; replacement_letters.probs_2 = NULL; replacement_letters.letters_3 = NULL; replacement_letters.probs_3 = NULL; replacement_letters.letters_4 = NULL; replacement_letters.probs_4 = NULL; } hmm.replacement_letters = &replacement_letters; /* get training seqences */ /* Note: memory for the sequences will be allocated in get_sequences method, but * must be freed here */ /* count nr of sequences */ nr_seqs = 0; while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) { nr_seqs++; } rewind(seqnamefile); /* allocate space for msa_seq_info structs */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_seq_infop = (struct msa_sequences_multi_s*)(malloc_or_die(nr_seqs * sizeof(struct msa_sequences_multi_s))); } else if(seq_format == FASTA || seq_format == STANDARD) { seq_info.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * nr_seqs); seq_info.nr_alphabets = hmm.nr_alphabets; seq_info.nr_seqs = nr_seqs; seq_info.longest_seq = 0; seq_info.shortest_seq = INT_MAX; seq_info.avg_seq_len = 0; } /* read all the sequences */ seq_counter = 0; nr_read_seqs = 0; while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) { seq_name[strlen(seq_name)-1] = '\0'; if((seqfile = fopen(seq_name, "r")) == NULL) { perror(seq_name); continue; } /* check sequence file for labels + check nolabel flag */ if(seqfile_has_labels(seqfile) == YES) { use_labels = YES; } else { use_labels = NO; } if(args_info.nolabels_given) { use_labels = NO; } if(seq_format == FASTA) { if(hmm.nr_alphabets > 1) { printf("fasta is a one alphabet format only\n"); exit(0); } else { get_sequence_fasta_multi(seqfile, &seq_info, nr_read_seqs); hmm.alphabet_type = DISCRETE; if(use_labels == YES) { get_labels_multi(seqfile, &seq_info, &hmm, nr_read_seqs); } } } else if(seq_format == STANDARD) { get_sequence_std_multi(seqfile, &seq_info, &hmm, nr_read_seqs); if(use_labels == YES) { get_labels_multi(seqfile, &seq_info, &hmm, nr_read_seqs); } } else if(seq_format == MSA_STANDARD) { if(use_lead_columns == YES) { get_sequences_msa_std_multi(seqfile, NULL, msa_seq_infop + seq_counter, &hmm, lead_seq, &replacement_letters); } else { get_sequences_msa_std_multi(seqfile, NULL, msa_seq_infop + seq_counter, &hmm, -1, &replacement_letters); } /* get labels */ if(use_labels == YES) { if(use_lead_columns == YES) { get_msa_labels_multi(seqfile, msa_seq_infop + seq_counter, &hmm); } else { get_msa_labels_all_columns_multi(seqfile, msa_seq_infop + seq_counter, &hmm); } } } else if(seq_format == PROFILE) { get_sequences_msa_prf_multi(seqfile, NULL, msa_seq_infop + seq_counter, &hmm); } nr_read_seqs++; seq_counter++; fclose(seqfile); } if(seq_format == STANDARD || seq_format == FASTA) { seq_info.avg_seq_len = ((int)(seq_info.avg_seq_len / seq_info.nr_seqs)); //dump_labeled_seqs_multi(&seq_info); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { } /* do training */ if(nr_read_seqs > 0) { if(training_method == BW_STD) { /* do baum welch training */ if(seq_format == STANDARD || seq_format == FASTA) { baum_welch_dirichlet_multi(&hmm, seq_info.seqs, seq_info.nr_seqs, annealing, use_labels, use_transition_pseudo_counts, use_emission_pseudo_counts, multi_scoring_method, use_prior); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { msa_baum_welch_dirichlet_multi(&hmm, msa_seq_infop, nr_seqs, annealing, use_gap_shares, use_lead_columns, use_labels, use_transition_pseudo_counts, use_emission_pseudo_counts, normalize, scoring_method, use_nr_occurences, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4, use_prior); } } else if(training_method == CML_STD && use_labels == YES) { /* do cml training */ if(seq_format == STANDARD || seq_format == FASTA) { extended_baum_welch_dirichlet_multi(&hmm, seq_info.seqs, seq_info.nr_seqs, annealing, use_labels, use_transition_pseudo_counts, use_emission_pseudo_counts, multi_scoring_method, use_prior); } else if(seq_format == MSA_STANDARD || seq_format == PROFILE) { extended_msa_baum_welch_dirichlet_multi(&hmm, msa_seq_infop, nr_seqs, annealing, use_gap_shares, use_lead_columns, use_labels, use_transition_pseudo_counts, use_emission_pseudo_counts, normalize, scoring_method, use_nr_occurences, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4, use_prior); } } else if(training_method == CML_STD && use_labels == NO) { printf("Error: cannot do CML-training on unlabeled sequences\n"); exit(0); } } else { printf("Error: no sequences read: cannot train\n"); exit(0); } /* save trained hmm */ if(hmmfiletype == MULTI_HMM) { if(outfile != NULL) { savehmm_multialpha(outfile, &hmm); } else { outfile = fopen("a.hmg", "w"); savehmm_multialpha(outfile, &hmm); } } else if(hmmfiletype == SINGLE_HMM) { if(outfile != NULL) { savehmm(outfile, &hmm); } else { outfile = fopen("a.hmg", "w"); savehmm(outfile, &hmm); } } /* garbage collection */ if(replfile != NULL) { if(replacement_letters.nr_rl_1 > 0) { free(replacement_letters.letters_1); free(replacement_letters.probs_1); } if(replacement_letters.nr_rl_2 > 0) { free(replacement_letters.letters_2); free(replacement_letters.probs_2); } if(replacement_letters.nr_rl_3 > 0) { free(replacement_letters.letters_3); free(replacement_letters.probs_3); } if(replacement_letters.nr_rl_4 > 0) { free(replacement_letters.letters_4); free(replacement_letters.probs_4); } fclose(replfile); } /* free allocated sequence memory for msa/profile and single seqs respectively */ if(seq_format == MSA_STANDARD || seq_format == PROFILE) { for(i = 0; i < nr_read_seqs; i++) { free((*(msa_seq_infop + i)).msa_seq_1); if(hmm.nr_alphabets > 1) { free((*(msa_seq_infop + i)).msa_seq_2); } if(hmm.nr_alphabets > 2) { free((*(msa_seq_infop + i)).msa_seq_3); } if(hmm.nr_alphabets > 3) { free((*(msa_seq_infop + i)).msa_seq_4); } free((*(msa_seq_infop + i)).gaps); free((*(msa_seq_infop + i)).gap_shares); free((*(msa_seq_infop + i)).lead_columns_start); } free(msa_seq_infop); } else if(seq_format == FASTA || seq_format == STANDARD) { for(i = 0; i < nr_read_seqs; i++) { free(((seq_info.seqs) + i)->seq_1); if(hmm.nr_alphabets > 1) { free((seq_info.seqs + i)->seq_2); } if(hmm.nr_alphabets > 2) { free((seq_info.seqs + i)->seq_3); } if(hmm.nr_alphabets > 3) { free((seq_info.seqs + i)->seq_4); } } free(seq_info.seqs); } fclose(outfile); if(substmtxfile != NULL) { free(subst_mtxp); if(hmm.nr_alphabets > 1) { free(subst_mtxp_2); } if(hmm.nr_alphabets > 2) { free(subst_mtxp_3); } if(hmm.nr_alphabets > 3) { free(subst_mtxp_4); } fclose(substmtxfile); } if(freqfile != NULL) { free(aa_freqs); if(hmm.nr_alphabets > 1) { free(aa_freqs_2); } if(hmm.nr_alphabets > 2) { free(aa_freqs_3); } if(hmm.nr_alphabets > 3) { free(aa_freqs_4); } fclose(freqfile); } hmm_garbage_collection_multi(hmmfile, &hmm); return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -