📄 hmmtrain.c
字号:
#include <math.h>#include <limits.h>#include "structs.h"#include "funcs.h"#include "cmdline_hmmtrain.h"#define MAX_LINE 4000#define MAX_SEQS 4000#define FIRST_SEQ 1extern int verbose;/* memory for transition and emission matrices, from_vertex array and to_vertex_array will be allocated in readhmm, but must be freed here */static struct hmm_multi_s hmm;static struct msa_sequences_multi_s *msa_seq_infop;static struct sequences_multi_s seq_info;static struct replacement_letter_multi_s replacement_letters;double *subst_mtxp;double *subst_mtxp_2;double *subst_mtxp_3;double *subst_mtxp_4;double *aa_freqs;double *aa_freqs_2;double *aa_freqs_3;double *aa_freqs_4;int main(int argc, char* argv[]){ int i; FILE *hmmfile, *outfile, *seqfile, *replfile, *seqnamefile, *substmtxfile, *freqfile; double d; int seq_format; int use_gap_shares; int use_lead_columns; int lead_seq; int use_labels; int annealing; int use_transition_pseudo_counts, use_emission_pseudo_counts; int nr_seqs, seq_counter, nr_read_seqs; char seq_name[200]; int normalize; int scoring_method; int read_subst_mtx; int use_nr_occurences; int multi_scoring_method; int training_method; int hmmfiletype; int use_prior; struct gengetopt_args_info args_info; seq_format = STANDARD; hmmfile = NULL; outfile = NULL; seqfile = NULL; replfile = NULL; seqnamefile = NULL; substmtxfile = NULL; freqfile = NULL; use_gap_shares = YES; use_lead_columns = YES; lead_seq = FIRST_SEQ; use_labels = NO; annealing = NO; use_transition_pseudo_counts = NO; use_emission_pseudo_counts = NO; normalize = NO; scoring_method = SJOLANDER; read_subst_mtx = NO; use_nr_occurences = NO; multi_scoring_method = JOINT_PROB; subst_mtxp = NULL; subst_mtxp_2 = NULL; subst_mtxp_3 = NULL; subst_mtxp_4 = NULL; aa_freqs = NULL; aa_freqs_2 = NULL; aa_freqs_3 = NULL; aa_freqs_4 = NULL; training_method = BW_STD; use_prior = YES; /* parse command line */ if(cmdline_parser(argc, argv, &args_info) != 0) { exit(1); } /* compulsory options */ if(args_info.hmminfile_given) { if((hmmfile = fopen(args_info.hmminfile_arg, "r")) == NULL) { perror(args_info.hmminfile_arg); exit(0); } else { printf("Opened file %s for reading model file\n",args_info.hmminfile_arg); } } if(args_info.seqnamefile_given) { if((seqnamefile = fopen(args_info.seqnamefile_arg, "r")) == NULL) { perror(args_info.seqnamefile_arg); exit(0); } else { printf("Opened file %s for reading sequence names\n",args_info.seqnamefile_arg); } } if(args_info.outfile_given) { if((outfile = fopen(args_info.outfile_arg, "w")) == NULL) { perror(args_info.outfile_arg); exit(0); } else { printf("Opened file %s for writing\n",args_info.outfile_arg); } } if(args_info.seqformat_given) { if(strcmp(args_info.seqformat_arg, "fa") == 0) { seq_format = FASTA; } else if(strcmp(args_info.seqformat_arg, "s") == 0) { seq_format = STANDARD; } else if(strcmp(args_info.seqformat_arg, "msa") == 0) { seq_format = MSA_STANDARD; } else if(strcmp(args_info.seqformat_arg, "prf") == 0) { seq_format = PROFILE; } else { printf("Incorrect sequence format: %s\n", args_info.seqformat_arg); exit(0); } } /* non compulsory options */ if(args_info.smxfile_given) { if((substmtxfile = fopen(args_info.smxfile_arg, "r")) == NULL) { perror(args_info.smxfile_arg); exit(0); } else { read_subst_mtx = YES; printf("Opened file %s for reading substitution matrix\n",args_info.smxfile_arg); } } if(args_info.freqfile_given) { if((freqfile = fopen(args_info.freqfile_arg, "r")) == NULL) { perror(args_info.freqfile_arg); exit(0); } else { printf("Opened file %s for reading background frequencies\n",args_info.freqfile_arg); } } if(args_info.replfile_given) { if((replfile = fopen(args_info.replfile_arg, "r")) == NULL) { perror(args_info.replfile_arg); exit(0); } else { printf("Opened file %s for reading replacement letters\n",args_info.replfile_arg); } } if(args_info.alg_given) { if(strcmp(args_info.alg_arg, "bw") == 0) { training_method = BW_STD; } else if(strcmp(args_info.alg_arg, "cml") == 0) { training_method = CML_STD; } else { printf("Incorrect training method option: %s\n", args_info.alg_arg); exit(0); } } /* msa scoring options */ if(args_info.msascoring_given) { if(strcmp(args_info.msascoring_arg, "DP") == 0) { scoring_method = DOT_PRODUCT; } else if(strcmp(args_info.msascoring_arg, "DPPI") == 0) { scoring_method = DOT_PRODUCT_PICASSO; } else if(strcmp(args_info.msascoring_arg, "PI") == 0) { scoring_method = PICASSO; } else if(strcmp(args_info.msascoring_arg, "PIS") == 0) { scoring_method = PICASSO_SYM; } else if(strcmp(args_info.msascoring_arg, "GM") == 0) { scoring_method = SJOLANDER; } else if(strcmp(args_info.msascoring_arg, "GMR") == 0) { scoring_method = SJOLANDER_REVERSED; } //else if(strcmp(args_info.msascoring_arg, "SMP") == 0) { // scoring_method = SUBST_MTX_PRODUCT; //} //else if(strcmp(args_info.msascoring_arg, "SMDP") == 0) { // scoring_method = SUBST_MTX_DOT_PRODUCT; //} //else if(strcmp(args_info.msascoring_arg, "SMDPP") == 0) { // scoring_method = SUBST_MTX_DOT_PRODUCT_PRIOR; //} else { printf("Incorrect scoring method option: %s\n", args_info.msascoring_arg); exit(0); } } if(args_info.usecolumns_given) { if(strcmp(args_info.usecolumns_arg, "all") == 0) { use_lead_columns = NO; } else { lead_seq = atoi(args_info.usecolumns_arg); use_lead_columns = YES; if(lead_seq <= 0) { printf("Incorrect use-column option: %s\n", args_info.usecolumns_arg); exit(0); } } } /* flags */ if(args_info.nolabels_given) { /* checked after seqread */ } if(args_info.noprior_given) { /* checked after hmm read */ } if(args_info.tpcounts_given) { use_transition_pseudo_counts = YES; } if(args_info.epcounts_given) { use_emission_pseudo_counts = YES; } if(args_info.verbose_given) { verbose = YES; } /* read subst mtx */ if(substmtxfile != NULL) { read_subst_matrix_multi(&subst_mtxp, &subst_mtxp_2, &subst_mtxp_3, &subst_mtxp_4, substmtxfile); } /* get frequency file */ if(freqfile != NULL) { read_frequencies_multi(freqfile, &aa_freqs, &aa_freqs_2, &aa_freqs_3, &aa_freqs_4); } if((scoring_method == SUBST_MTX_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT_PRIOR) && read_subst_mtx == NO) { printf("Error: No substitution matrix supplied\n"); exit(0); } /* get hmm from file */ if(hmmfile != NULL) { hmmfiletype = readhmm_check(hmmfile); if(hmmfiletype == SINGLE_HMM) { readhmm(hmmfile, &hmm); } else if(hmmfiletype == MULTI_HMM) { readhmm_multialpha(hmmfile, &hmm); } hmm.subst_mtx = subst_mtxp; hmm.subst_mtx_2 = subst_mtxp_2; hmm.subst_mtx_3 = subst_mtxp_3; hmm.subst_mtx_4 = subst_mtxp_4; } else { /* cannot happen */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -