📄 hmmsearch.c
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include <math.h>#include <limits.h>#include "structs.h"#include "funcs.h"#include "cmdline_hmmsearch.h"#define NORM_LOG_LIKE 0#define LOGODDS 1#define HMM 20#define SEQ 21#define LONGEST_SEQ -1 /* Note: this constant is also defined in read_seqs.c */#define FIRST_SEQ 1#define LEAD_SEQ 10#define ALL_SEQS 11#define MAX_LINE 500extern int verbose;static struct hmm_multi_s hmm;static struct hmm_multi_s retrain_hmm;static struct msa_sequences_multi_s *msa_seq_infop;static struct sequences_multi_s seq_info;static struct replacement_letter_multi_s replacement_letters;static struct null_model_multi_s null_model;double *subst_mtxp;double *subst_mtxp_2;double *subst_mtxp_3;double *subst_mtxp_4;double *aa_freqs;double *aa_freqs_2;double *aa_freqs_3;double *aa_freqs_4;void get_null_model_multi(FILE *nullfile);double get_nullmodel_score_multi(struct letter_s *seq, struct letter_s *seq_2, struct letter_s *seq_3, struct letter_s *seq_4, int seq_len, int multi_scoring_method);double get_nullmodel_score_msa_multi(int seq_len, int prf_mode, int use_prior, int use_gap_shares, int normalize, int scoring_method, int multi_scoring_method, double *aa_freqs, double *aa_freqs_2, double *aa_freqs_3, double *aa_freqs_4);void get_scores(int run_max_d, int run_forward, int run_backward, int run_viterbi, int run_n_best, int seq_format, int prf_mode, int use_gap_shares, int use_prior, int use_labels, int normalize, int scoring_method, int multi_scoring_method, int print_labels, int print_post_probs, int print_log_like, int print_log_odds, int print_reversi, int print_path, FILE *outfile);void get_post_prob_matrix(double **ret_post_prob_matrixp, double forw_score, struct forward_s *forw_mtx, struct backward_s *backw_mtx, int seq_len);int main(int argc, char* argv[]){ /* command line options */ FILE *outfile; /* file to prints results to */ FILE *hmmnamefile; /* file to read hmm names from */ FILE *hmmfile; /* file to read hmms from */ FILE *seqfile; /* file to read sequences from */ FILE *nullfile; /* file to read null model from */ FILE *seqnamefile; /* file for reading sequence names */ FILE *replfile; /* file for reading special replacement letters */ FILE *priorfile; /* file to read priordistribution from */ FILE *substmtxfile; /* file to read substitution matrix from */ FILE *freqfile; int seq_format; /* input sequence format */ int align_alg, label_alg; /* algorithm for calculating match score */ int query; /* are hmms or seqs the queryside */ char seq_name[200]; char hmm_name[200]; char outfilepath_name[200]; char cur_savefile_name[200]; char *pure_seq_name_p, pure_seq_name_a[200]; /* sequences name without path name */ char *pure_hmm_name_p, pure_hmm_name_a[200]; /* sequences name without path name */ char *last_slash, *last_dot; int prf_mode; /* method for doing the msa-hmm search */ int lead_seq; /* index of the sequence on which the search is performed */ int use_gap_shares; /* account for/don't account for gaps use when calculating the share of a symbol */ int use_prior; /* are we working with priors or not */ int use_labels; int normalize; int scoring_method; int read_subst_mtx; int multi_scoring_method; int print_log_like; int print_log_odds; int print_reversi; int print_labels; int print_post_probs; int print_path; int run_forward; int run_backward; int run_viterbi; int run_n_best; int run_max_d; int hmmfiletype; int use_lead_columns; struct gengetopt_args_info args_info; /* temporary variables */ int i,j; /*standard loop indices */ /*init some variables */ outfile = NULL; hmmnamefile = NULL; hmmfile = NULL; seqfile = NULL; nullfile = NULL; seqnamefile = NULL; replfile = NULL; priorfile = NULL; substmtxfile = NULL; freqfile = NULL; seq_format = STANDARD; align_alg = FORW_ALG; label_alg = ONE_BEST_ALG; query = SEQ; outfilepath_name[0] = '\0'; lead_seq = -1; prf_mode = ALL_SEQS; use_gap_shares = YES; use_prior = NO; use_labels = NO; normalize = NO; scoring_method = DOT_PRODUCT; read_subst_mtx = NO; multi_scoring_method = JOINT_PROB; subst_mtxp = NULL; subst_mtxp_2 = NULL; subst_mtxp_3 = NULL; subst_mtxp_4 = NULL; aa_freqs = NULL; aa_freqs_2 = NULL; aa_freqs_3 = NULL; aa_freqs_4 = NULL; print_log_like = NO; print_log_odds = NO; print_reversi = NO; print_labels = NO; print_post_probs = NO; print_path = NO; run_forward = NO; run_backward = NO; run_viterbi = NO; run_n_best = NO; run_max_d = NO; /* parse command line */ if(cmdline_parser(argc, argv, &args_info) != 0) { exit(1); } /* compulsory options */ if(args_info.hmmnamefile_given) { if((hmmnamefile = fopen(args_info.hmmnamefile_arg, "r")) == NULL) { perror(args_info.hmmnamefile_arg); exit(0); } else { printf("Opened file %s for reading hmm names\n",args_info.hmmnamefile_arg); } } if(args_info.seqnamefile_given) { if((seqnamefile = fopen(args_info.seqnamefile_arg, "r")) == NULL) { perror(args_info.seqnamefile_arg); exit(0); } else { printf("Opened file %s for reading sequence names\n",args_info.seqnamefile_arg); } } if(args_info.outpath_given) { strcpy(outfilepath_name, args_info.outpath_arg); } if(args_info.seqformat_given) { if(strcmp(args_info.seqformat_arg, "fa") == 0) { seq_format = FASTA; } else if(strcmp(args_info.seqformat_arg, "s") == 0) { seq_format = STANDARD; } else if(strcmp(args_info.seqformat_arg, "msa") == 0) { seq_format = MSA_STANDARD; } else if(strcmp(args_info.seqformat_arg, "prf") == 0) { seq_format = PROFILE; } else { printf("Incorrect sequence format: %s\n", args_info.seqformat_arg); exit(0); } } /* non compulsory options */ if(args_info.smxfile_given) { if((substmtxfile = fopen(args_info.smxfile_arg, "r")) == NULL) { perror(args_info.smxfile_arg); exit(0); } else { read_subst_mtx = YES; printf("Opened file %s for reading substitution matrix\n",args_info.smxfile_arg); } } if(args_info.freqfile_given) { if((freqfile = fopen(args_info.freqfile_arg, "r")) == NULL) { perror(args_info.freqfile_arg); exit(0); } else { printf("Opened file %s for reading background frequencies\n",args_info.freqfile_arg); } } if(args_info.replfile_given) { if((replfile = fopen(args_info.replfile_arg, "r")) == NULL) { perror(args_info.replfile_arg); exit(0); } else { printf("Opened file %s for reading replacement letters\n",args_info.replfile_arg); } } if(args_info.priorfile_given) { if((priorfile = fopen(args_info.priorfile_arg, "r")) == NULL) { perror(args_info.priorfile_arg); exit(0); } else { printf("Opened file %s for reading priors\n",args_info.priorfile_arg); } } if(args_info.nullfile_given) { if((nullfile = fopen(args_info.nullfile_arg, "r")) == NULL) { perror(args_info.nullfile_arg); exit(0); } else { printf("Opened file %s for reading null model\n",args_info.nullfile_arg); } } if(args_info.anchor_given) { if(strcmp(args_info.anchor_arg, "hmm") == 0) { query = SEQ; } else if(strcmp(args_info.anchor_arg, "seq") == 0) { query = HMM; } else { printf("Error: Unrecognized anchor option\n"); exit(0); } } if(args_info.labeloutput_given) { print_labels = YES; print_post_probs = YES; } if(args_info.alignmentoutput_given) { print_reversi = YES; print_log_odds = YES; print_log_like = YES; } if(args_info.viterbi_given) { label_alg = VITERBI_ALG; align_alg = VITERBI_ALG; } if(args_info.forward_given) { align_alg = FORW_ALG; } if(args_info.nbest_given) { align_alg = ONE_BEST_ALG; } if(args_info.max_d_given) { run_max_d = YES; } if(args_info.path_given) { print_path = YES; } if(args_info.nopostout_given) { print_post_probs = NO; } if(args_info.nolabelout_given) { print_labels = NO; } if(args_info.nollout_given) { print_log_like = NO; } if(args_info.nooddsout_given) { print_log_odds = NO; } if(args_info.norevout_given) { print_reversi = NO; } if(args_info.alignpostout_given) { print_post_probs = YES; } if(args_info.alignlabelout_given) { print_labels = YES; } if(args_info.labelllout_given) { print_log_like = YES; } if(args_info.labeloddsout_given) { print_log_odds = YES; } if(args_info.labelrevout_given) { print_reversi = YES; } /* msa scoring options */ if(args_info.msascoring_given) { if(strcmp(args_info.msascoring_arg, "DP") == 0) { scoring_method = DOT_PRODUCT; } else if(strcmp(args_info.msascoring_arg, "DPPI") == 0) { scoring_method = DOT_PRODUCT_PICASSO; } else if(strcmp(args_info.msascoring_arg, "PI") == 0) { scoring_method = PICASSO; } else if(strcmp(args_info.msascoring_arg, "PIS") == 0) { scoring_method = PICASSO_SYM; } else if(strcmp(args_info.msascoring_arg, "GM") == 0) { scoring_method = SJOLANDER; } else if(strcmp(args_info.msascoring_arg, "GMR") == 0) { scoring_method = SJOLANDER_REVERSED; } //else if(strcmp(args_info.msascoring_arg, "SMP") == 0) { // scoring_method = SUBST_MTX_PRODUCT; //} //else if(strcmp(args_info.msascoring_arg, "SMDP") == 0) { // scoring_method = SUBST_MTX_DOT_PRODUCT; //} //else if(strcmp(args_info.msascoring_arg, "SMDPP") == 0) { // scoring_method = SUBST_MTX_DOT_PRODUCT_PRIOR; //} else { printf("Incorrect scoring method option: %s\n", args_info.msascoring_arg); exit(0); } } if(args_info.usecolumns_given) { if(strcmp(args_info.usecolumns_arg, "all") == 0) { prf_mode = ALL_SEQS; } else { lead_seq = atoi(args_info.usecolumns_arg); prf_mode = LEAD_SEQ; if(lead_seq <= 0) { printf("Incorrect use-column option: %s\n", args_info.usecolumns_arg); exit(0); } } } /* flags */ if(args_info.nolabels_given) { /* checked after seqread */ } if(args_info.verbose_given) { verbose = YES; } /* check which algorithms to run */ if(print_post_probs == YES) { run_forward = YES; run_backward = YES; } if(print_labels == YES) { if(label_alg == VITERBI_ALG) { run_viterbi = YES; } else if(label_alg == ONE_BEST_ALG) { run_n_best = YES; } } if(print_log_like == YES || print_log_odds == YES || print_reversi == YES) { if(align_alg == VITERBI_ALG) { run_viterbi = YES; } else if(align_alg == FORW_ALG) { run_forward = YES; } } /* read subst mtx */ if(substmtxfile != NULL) { read_subst_matrix_multi(&subst_mtxp, &subst_mtxp_2, &subst_mtxp_3, &subst_mtxp_4, substmtxfile); } /* get frequency file */ if(freqfile != NULL) { read_frequencies_multi(freqfile, &aa_freqs, &aa_freqs_2, &aa_freqs_3, &aa_freqs_4); } /* get prior file */ /* get replacement letters */ if(replfile != NULL) { get_replacement_letters_multi(replfile, &replacement_letters); } else { replacement_letters.nr_alphabets = 0; replacement_letters.nr_rl_1 = 0; replacement_letters.nr_rl_2 = 0; replacement_letters.nr_rl_3 = 0; replacement_letters.nr_rl_4 = 0; replacement_letters.letters_1 = NULL; replacement_letters.probs_1 = NULL; replacement_letters.letters_2 = NULL; replacement_letters.probs_2 = NULL; replacement_letters.letters_3 = NULL; replacement_letters.probs_3 = NULL; replacement_letters.letters_4 = NULL; replacement_letters.probs_4 = NULL; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -