📄 opt_prfhmm.c
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include <math.h>#include <float.h>#include "structs.h"#include "funcs.h"#define NEW 1#define ACID 2#define GAP 3#define INSERT_ACID 4#define INSERT_GAP 5#define FAST_MATCH_LIMIT 0.4 /* gap share must be lower than this number */#define SI_PROB 0.99#define EI_PROB 0.99//#define DEBUG_PRIextern int verbose;void set_trans_probs(FILE *seq_infile, int local);void label_msa_fast();void label_msa_query(FILE *seq_infile);void henikoff_weighting(struct msa_sequences_multi_s *msa_seq_infop, FILE *seq_infile, struct hmm_multi_s *hmmp);void printhelp();struct hmm_multi_s hmm;struct msa_sequences_multi_s msa_seq_info;struct replacement_letter_multi_s replacement_letters;struct emission_dirichlet_s em_di;struct emission_dirichlet_s em_di_2;struct emission_dirichlet_s em_di_3;struct emission_dirichlet_s em_di_4;double *subst_mtxp;double *subst_mtxp_2;double *subst_mtxp_3;double *subst_mtxp_4;int main(int argc, char* argv[]){ FILE *hmm_infile = NULL; FILE *hmm_outfile = NULL; FILE *seq_infile = NULL; FILE *flat_seq_infile = NULL; FILE *replfile = NULL; FILE *priorfile = NULL; FILE *priorfile_2 = NULL; FILE *priorfile_3 = NULL; FILE *priorfile_4 = NULL; FILE *substmtxfile = NULL; FILE *outfile_tmp = NULL; FILE *freqfile = NULL; FILE *freqfile_2 = NULL; FILE *freqfile_3 = NULL; FILE *freqfile_4 = NULL; int i = 0; int use_gap_shares; char seq_name[200]; int normalize; int scoring_method; int read_subst_mtx; int multi_scoring_method; int nr_alphabets; int pre_labeled; int model_maker; int local; int weighting_scheme; double *aa_freqs; double *aa_freqs_2; double *aa_freqs_3; double *aa_freqs_4; multi_scoring_method = JOINT_PROB; read_subst_mtx = NO; normalize = NO; scoring_method = SJOLANDER; nr_alphabets = 0; pre_labeled = NO; model_maker = FAST; local = NO; weighting_scheme = NONE; aa_freqs = NULL; aa_freqs_2 = NULL; aa_freqs_3 = NULL; aa_freqs_4 = NULL; if(argc > 1) { for(i = 1; i < argc; i++) { if(strcmp(argv[i], "-la") == 0) { i++; if((seq_infile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { pre_labeled = YES; printf("Opened file %s for reading msa sequence\n",argv[i]); } } if(strcmp(argv[i], "-std") == 0) { i++; if((seq_infile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading msa sequence\n",argv[i]); } } if(strcmp(argv[i], "-seq") == 0) { i++; if((flat_seq_infile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading sequence\n",argv[i]); } } else if(strcmp(argv[i], "-hmm") == 0) { i++; if((hmm_infile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading hmm\n",argv[i]); } } else if(strcmp(argv[i], "-o") == 0) { i++; if((hmm_outfile = fopen(argv[i], "w")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for writing\n",argv[i]); } } else if(strcmp(argv[i], "-scr") == 0) { i++; if(strcmp(argv[i], "DP") == 0) { scoring_method = DOT_PRODUCT; } else if(strcmp(argv[i], "GM") == 0) { scoring_method = SJOLANDER; } else if(strcmp(argv[i], "SMP") == 0) { scoring_method = SUBST_MTX_PRODUCT; } else if(strcmp(argv[i], "SMDP") == 0) { scoring_method = SUBST_MTX_DOT_PRODUCT; } else if(strcmp(argv[i], "SMDPP") == 0) { scoring_method = SUBST_MTX_DOT_PRODUCT_PRIOR; } else { printf("Incorrect option: %s\n", argv[i]); printhelp(); exit(0); } } else if(strcmp(argv[i], "-l") == 0) { i++; if(strcmp(argv[i], "F") == 0) { local = NO; } else if(strcmp(argv[i], "T") == 0) { local = YES; } else { printf("Incorrect option: %s\n", argv[i]); printhelp(); exit(0); } } else if(strcmp(argv[i], "-m") == 0) { i++; if(strcmp(argv[i], "F") == 0) { model_maker = FAST; } else if(strcmp(argv[i], "Q") == 0) { model_maker = QUERY; } else { printf("Incorrect option: %s\n", argv[i]); printhelp(); exit(0); } } else if(strcmp(argv[i], "-w") == 0) { i++; if(strcmp(argv[i], "H") == 0) { weighting_scheme = HENIKOFF; } else { printf("Incorrect option: %s\n", argv[i]); printhelp(); exit(0); } } else if(strcmp(argv[i], "-smx") == 0) { i++; if((substmtxfile = fopen(argv[i], "r")) == NULL) { printhelp(); exit(0); } else { printf("Opened file %s for reading substitution matrix\n",argv[i]); read_subst_mtx = YES; } } else if(strcmp(argv[i], "-pri") == 0) { i++; if((priorfile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading priorfile\n",argv[i]); } } else if(strcmp(argv[i], "-pri2") == 0) { i++; if((priorfile_2 = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading priorfile 2\n",argv[i]); } } else if(strcmp(argv[i], "-pri3") == 0) { i++; if((priorfile_3 = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading priorfile 3\n",argv[i]); } } else if(strcmp(argv[i], "-pri4") == 0) { i++; if((priorfile_4 = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading priorfile 4\n",argv[i]); } } else if(strcmp(argv[i], "-freq1") == 0) { i++; if((freqfile = fopen(argv[i], "r")) == NULL) { printhelp(); exit(0); } else { printf("Opened file %s for reading prior frequencies\n",argv[i]); } } else if(strcmp(argv[i], "-freq2") == 0) { i++; if((freqfile_2 = fopen(argv[i], "r")) == NULL) { printhelp(); exit(0); } else { printf("Opened file %s for reading prior frequencies\n",argv[i]); } } else if(strcmp(argv[i], "-freq3") == 0) { i++; if((freqfile_3 = fopen(argv[i], "r")) == NULL) { printhelp(); exit(0); } else { printf("Opened file %s for reading prior frequencies\n",argv[i]); } } else if(strcmp(argv[i], "-freq4") == 0) { i++; if((freqfile_4 = fopen(argv[i], "r")) == NULL) { printhelp(); exit(0); } else { printf("Opened file %s for reading prior frequencies\n",argv[i]); } } else if(strcmp(argv[i], "-r") == 0) { i++; if((replfile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp(); exit(0); } else { printf("Opened file %s for reading replacement letters\n",argv[i]); } } else if(strcmp(argv[i], "-verbose") == 0) { verbose = YES; } else if(strcmp(argv[i], "-h") == 0) { printhelp(); exit(0); } } /* read subst mtx */ if(read_subst_mtx == YES) { if(read_subst_matrix_multi(&subst_mtxp, &subst_mtxp_2, &subst_mtxp_3, &subst_mtxp_4, substmtxfile) == NO) { printhelp(); exit(0); } } /* get frequency file */ if(freqfile != NULL) { read_frequencies(freqfile, &aa_freqs); } /* get frequency file */ if(freqfile_2 != NULL) { read_frequencies(freqfile_2, &aa_freqs_2); } /* get frequency file */ if(freqfile_3 != NULL) { read_frequencies(freqfile_3, &aa_freqs_3); } /* get frequency file */ if(freqfile_4 != NULL) { read_frequencies(freqfile_4, &aa_freqs_4); } if((scoring_method == SUBST_MTX_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT_PRIOR) && read_subst_mtx == NO) { printf("Error: No substitution matrix supplied\n"); printhelp(); exit(0); } /* get hmm from file */ if(hmm_infile != NULL) { nr_alphabets = get_nr_alphabets(hmm_infile); rewind(hmm_infile); if(nr_alphabets == 11) { if((outfile_tmp = fopen("tmp", "w")) == NULL) { perror("tmp"); } transform_singlehmmfile_to_multi(hmm_infile, outfile_tmp); fclose(hmm_infile); fclose(outfile_tmp); if((hmm_infile = fopen("tmp", "r")) == NULL) { perror("tmp"); } } rewind(hmm_infile); readhmm_multialpha(hmm_infile, &hmm); hmm.subst_mtx = subst_mtxp; hmm.subst_mtx_2 = subst_mtxp_2; hmm.subst_mtx_3 = subst_mtxp_3; hmm.subst_mtx_4 = subst_mtxp_4; } else { printhelp(); exit(0); } /* get replacement letters */ if(replfile != NULL) { get_replacement_letters_multi(replfile, &replacement_letters); } else { replacement_letters.nr_alphabets = -1; } /* get training sequence */ /* Note: memory for the sequences will be allocated in get_sequences method, but * must be freed here */ get_sequences_msa_std_multi(seq_infile, NULL, NULL, NULL, NULL, &msa_seq_info, &hmm, -1, &replacement_letters); /* get labels */ if(pre_labeled == YES) { get_msa_labels_all_columns_multi(seq_infile, &msa_seq_info, &hmm); } else if(model_maker == FAST) { label_msa_fast(); } else if(model_maker == QUERY) { label_msa_query(seq_infile); } /* read priorfiles */ if(priorfile != NULL) { read_prior_file_multi(&em_di, &hmm, priorfile, 1); } if(priorfile_2 != NULL) { read_prior_file_multi(&em_di_2, &hmm, priorfile_2, 2); } if(priorfile_3 != NULL) { read_prior_file_multi(&em_di_3, &hmm, priorfile_3, 3); } if(priorfile_4 != NULL) { read_prior_file_multi(&em_di_4, &hmm, priorfile_4, 4); } while(1) { /* adjust sequence weights */ //dump_msa_seqs_multi(&msa_seq_info, &hmm); if(weighting_scheme == HENIKOFF) { henikoff_weighting(&msa_seq_info, flat_seq_infile, &hmm); } //dump_msa_seqs_multi(&msa_seq_info, &hmm); /* do initial match state training */ msa_baum_welch_dirichlet_multi(&hmm, &msa_seq_info, 1, NO, use_gap_shares, NO, YES, YES, NO, normalize, scoring_method, YES, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4, YES); /* set insert emission and transition parameters */ rewind(seq_infile); set_trans_probs(seq_infile, local); break; } if(hmm_outfile != NULL) { savehmm_multialpha(hmm_outfile, &hmm); } else { savehmm_multialpha(fopen("a.hmg", "w"), &hmm); } } /* cleanup */ if(priorfile != NULL) { free(em_di.q_values); free(em_di.alpha_sums); free(em_di.logbeta_values); free(em_di.prior_values); fclose(priorfile); } if(freqfile != NULL) { free(aa_freqs); fclose(freqfile); } if(freqfile_2 != NULL) { free(aa_freqs_2); fclose(freqfile_2); } if(freqfile_3 != NULL) { free(aa_freqs_3); fclose(freqfile_3); } if(freqfile_4 != NULL) { free(aa_freqs_4); fclose(freqfile_4); } }void set_trans_probs(FILE *seq_infile, int local){ char cur; int nr_mtx_columns, nr_insert_columns; int *transitions; double *insert_emissions; //double *insert_acids; int i,j; int a_index, is_gap, new; int cur_msa_column, cur_mtx_column, cur_mtx_col_nr, cur_insert_column; int last_sign, cur_sign; int d_tot, m_tot, i_tot, s_tot, dd, dm, mm, md, mi, ii, im, sd, sm; int use_insert_matrix; double transprob_ii; char *row; int INSERT_SIZE = 100; int tot_nr_chars; int alphabet_done; int cur_alphabet; double local_me_prob; double local_im_prob; double tot_local_im_prob; local_me_prob = 0.0; local_im_prob = 0.0; rewind(seq_infile); use_insert_matrix = YES; /* count nr of M and I labels in alignment */ nr_mtx_columns = 0; nr_insert_columns = 0; tot_nr_chars = 22;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -