📄 set_profile_hmm_parameters.c
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include <math.h>#include <float.h>#include "structs.h"#include "funcs.h"#define STD_CAL 1#define EXT_CAL 2#define NEW 1#define ACID 2#define GAP 3#define INSERT_ACID 4#define INSERT_GAP 5#define SI_PROB 0.33#define EI_PROB 0.5extern int verbose;void calibrate_hmm_std();void calibrate_hmm_ext(FILE *seq_infile, int prior_scaler);void add_replacement_letter(char cur);int read_priorfile(FILE *priorfile);int priorize_shares(int pos, double *insert_emissions, int prior_scaler);struct hmm_s hmm;struct msa_sequences_s msa_seq_info;struct replacement_letter_s replacement_letters;struct emission_dirichlet_s em_di;int main(int argc, char* argv[]){ FILE *hmm_infile = NULL; FILE *hmm_outfile = NULL; FILE *seq_infile = NULL; FILE *replfile = NULL; FILE *priorfile = NULL; int calibration_method = STD_CAL; int lead_seq = 1; int i = 0; int prior_scaler = 1; if(argc > 1) { for(i = 1; i < argc; i++) { if(strcmp(argv[i], "-ext") == 0) { i++; if((seq_infile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp_cal(); exit(0); } else { calibration_method = EXT_CAL; printf("Opened file %s for reading sequences\n",argv[i]); } } else if(strcmp(argv[i], "-std") == 0) { calibration_method = STD_CAL; } else if(strcmp(argv[i], "-hmm") == 0) { i++; if((hmm_infile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp_cal(); exit(0); } else { printf("Opened file %s for reading hmm\n",argv[i]); } } else if(strcmp(argv[i], "-o") == 0) { i++; if((hmm_outfile = fopen(argv[i], "w")) == NULL) { perror(argv[i]); printhelp_cal(); exit(0); } else { printf("Opened file %s for writing\n",argv[i]); } } else if(strcmp(argv[i], "-pri") == 0) { i++; if((priorfile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp_cal(); exit(0); } else { printf("Opened file %s for reading priorfiles\n",argv[i]); } } else if(strcmp(argv[i], "-prisc") == 0) { i++; prior_scaler = atoi(argv[i]); if(prior_scaler <= 0) { printf("Incorrect option: %s\n", argv[i]); perror(argv[i]); printhelp_cal(); exit(0); } } else if(strcmp(argv[i], "-r") == 0) { i++; if((replfile = fopen(argv[i], "r")) == NULL) { perror(argv[i]); printhelp_cal(); exit(0); } else { printf("Opened file %s for reading replacement letters\n",argv[i]); } } else if(strcmp(argv[i], "-lead") == 0) { i++; lead_seq = atoi(argv[i]); if(lead_seq <= 0) { printf("Incorrect option: %s\n", argv[i]); perror(argv[i]); printhelp_cal(); exit(0); } } else if(strcmp(argv[i], "-verbose") == 0) { verbose = YES; } else if(strcmp(argv[i], "-h") == 0) { printhelp_cal(); exit(0); } } /* read hmm */ if(hmm_infile != NULL) { readhmm(hmm_infile, &hmm); } else { printhelp_cal(); exit(0); } /* get replacement letters */ if(replfile != NULL) { get_replacement_letters(replfile, &replacement_letters); } else { replacement_letters.nr_rl = 0; } /* read priorfile */ if(priorfile != NULL) { read_priorfile(priorfile); } if(calibration_method == STD_CAL) { calibrate_hmm_std(); } else if(calibration_method == EXT_CAL) { calibrate_hmm_ext(seq_infile, prior_scaler); } if(hmm_outfile != NULL) { savehmm(hmm_outfile, &hmm); } else { savehmm(fopen("a.hmg", "w"), &hmm); } } /* cleanup */ if(priorfile != NULL) { free(em_di.q_values); free(em_di.alpha_sums); free(em_di.logbeta_values); free(em_di.prior_values); fclose(priorfile); }}void calibrate_hmm_std(){ /* delete - match - insert */ double GAP_SIZE = 7.0; double INSERT_SIZE = 3.0; double transprob_dd, transprob_ii; int i; /* calibrate delete states */ transprob_dd = exp(log(0.5) / GAP_SIZE); for(i = 1; i < hmm.nr_v - 4; i += 3) { /* update transitions to next delete state and next match state */ *(hmm.transitions + get_mtx_index(i, i+3, hmm.nr_v)) = transprob_dd; *(hmm.transitions + get_mtx_index(i, i+4, hmm.nr_v)) = 1.0 - transprob_dd; } transprob_ii = exp(log(0.5) / INSERT_SIZE); for(i = 3; i < hmm.nr_v - 3; i += 3) { /* update transitions to next delete state and next match state */ *(hmm.transitions + get_mtx_index(i, i, hmm.nr_v)) = transprob_ii; *(hmm.transitions + get_mtx_index(i, i-1, hmm.nr_v)) = 1.0 - transprob_ii; } }void calibrate_hmm_ext(FILE *seq_infile, int prior_scaler){ char cur; int nr_mtx_columns, nr_insert_columns; int *transitions; double *insert_emissions; //double *insert_acids; int i,j; int a_index, is_gap, new; int cur_msa_column, cur_mtx_column, cur_mtx_col_nr, cur_insert_column; int last_sign, cur_sign; int d_tot, m_tot, i_tot, s_tot, dd, dm, mm, md, mi, ii, im, sd, sm; int use_insert_matrix; double transprob_ii; char *row; int INSERT_SIZE = 25; int tot_nr_chars; use_insert_matrix = YES; //printf("check 1\n"); /* read first row, count indices of occupied columns */ cur = (char)fgetc(seq_infile); if(cur != '<') { } nr_mtx_columns = 0; nr_insert_columns = 0; tot_nr_chars = 20; while(1) { /* read symbol */ cur = (char)fgetc(seq_infile); tot_nr_chars = tot_nr_chars + 2; if(cur == '>') { break; } else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') { if(new == YES) { nr_insert_columns++; new = NO; } } else { nr_mtx_columns++; new = YES; } /* read ';' */ cur = (char)fgetc(seq_infile); if(cur != ';') { } } //printf("tot_nr_chars = %d\n",tot_nr_chars); row = (char*)(malloc_or_die(tot_nr_chars * sizeof(char))); //printf("check 2\n"); nr_mtx_columns++; /* allocate memory for matrices, add one (pseudocount) to each post */ transitions = (int*)malloc_or_die(8 * nr_mtx_columns * sizeof(int)); insert_emissions = (double*)malloc_or_die(nr_mtx_columns * (hmm.a_size+1) * sizeof(double)); for(i = 1; i < 8; i++) { for(j = 0; j < nr_mtx_columns; j++) { *(transitions + get_mtx_index(i,j,nr_mtx_columns)) = 1; } } //printf("check 3\n"); /* read first row, store indices of occupied columns + add counts for first row */ rewind(seq_infile); cur_msa_column = 1; cur_mtx_column = 0; fgets(row, tot_nr_chars, seq_infile); //printf("check 4\n"); for(i = 1;;i++) { /* read symbol */ cur = row[i]; if(cur == '>') { break; } else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') { cur_msa_column++; } else if(cur == ';') { } else { *(transitions + cur_mtx_column) = cur_msa_column; *(transitions + get_mtx_index(1,cur_mtx_column, nr_mtx_columns)) += 1; cur_mtx_column++; cur_msa_column++; } } //printf("check 5\n"); /* for all rows, add counts to matrix */ while((fgets(row, tot_nr_chars, seq_infile)) != NULL) { cur_msa_column = 1; cur_mtx_column = 0; cur_insert_column = 0; last_sign = NEW; cur_sign = NEW; for(i = 1;;i++) { /* read symbol */ cur_mtx_col_nr = *(transitions + cur_mtx_column); cur = row[i]; if(cur == '>') { break; } else if(cur == ' ' || cur == '-' || cur == '_' || cur == '.') { last_sign = cur_sign; if(cur_msa_column == cur_mtx_col_nr) /* gap */{ if(last_sign == ACID || last_sign == NEW) { cur_sign = GAP; *(transitions + get_mtx_index(2,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == GAP) { cur_sign = GAP; *(transitions + get_mtx_index(4,cur_mtx_column,nr_mtx_columns)) += 1; } else { } if(is_gap == YES) { is_gap = NO; cur_insert_column++; } cur_msa_column++; cur_mtx_column++; } else if(cur_msa_column < cur_mtx_col_nr) /* insert gap*/{ if(last_sign == ACID || last_sign == NEW) { cur_sign = ACID;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -